diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/concrete/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/concrete/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..12bb67bd19347ed115a61ac97250eb92ef56ce6b
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/concrete/__pycache__/__init__.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/concrete/__pycache__/delta.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/concrete/__pycache__/delta.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ee4a9e20c060ac592a5160aea269b9e07c4f2749
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/concrete/__pycache__/delta.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/concrete/__pycache__/expr_with_limits.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/concrete/__pycache__/expr_with_limits.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..68b1be840c923799baaff5edacf13a887167e74f
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/concrete/__pycache__/expr_with_limits.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/concrete/__pycache__/guess.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/concrete/__pycache__/guess.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..38131aaf2ac0fd7240267fdee1a36e82909b9bb0
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/concrete/__pycache__/guess.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/concrete/tests/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/concrete/tests/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0918a067b3d16355df0972e1c57d9613d41b3d6b
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/concrete/tests/__pycache__/__init__.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/concrete/tests/__pycache__/test_delta.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/concrete/tests/__pycache__/test_delta.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ce3cb7fc06aa9a3a7c99738f1047a62b8689e204
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/concrete/tests/__pycache__/test_delta.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/concrete/tests/__pycache__/test_gosper.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/concrete/tests/__pycache__/test_gosper.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bd1686a7dc939081482f01e0ff5a87d10a4537d0
Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/concrete/tests/__pycache__/test_gosper.cpython-312.pyc differ
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/concrete/tests/__pycache__/test_guess.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/concrete/tests/__pycache__/test_guess.cpython-312.pyc
new file mode 100644
index
0000000000000000000000000000000000000000..816e3255f1e5326b27f159885b28fa62b0baff5d Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/concrete/tests/__pycache__/test_guess.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/concrete/tests/__pycache__/test_products.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/concrete/tests/__pycache__/test_products.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dc5d42f9a7919e6508d0302296d693d426d7f864 Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/concrete/tests/__pycache__/test_products.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..368294f928844b31bcc04d74838edb01df55c675 Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/__pycache__/__init__.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/combinatorial/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/combinatorial/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..584b3c8d46b5c7600d85efc7db46d7aa190397f8 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/combinatorial/__init__.py @@ -0,0 +1 @@ +# Stub __init__.py for sympy.functions.combinatorial diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/combinatorial/factorials.py b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/combinatorial/factorials.py new file mode 100644 index 0000000000000000000000000000000000000000..0c6d2f09524debca29d3040b28a019127a244b33 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/combinatorial/factorials.py @@ -0,0 +1,1133 @@ +from __future__ import annotations +from functools import reduce + +from sympy.core import S, sympify, Dummy, Mod +from sympy.core.cache import cacheit +from sympy.core.function import DefinedFunction, ArgumentIndexError, PoleError +from sympy.core.logic import fuzzy_and +from sympy.core.numbers import Integer, pi, I +from sympy.core.relational import Eq +from sympy.external.gmpy import gmpy as _gmpy +from sympy.ntheory import sieve +from sympy.ntheory.residue_ntheory import binomial_mod +from sympy.polys.polytools import Poly + +from math import factorial as _factorial, prod, sqrt as _sqrt + +class CombinatorialFunction(DefinedFunction): + """Base class for combinatorial functions. """ + + def _eval_simplify(self, **kwargs): + from sympy.simplify.combsimp import combsimp + # combinatorial function with non-integer arguments is + # automatically passed to gammasimp + expr = combsimp(self) + measure = kwargs['measure'] + if measure(expr) <= kwargs['ratio']*measure(self): + return expr + return self + + +############################################################################### +######################## FACTORIAL and MULTI-FACTORIAL ######################## +############################################################################### + + +class factorial(CombinatorialFunction): + r"""Implementation of factorial function over nonnegative integers. 
+ By convention (consistent with the gamma function and the binomial + coefficients), factorial of a negative integer is complex infinity. + + The factorial is very important in combinatorics where it gives + the number of ways in which `n` objects can be permuted. It also + arises in calculus, probability, number theory, etc. + + There is strict relation of factorial with gamma function. In + fact `n! = gamma(n+1)` for nonnegative integers. Rewrite of this + kind is very useful in case of combinatorial simplification. + + Computation of the factorial is done using two algorithms. For + small arguments a precomputed look up table is used. However for bigger + input algorithm Prime-Swing is used. It is the fastest algorithm + known and computes `n!` via prime factorization of special class + of numbers, called here the 'Swing Numbers'. + + Examples + ======== + + >>> from sympy import Symbol, factorial, S + >>> n = Symbol('n', integer=True) + + >>> factorial(0) + 1 + + >>> factorial(7) + 5040 + + >>> factorial(-2) + zoo + + >>> factorial(n) + factorial(n) + + >>> factorial(2*n) + factorial(2*n) + + >>> factorial(S(1)/2) + factorial(1/2) + + See Also + ======== + + factorial2, RisingFactorial, FallingFactorial + """ + + def fdiff(self, argindex=1): + from sympy.functions.special.gamma_functions import (gamma, polygamma) + if argindex == 1: + return gamma(self.args[0] + 1)*polygamma(0, self.args[0] + 1) + else: + raise ArgumentIndexError(self, argindex) + + _small_swing = [ + 1, 1, 1, 3, 3, 15, 5, 35, 35, 315, 63, 693, 231, 3003, 429, 6435, 6435, 109395, + 12155, 230945, 46189, 969969, 88179, 2028117, 676039, 16900975, 1300075, + 35102025, 5014575, 145422675, 9694845, 300540195, 300540195 + ] + + _small_factorials: list[int] = [] + + @classmethod + def _swing(cls, n): + if n < 33: + return cls._small_swing[n] + else: + N, primes = int(_sqrt(n)), [] + + for prime in sieve.primerange(3, N + 1): + p, q = 1, n + + while True: + q //= prime + + if q > 0: + if q & 1 == 1: + p *= prime + else: + break + + if p > 1: + primes.append(p) + + for prime in sieve.primerange(N + 1, n//3 + 1): + if (n // prime) & 1 == 1: + primes.append(prime) + + L_product = prod(sieve.primerange(n//2 + 1, n + 1)) + R_product = prod(primes) + + return L_product*R_product + + @classmethod + def _recursive(cls, n): + if n < 2: + return 1 + else: + return (cls._recursive(n//2)**2)*cls._swing(n) + + @classmethod + def eval(cls, n): + n = sympify(n) + + if n.is_Number: + if n.is_zero: + return S.One + elif n is S.Infinity: + return S.Infinity + elif n.is_Integer: + if n.is_negative: + return S.ComplexInfinity + else: + n = n.p + + if n < 20: + if not cls._small_factorials: + result = 1 + for i in range(1, 20): + result *= i + cls._small_factorials.append(result) + result = cls._small_factorials[n-1] + + # GMPY factorial is faster, use it when available + # + # XXX: There is a sympy.external.gmpy.factorial function + # which provides gmpy.fac if available or the flint version + # if flint is used. It could be used here to avoid the + # conditional logic but it needs to be checked whether the + # pure Python fallback used there is as fast as the + # fallback used here (perhaps the fallback here should be + # moved to sympy.external.ntheory). + elif _gmpy is not None: + result = _gmpy.fac(n) + + else: + bits = bin(n).count('1') + result = cls._recursive(n)*2**(n - bits) + + return Integer(result) + + def _facmod(self, n, q): + res, N = 1, int(_sqrt(n)) + + # Exponent of prime p in n! is e_p(n) = [n/p] + [n/p**2] + ... 
+ # for p > sqrt(n), e_p(n) < sqrt(n), the primes with [n/p] = m, + # occur consecutively and are grouped together in pw[m] for + # simultaneous exponentiation at a later stage + pw = [1]*N + + m = 2 # to initialize the if condition below + for prime in sieve.primerange(2, n + 1): + if m > 1: + m, y = 0, n // prime + while y: + m += y + y //= prime + if m < N: + pw[m] = pw[m]*prime % q + else: + res = res*pow(prime, m, q) % q + + for ex, bs in enumerate(pw): + if ex == 0 or bs == 1: + continue + if bs == 0: + return 0 + res = res*pow(bs, ex, q) % q + + return res + + def _eval_Mod(self, q): + n = self.args[0] + if n.is_integer and n.is_nonnegative and q.is_integer: + aq = abs(q) + d = aq - n + if d.is_nonpositive: + return S.Zero + else: + isprime = aq.is_prime + if d == 1: + # Apply Wilson's theorem (if a natural number n > 1 + # is a prime number, then (n-1)! = -1 mod n) and + # its inverse (if n > 4 is a composite number, then + # (n-1)! = 0 mod n) + if isprime: + return -1 % q + elif isprime is False and (aq - 6).is_nonnegative: + return S.Zero + elif n.is_Integer and q.is_Integer: + n, d, aq = map(int, (n, d, aq)) + if isprime and (d - 1 < n): + fc = self._facmod(d - 1, aq) + fc = pow(fc, aq - 2, aq) + if d%2: + fc = -fc + else: + fc = self._facmod(n, aq) + + return fc % q + + def _eval_rewrite_as_gamma(self, n, piecewise=True, **kwargs): + from sympy.functions.special.gamma_functions import gamma + return gamma(n + 1) + + def _eval_rewrite_as_Product(self, n, **kwargs): + from sympy.concrete.products import Product + if n.is_nonnegative and n.is_integer: + i = Dummy('i', integer=True) + return Product(i, (i, 1, n)) + + def _eval_is_integer(self): + if self.args[0].is_integer and self.args[0].is_nonnegative: + return True + + def _eval_is_positive(self): + if self.args[0].is_integer and self.args[0].is_nonnegative: + return True + + def _eval_is_even(self): + x = self.args[0] + if x.is_integer and x.is_nonnegative: + return (x - 2).is_nonnegative + + def _eval_is_composite(self): + x = self.args[0] + if x.is_integer and x.is_nonnegative: + return (x - 3).is_nonnegative + + def _eval_is_real(self): + x = self.args[0] + if x.is_nonnegative or x.is_noninteger: + return True + + def _eval_as_leading_term(self, x, logx, cdir): + arg = self.args[0].as_leading_term(x) + arg0 = arg.subs(x, 0) + if arg0.is_zero: + return S.One + elif not arg0.is_infinite: + return self.func(arg) + raise PoleError("Cannot expand %s around 0" % (self)) + +class MultiFactorial(CombinatorialFunction): + pass + + +class subfactorial(CombinatorialFunction): + r"""The subfactorial counts the derangements of $n$ items and is + defined for non-negative integers as: + + .. math:: !n = \begin{cases} 1 & n = 0 \\ 0 & n = 1 \\ + (n-1)(!(n-1) + !(n-2)) & n > 1 \end{cases} + + It can also be written as ``int(round(n!/exp(1)))`` but the + recursive definition with caching is implemented for this function. + + An interesting analytic expression is the following [2]_ + + .. math:: !x = \Gamma(x + 1, -1)/e + + which is valid for non-negative integers `x`. The above formula + is not very useful in case of non-integers. `\Gamma(x + 1, -1)` is + single-valued only for integral arguments `x`, elsewhere on the positive + real axis it has an infinite number of branches none of which are real. + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Subfactorial + .. 
[2] https://mathworld.wolfram.com/Subfactorial.html + + Examples + ======== + + >>> from sympy import subfactorial + >>> from sympy.abc import n + >>> subfactorial(n + 1) + subfactorial(n + 1) + >>> subfactorial(5) + 44 + + See Also + ======== + + factorial, uppergamma, + sympy.utilities.iterables.generate_derangements + """ + + @classmethod + @cacheit + def _eval(self, n): + if not n: + return S.One + elif n == 1: + return S.Zero + else: + z1, z2 = 1, 0 + for i in range(2, n + 1): + z1, z2 = z2, (i - 1)*(z2 + z1) + return z2 + + @classmethod + def eval(cls, arg): + if arg.is_Number: + if arg.is_Integer and arg.is_nonnegative: + return cls._eval(arg) + elif arg is S.NaN: + return S.NaN + elif arg is S.Infinity: + return S.Infinity + + def _eval_is_even(self): + if self.args[0].is_odd and self.args[0].is_nonnegative: + return True + + def _eval_is_integer(self): + if self.args[0].is_integer and self.args[0].is_nonnegative: + return True + + def _eval_rewrite_as_factorial(self, arg, **kwargs): + from sympy.concrete.summations import summation + i = Dummy('i') + f = S.NegativeOne**i / factorial(i) + return factorial(arg) * summation(f, (i, 0, arg)) + + def _eval_rewrite_as_gamma(self, arg, piecewise=True, **kwargs): + from sympy.functions.elementary.exponential import exp + from sympy.functions.special.gamma_functions import (gamma, lowergamma) + return (S.NegativeOne**(arg + 1)*exp(-I*pi*arg)*lowergamma(arg + 1, -1) + + gamma(arg + 1))*exp(-1) + + def _eval_rewrite_as_uppergamma(self, arg, **kwargs): + from sympy.functions.special.gamma_functions import uppergamma + return uppergamma(arg + 1, -1)/S.Exp1 + + def _eval_is_nonnegative(self): + if self.args[0].is_integer and self.args[0].is_nonnegative: + return True + + def _eval_is_odd(self): + if self.args[0].is_even and self.args[0].is_nonnegative: + return True + + +class factorial2(CombinatorialFunction): + r"""The double factorial `n!!`, not to be confused with `(n!)!` + + The double factorial is defined for nonnegative integers and for odd + negative integers as: + + .. math:: n!! = \begin{cases} 1 & n = 0 \\ + n(n-2)(n-4) \cdots 1 & n\ \text{positive odd} \\ + n(n-2)(n-4) \cdots 2 & n\ \text{positive even} \\ + (n+2)!!/(n+2) & n\ \text{negative odd} \end{cases} + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Double_factorial + + Examples + ======== + + >>> from sympy import factorial2, var + >>> n = var('n') + >>> n + n + >>> factorial2(n + 1) + factorial2(n + 1) + >>> factorial2(5) + 15 + >>> factorial2(-1) + 1 + >>> factorial2(-5) + 1/3 + + See Also + ======== + + factorial, RisingFactorial, FallingFactorial + """ + + @classmethod + def eval(cls, arg): + # TODO: extend this to complex numbers? 
+ + if arg.is_Number: + if not arg.is_Integer: + raise ValueError("argument must be nonnegative integer " + "or negative odd integer") + + # This implementation is faster than the recursive one + # It also avoids "maximum recursion depth exceeded" runtime error + if arg.is_nonnegative: + if arg.is_even: + k = arg / 2 + return 2**k * factorial(k) + return factorial(arg) / factorial2(arg - 1) + + + if arg.is_odd: + return arg*(S.NegativeOne)**((1 - arg)/2) / factorial2(-arg) + raise ValueError("argument must be nonnegative integer " + "or negative odd integer") + + + def _eval_is_even(self): + # Double factorial is even for every positive even input + n = self.args[0] + if n.is_integer: + if n.is_odd: + return False + if n.is_even: + if n.is_positive: + return True + if n.is_zero: + return False + + def _eval_is_integer(self): + # Double factorial is an integer for every nonnegative input, and for + # -1 and -3 + n = self.args[0] + if n.is_integer: + if (n + 1).is_nonnegative: + return True + if n.is_odd: + return (n + 3).is_nonnegative + + def _eval_is_odd(self): + # Double factorial is odd for every odd input not smaller than -3, and + # for 0 + n = self.args[0] + if n.is_odd: + return (n + 3).is_nonnegative + if n.is_even: + if n.is_positive: + return False + if n.is_zero: + return True + + def _eval_is_positive(self): + # Double factorial is positive for every nonnegative input, and for + # every odd negative input which is of the form -1-4k for an + # nonnegative integer k + n = self.args[0] + if n.is_integer: + if (n + 1).is_nonnegative: + return True + if n.is_odd: + return ((n + 1) / 2).is_even + + def _eval_rewrite_as_gamma(self, n, piecewise=True, **kwargs): + from sympy.functions.elementary.miscellaneous import sqrt + from sympy.functions.elementary.piecewise import Piecewise + from sympy.functions.special.gamma_functions import gamma + return 2**(n/2)*gamma(n/2 + 1) * Piecewise((1, Eq(Mod(n, 2), 0)), + (sqrt(2/pi), Eq(Mod(n, 2), 1))) + + +############################################################################### +######################## RISING and FALLING FACTORIALS ######################## +############################################################################### + + +class RisingFactorial(CombinatorialFunction): + r""" + Rising factorial (also called Pochhammer symbol [1]_) is a double valued + function arising in concrete mathematics, hypergeometric functions + and series expansions. It is defined by: + + .. math:: \texttt{rf(y, k)} = (x)^k = x \cdot (x+1) \cdots (x+k-1) + + where `x` can be arbitrary expression and `k` is an integer. For + more information check "Concrete mathematics" by Graham, pp. 66 + or visit https://mathworld.wolfram.com/RisingFactorial.html page. + + When `x` is a `~.Poly` instance of degree $\ge 1$ with a single variable, + `(x)^k = x(y) \cdot x(y+1) \cdots x(y+k-1)`, where `y` is the + variable of `x`. This is as described in [2]_. + + Examples + ======== + + >>> from sympy import rf, Poly + >>> from sympy.abc import x + >>> rf(x, 0) + 1 + >>> rf(1, 5) + 120 + >>> rf(x, 5) == x*(1 + x)*(2 + x)*(3 + x)*(4 + x) + True + >>> rf(Poly(x**3, x), 2) + Poly(x**6 + 3*x**5 + 3*x**4 + x**3, x, domain='ZZ') + + Rewriting is complicated unless the relationship between + the arguments is known, but rising factorial can + be rewritten in terms of gamma, factorial, binomial, + and falling factorial. 
+ + >>> from sympy import Symbol, factorial, ff, binomial, gamma + >>> n = Symbol('n', integer=True, positive=True) + >>> R = rf(n, n + 2) + >>> for i in (rf, ff, factorial, binomial, gamma): + ... R.rewrite(i) + ... + RisingFactorial(n, n + 2) + FallingFactorial(2*n + 1, n + 2) + factorial(2*n + 1)/factorial(n - 1) + binomial(2*n + 1, n + 2)*factorial(n + 2) + gamma(2*n + 2)/gamma(n) + + See Also + ======== + + factorial, factorial2, FallingFactorial + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Pochhammer_symbol + .. [2] Peter Paule, "Greatest Factorial Factorization and Symbolic + Summation", Journal of Symbolic Computation, vol. 20, pp. 235-268, + 1995. + + """ + + @classmethod + def eval(cls, x, k): + x = sympify(x) + k = sympify(k) + + if x is S.NaN or k is S.NaN: + return S.NaN + elif x is S.One: + return factorial(k) + elif k.is_Integer: + if k.is_zero: + return S.One + else: + if k.is_positive: + if x is S.Infinity: + return S.Infinity + elif x is S.NegativeInfinity: + if k.is_odd: + return S.NegativeInfinity + else: + return S.Infinity + else: + if isinstance(x, Poly): + gens = x.gens + if len(gens)!= 1: + raise ValueError("rf only defined for " + "polynomials on one generator") + else: + return reduce(lambda r, i: + r*(x.shift(i)), + range(int(k)), 1) + else: + return reduce(lambda r, i: r*(x + i), + range(int(k)), 1) + + else: + if x is S.Infinity: + return S.Infinity + elif x is S.NegativeInfinity: + return S.Infinity + else: + if isinstance(x, Poly): + gens = x.gens + if len(gens)!= 1: + raise ValueError("rf only defined for " + "polynomials on one generator") + else: + return 1/reduce(lambda r, i: + r*(x.shift(-i)), + range(1, abs(int(k)) + 1), 1) + else: + return 1/reduce(lambda r, i: + r*(x - i), + range(1, abs(int(k)) + 1), 1) + + if k.is_integer == False: + if x.is_integer and x.is_negative: + return S.Zero + + def _eval_rewrite_as_gamma(self, x, k, piecewise=True, **kwargs): + from sympy.functions.elementary.piecewise import Piecewise + from sympy.functions.special.gamma_functions import gamma + if not piecewise: + if (x <= 0) == True: + return S.NegativeOne**k*gamma(1 - x) / gamma(-k - x + 1) + return gamma(x + k) / gamma(x) + return Piecewise( + (gamma(x + k) / gamma(x), x > 0), + (S.NegativeOne**k*gamma(1 - x) / gamma(-k - x + 1), True)) + + def _eval_rewrite_as_FallingFactorial(self, x, k, **kwargs): + return FallingFactorial(x + k - 1, k) + + def _eval_rewrite_as_factorial(self, x, k, **kwargs): + from sympy.functions.elementary.piecewise import Piecewise + if x.is_integer and k.is_integer: + return Piecewise( + (factorial(k + x - 1)/factorial(x - 1), x > 0), + (S.NegativeOne**k*factorial(-x)/factorial(-k - x), True)) + + def _eval_rewrite_as_binomial(self, x, k, **kwargs): + if k.is_integer: + return factorial(k) * binomial(x + k - 1, k) + + def _eval_rewrite_as_tractable(self, x, k, limitvar=None, **kwargs): + from sympy.functions.special.gamma_functions import gamma + if limitvar: + k_lim = k.subs(limitvar, S.Infinity) + if k_lim is S.Infinity: + return (gamma(x + k).rewrite('tractable', deep=True) / gamma(x)) + elif k_lim is S.NegativeInfinity: + return (S.NegativeOne**k*gamma(1 - x) / gamma(-k - x + 1).rewrite('tractable', deep=True)) + return self.rewrite(gamma).rewrite('tractable', deep=True) + + def _eval_is_integer(self): + return fuzzy_and((self.args[0].is_integer, self.args[1].is_integer, + self.args[1].is_nonnegative)) + + +class FallingFactorial(CombinatorialFunction): + r""" + Falling factorial (related to rising factorial) is a 
double valued + function arising in concrete mathematics, hypergeometric functions + and series expansions. It is defined by + + .. math:: \texttt{ff(x, k)} = (x)_k = x \cdot (x-1) \cdots (x-k+1) + + where `x` can be arbitrary expression and `k` is an integer. For + more information check "Concrete mathematics" by Graham, pp. 66 + or [1]_. + + When `x` is a `~.Poly` instance of degree $\ge 1$ with single variable, + `(x)_k = x(y) \cdot x(y-1) \cdots x(y-k+1)`, where `y` is the + variable of `x`. This is as described in + + >>> from sympy import ff, Poly, Symbol + >>> from sympy.abc import x + >>> n = Symbol('n', integer=True) + + >>> ff(x, 0) + 1 + >>> ff(5, 5) + 120 + >>> ff(x, 5) == x*(x - 1)*(x - 2)*(x - 3)*(x - 4) + True + >>> ff(Poly(x**2, x), 2) + Poly(x**4 - 2*x**3 + x**2, x, domain='ZZ') + >>> ff(n, n) + factorial(n) + + Rewriting is complicated unless the relationship between + the arguments is known, but falling factorial can + be rewritten in terms of gamma, factorial and binomial + and rising factorial. + + >>> from sympy import factorial, rf, gamma, binomial, Symbol + >>> n = Symbol('n', integer=True, positive=True) + >>> F = ff(n, n - 2) + >>> for i in (rf, ff, factorial, binomial, gamma): + ... F.rewrite(i) + ... + RisingFactorial(3, n - 2) + FallingFactorial(n, n - 2) + factorial(n)/2 + binomial(n, n - 2)*factorial(n - 2) + gamma(n + 1)/2 + + See Also + ======== + + factorial, factorial2, RisingFactorial + + References + ========== + + .. [1] https://mathworld.wolfram.com/FallingFactorial.html + .. [2] Peter Paule, "Greatest Factorial Factorization and Symbolic + Summation", Journal of Symbolic Computation, vol. 20, pp. 235-268, + 1995. + + """ + + @classmethod + def eval(cls, x, k): + x = sympify(x) + k = sympify(k) + + if x is S.NaN or k is S.NaN: + return S.NaN + elif k.is_integer and x == k: + return factorial(x) + elif k.is_Integer: + if k.is_zero: + return S.One + else: + if k.is_positive: + if x is S.Infinity: + return S.Infinity + elif x is S.NegativeInfinity: + if k.is_odd: + return S.NegativeInfinity + else: + return S.Infinity + else: + if isinstance(x, Poly): + gens = x.gens + if len(gens)!= 1: + raise ValueError("ff only defined for " + "polynomials on one generator") + else: + return reduce(lambda r, i: + r*(x.shift(-i)), + range(int(k)), 1) + else: + return reduce(lambda r, i: r*(x - i), + range(int(k)), 1) + else: + if x is S.Infinity: + return S.Infinity + elif x is S.NegativeInfinity: + return S.Infinity + else: + if isinstance(x, Poly): + gens = x.gens + if len(gens)!= 1: + raise ValueError("rf only defined for " + "polynomials on one generator") + else: + return 1/reduce(lambda r, i: + r*(x.shift(i)), + range(1, abs(int(k)) + 1), 1) + else: + return 1/reduce(lambda r, i: r*(x + i), + range(1, abs(int(k)) + 1), 1) + + def _eval_rewrite_as_gamma(self, x, k, piecewise=True, **kwargs): + from sympy.functions.elementary.piecewise import Piecewise + from sympy.functions.special.gamma_functions import gamma + if not piecewise: + if (x < 0) == True: + return S.NegativeOne**k*gamma(k - x) / gamma(-x) + return gamma(x + 1) / gamma(x - k + 1) + return Piecewise( + (gamma(x + 1) / gamma(x - k + 1), x >= 0), + (S.NegativeOne**k*gamma(k - x) / gamma(-x), True)) + + def _eval_rewrite_as_RisingFactorial(self, x, k, **kwargs): + return rf(x - k + 1, k) + + def _eval_rewrite_as_binomial(self, x, k, **kwargs): + if k.is_integer: + return factorial(k) * binomial(x, k) + + def _eval_rewrite_as_factorial(self, x, k, **kwargs): + from sympy.functions.elementary.piecewise 
import Piecewise + if x.is_integer and k.is_integer: + return Piecewise( + (factorial(x)/factorial(-k + x), x >= 0), + (S.NegativeOne**k*factorial(k - x - 1)/factorial(-x - 1), True)) + + def _eval_rewrite_as_tractable(self, x, k, limitvar=None, **kwargs): + from sympy.functions.special.gamma_functions import gamma + if limitvar: + k_lim = k.subs(limitvar, S.Infinity) + if k_lim is S.Infinity: + return (S.NegativeOne**k*gamma(k - x).rewrite('tractable', deep=True) / gamma(-x)) + elif k_lim is S.NegativeInfinity: + return (gamma(x + 1) / gamma(x - k + 1).rewrite('tractable', deep=True)) + return self.rewrite(gamma).rewrite('tractable', deep=True) + + def _eval_is_integer(self): + return fuzzy_and((self.args[0].is_integer, self.args[1].is_integer, + self.args[1].is_nonnegative)) + + +rf = RisingFactorial +ff = FallingFactorial + +############################################################################### +########################### BINOMIAL COEFFICIENTS ############################# +############################################################################### + + +class binomial(CombinatorialFunction): + r"""Implementation of the binomial coefficient. It can be defined + in two ways depending on its desired interpretation: + + .. math:: \binom{n}{k} = \frac{n!}{k!(n-k)!}\ \text{or}\ + \binom{n}{k} = \frac{(n)_k}{k!} + + First, in a strict combinatorial sense it defines the + number of ways we can choose `k` elements from a set of + `n` elements. In this case both arguments are nonnegative + integers and binomial is computed using an efficient + algorithm based on prime factorization. + + The other definition is generalization for arbitrary `n`, + however `k` must also be nonnegative. This case is very + useful when evaluating summations. + + For the sake of convenience, for negative integer `k` this function + will return zero no matter the other argument. + + To expand the binomial when `n` is a symbol, use either + ``expand_func()`` or ``expand(func=True)``. The former will keep + the polynomial in factored form while the latter will expand the + polynomial itself. See examples for details. + + Examples + ======== + + >>> from sympy import Symbol, Rational, binomial, expand_func + >>> n = Symbol('n', integer=True, positive=True) + + >>> binomial(15, 8) + 6435 + + >>> binomial(n, -1) + 0 + + Rows of Pascal's triangle can be generated with the binomial function: + + >>> for N in range(8): + ... print([binomial(N, i) for i in range(N + 1)]) + ... + [1] + [1, 1] + [1, 2, 1] + [1, 3, 3, 1] + [1, 4, 6, 4, 1] + [1, 5, 10, 10, 5, 1] + [1, 6, 15, 20, 15, 6, 1] + [1, 7, 21, 35, 35, 21, 7, 1] + + As can a given diagonal, e.g. the 4th diagonal: + + >>> N = -4 + >>> [binomial(N, i) for i in range(1 - N)] + [1, -4, 10, -20, 35] + + >>> binomial(Rational(5, 4), 3) + -5/128 + >>> binomial(Rational(-5, 4), 3) + -195/128 + + >>> binomial(n, 3) + binomial(n, 3) + + >>> binomial(n, 3).expand(func=True) + n**3/6 - n**2/2 + n/3 + + >>> expand_func(binomial(n, 3)) + n*(n - 2)*(n - 1)/6 + + In many cases, we can also compute binomial coefficients modulo a + prime p quickly using Lucas' Theorem [2]_, though we need to include + `evaluate=False` to postpone evaluation: + + >>> from sympy import Mod + >>> Mod(binomial(156675, 4433, evaluate=False), 10**5 + 3) + 28625 + + Using a generalisation of Lucas's Theorem given by Granville [3]_, + we can extend this to arbitrary n: + + >>> Mod(binomial(10**18, 10**12, evaluate=False), (10**5 + 3)**2) + 3744312326 + + References + ========== + + .. 
[1] https://www.johndcook.com/blog/binomial_coefficients/ + .. [2] https://en.wikipedia.org/wiki/Lucas%27s_theorem + .. [3] Binomial coefficients modulo prime powers, Andrew Granville, + Available: https://web.archive.org/web/20170202003812/http://www.dms.umontreal.ca/~andrew/PDF/BinCoeff.pdf + """ + + def fdiff(self, argindex=1): + from sympy.functions.special.gamma_functions import polygamma + if argindex == 1: + # https://functions.wolfram.com/GammaBetaErf/Binomial/20/01/01/ + n, k = self.args + return binomial(n, k)*(polygamma(0, n + 1) - \ + polygamma(0, n - k + 1)) + elif argindex == 2: + # https://functions.wolfram.com/GammaBetaErf/Binomial/20/01/02/ + n, k = self.args + return binomial(n, k)*(polygamma(0, n - k + 1) - \ + polygamma(0, k + 1)) + else: + raise ArgumentIndexError(self, argindex) + + @classmethod + def _eval(self, n, k): + # n.is_Number and k.is_Integer and k != 1 and n != k + + if k.is_Integer: + if n.is_Integer and n >= 0: + n, k = int(n), int(k) + + if k > n: + return S.Zero + elif k > n // 2: + k = n - k + + # XXX: This conditional logic should be moved to + # sympy.external.gmpy and the pure Python version of bincoef + # should be moved to sympy.external.ntheory. + if _gmpy is not None: + return Integer(_gmpy.bincoef(n, k)) + + d, result = n - k, 1 + for i in range(1, k + 1): + d += 1 + result = result * d // i + return Integer(result) + else: + d, result = n - k, 1 + for i in range(1, k + 1): + d += 1 + result *= d + return result / _factorial(k) + + @classmethod + def eval(cls, n, k): + n, k = map(sympify, (n, k)) + d = n - k + n_nonneg, n_isint = n.is_nonnegative, n.is_integer + if k.is_zero or ((n_nonneg or n_isint is False) + and d.is_zero): + return S.One + if (k - 1).is_zero or ((n_nonneg or n_isint is False) + and (d - 1).is_zero): + return n + if k.is_integer: + if k.is_negative or (n_nonneg and n_isint and d.is_negative): + return S.Zero + elif n.is_number: + res = cls._eval(n, k) + return res.expand(basic=True) if res else res + elif n_nonneg is False and n_isint: + # a special case when binomial evaluates to complex infinity + return S.ComplexInfinity + elif k.is_number: + from sympy.functions.special.gamma_functions import gamma + return gamma(n + 1)/(gamma(k + 1)*gamma(n - k + 1)) + + def _eval_Mod(self, q): + n, k = self.args + + if any(x.is_integer is False for x in (n, k, q)): + raise ValueError("Integers expected for binomial Mod") + + if all(x.is_Integer for x in (n, k, q)): + n, k = map(int, (n, k)) + aq, res = abs(q), 1 + + # handle negative integers k or n + if k < 0: + return S.Zero + if n < 0: + n = -n + k - 1 + res = -1 if k%2 else 1 + + # non negative integers k and n + if k > n: + return S.Zero + + isprime = aq.is_prime + aq = int(aq) + if isprime: + if aq < n: + # use Lucas Theorem + N, K = n, k + while N or K: + res = res*binomial(N % aq, K % aq) % aq + N, K = N // aq, K // aq + + else: + # use Factorial Modulo + d = n - k + if k > d: + k, d = d, k + kf = 1 + for i in range(2, k + 1): + kf = kf*i % aq + df = kf + for i in range(k + 1, d + 1): + df = df*i % aq + res *= df + for i in range(d + 1, n + 1): + res = res*i % aq + + res *= pow(kf*df % aq, aq - 2, aq) + res %= aq + + elif _sqrt(q) < k and q != 1: + res = binomial_mod(n, k, q) + + else: + # Binomial Factorization is performed by calculating the + # exponents of primes <= n in `n! /(k! (n - k)!)`, + # for non-negative integers n and k. As the exponent of + # prime in n! is e_p(n) = [n/p] + [n/p**2] + ... 
+ # the exponent of prime in binomial(n, k) would be + # e_p(n) - e_p(k) - e_p(n - k) + M = int(_sqrt(n)) + for prime in sieve.primerange(2, n + 1): + if prime > n - k: + res = res*prime % aq + elif prime > n // 2: + continue + elif prime > M: + if n % prime < k % prime: + res = res*prime % aq + else: + N, K = n, k + exp = a = 0 + + while N > 0: + a = int((N % prime) < (K % prime + a)) + N, K = N // prime, K // prime + exp += a + + if exp > 0: + res *= pow(prime, exp, aq) + res %= aq + + return S(res % q) + + def _eval_expand_func(self, **hints): + """ + Function to expand binomial(n, k) when m is positive integer + Also, + n is self.args[0] and k is self.args[1] while using binomial(n, k) + """ + n = self.args[0] + if n.is_Number: + return binomial(*self.args) + + k = self.args[1] + if (n-k).is_Integer: + k = n - k + + if k.is_Integer: + if k.is_zero: + return S.One + elif k.is_negative: + return S.Zero + else: + n, result = self.args[0], 1 + for i in range(1, k + 1): + result *= n - k + i + return result / _factorial(k) + else: + return binomial(*self.args) + + def _eval_rewrite_as_factorial(self, n, k, **kwargs): + return factorial(n)/(factorial(k)*factorial(n - k)) + + def _eval_rewrite_as_gamma(self, n, k, piecewise=True, **kwargs): + from sympy.functions.special.gamma_functions import gamma + return gamma(n + 1)/(gamma(k + 1)*gamma(n - k + 1)) + + def _eval_rewrite_as_tractable(self, n, k, limitvar=None, **kwargs): + return self._eval_rewrite_as_gamma(n, k).rewrite('tractable') + + def _eval_rewrite_as_FallingFactorial(self, n, k, **kwargs): + if k.is_integer: + return ff(n, k) / factorial(k) + + def _eval_is_integer(self): + n, k = self.args + if n.is_integer and k.is_integer: + return True + elif k.is_integer is False: + return False + + def _eval_is_nonnegative(self): + n, k = self.args + if n.is_integer and k.is_integer: + if n.is_nonnegative or k.is_negative or k.is_even: + return True + elif k.is_even is False: + return False + + def _eval_as_leading_term(self, x, logx, cdir): + from sympy.functions.special.gamma_functions import gamma + return self.rewrite(gamma)._eval_as_leading_term(x, logx=logx, cdir=cdir) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/combinatorial/numbers.py b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/combinatorial/numbers.py new file mode 100644 index 0000000000000000000000000000000000000000..c0dfc518d4a6784712341edaa5731145469a8d1e --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/combinatorial/numbers.py @@ -0,0 +1,3196 @@ +""" +This module implements some special functions that commonly appear in +combinatorial contexts (e.g. in power series); in particular, +sequences of rational numbers such as Bernoulli and Fibonacci numbers. + +Factorials, binomial coefficients and related functions are located in +the separate 'factorials' module. 
+""" +from __future__ import annotations +from math import prod +from collections import defaultdict +from typing import Callable + +from sympy.core import S, Symbol, Add, Dummy +from sympy.core.cache import cacheit +from sympy.core.containers import Dict +from sympy.core.expr import Expr +from sympy.core.function import ArgumentIndexError, DefinedFunction, expand_mul +from sympy.core.logic import fuzzy_not +from sympy.core.mul import Mul +from sympy.core.numbers import E, I, pi, oo, Rational, Integer +from sympy.core.relational import Eq, is_le, is_gt, is_lt +from sympy.external.gmpy import SYMPY_INTS, remove, lcm, legendre, jacobi, kronecker +from sympy.functions.combinatorial.factorials import (binomial, + factorial, subfactorial) +from sympy.functions.elementary.exponential import log +from sympy.functions.elementary.piecewise import Piecewise +from sympy.ntheory.factor_ import (factorint, _divisor_sigma, is_carmichael, + find_carmichael_numbers_in_range, find_first_n_carmichaels) +from sympy.ntheory.generate import _primepi +from sympy.ntheory.partitions_ import _partition, _partition_rec +from sympy.ntheory.primetest import isprime, is_square +from sympy.polys.appellseqs import bernoulli_poly, euler_poly, genocchi_poly +from sympy.polys.polytools import cancel +from sympy.utilities.enumerative import MultisetPartitionTraverser +from sympy.utilities.exceptions import sympy_deprecation_warning +from sympy.utilities.iterables import multiset, multiset_derangements, iterable +from sympy.utilities.memoization import recurrence_memo +from sympy.utilities.misc import as_int + +from mpmath import mp, workprec +from mpmath.libmp import ifib as _ifib + + +def _product(a, b): + return prod(range(a, b + 1)) + + +# Dummy symbol used for computing polynomial sequences +_sym = Symbol('x') + + +#----------------------------------------------------------------------------# +# # +# Carmichael numbers # +# # +#----------------------------------------------------------------------------# + +class carmichael(DefinedFunction): + r""" + Carmichael Numbers: + + Certain cryptographic algorithms make use of big prime numbers. + However, checking whether a big number is prime is not so easy. + Randomized prime number checking tests exist that offer a high degree of + confidence of accurate determination at low cost, such as the Fermat test. + + Let 'a' be a random number between $2$ and $n - 1$, where $n$ is the + number whose primality we are testing. Then, $n$ is probably prime if it + satisfies the modular arithmetic congruence relation: + + .. math :: a^{n-1} = 1 \pmod{n} + + (where mod refers to the modulo operation) + + If a number passes the Fermat test several times, then it is prime with a + high probability. + + Unfortunately, certain composite numbers (non-primes) still pass the Fermat + test with every number smaller than themselves. + These numbers are called Carmichael numbers. + + A Carmichael number will pass a Fermat primality test to every base $b$ + relatively prime to the number, even though it is not actually prime. + This makes tests based on Fermat's Little Theorem less effective than + strong probable prime tests such as the Baillie-PSW primality test and + the Miller-Rabin primality test. 
+ + Examples + ======== + + >>> from sympy.ntheory.factor_ import find_first_n_carmichaels, find_carmichael_numbers_in_range + >>> find_first_n_carmichaels(5) + [561, 1105, 1729, 2465, 2821] + >>> find_carmichael_numbers_in_range(0, 562) + [561] + >>> find_carmichael_numbers_in_range(0,1000) + [561] + >>> find_carmichael_numbers_in_range(0,2000) + [561, 1105, 1729] + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Carmichael_number + .. [2] https://en.wikipedia.org/wiki/Fermat_primality_test + .. [3] https://www.jstor.org/stable/23248683?seq=1#metadata_info_tab_contents + """ + + @staticmethod + def is_perfect_square(n): + sympy_deprecation_warning( + """ +is_perfect_square is just a wrapper around sympy.ntheory.primetest.is_square +so use that directly instead. + """, + deprecated_since_version="1.11", + active_deprecations_target='deprecated-carmichael-static-methods', + ) + return is_square(n) + + @staticmethod + def divides(p, n): + sympy_deprecation_warning( + """ + divides can be replaced by directly testing n % p == 0. + """, + deprecated_since_version="1.11", + active_deprecations_target='deprecated-carmichael-static-methods', + ) + return n % p == 0 + + @staticmethod + def is_prime(n): + sympy_deprecation_warning( + """ +is_prime is just a wrapper around sympy.ntheory.primetest.isprime so use that +directly instead. + """, + deprecated_since_version="1.11", + active_deprecations_target='deprecated-carmichael-static-methods', + ) + return isprime(n) + + @staticmethod + def is_carmichael(n): + sympy_deprecation_warning( + """ +is_carmichael is just a wrapper around sympy.ntheory.factor_.is_carmichael so use that +directly instead. + """, + deprecated_since_version="1.13", + active_deprecations_target='deprecated-ntheory-symbolic-functions', + ) + return is_carmichael(n) + + @staticmethod + def find_carmichael_numbers_in_range(x, y): + sympy_deprecation_warning( + """ +find_carmichael_numbers_in_range is just a wrapper around sympy.ntheory.factor_.find_carmichael_numbers_in_range so use that +directly instead. + """, + deprecated_since_version="1.13", + active_deprecations_target='deprecated-ntheory-symbolic-functions', + ) + return find_carmichael_numbers_in_range(x, y) + + @staticmethod + def find_first_n_carmichaels(n): + sympy_deprecation_warning( + """ +find_first_n_carmichaels is just a wrapper around sympy.ntheory.factor_.find_first_n_carmichaels so use that +directly instead. + """, + deprecated_since_version="1.13", + active_deprecations_target='deprecated-ntheory-symbolic-functions', + ) + return find_first_n_carmichaels(n) + + +#----------------------------------------------------------------------------# +# # +# Fibonacci numbers # +# # +#----------------------------------------------------------------------------# + + +class fibonacci(DefinedFunction): + r""" + Fibonacci numbers / Fibonacci polynomials + + The Fibonacci numbers are the integer sequence defined by the + initial terms `F_0 = 0`, `F_1 = 1` and the two-term recurrence + relation `F_n = F_{n-1} + F_{n-2}`. This definition + extended to arbitrary real and complex arguments using + the formula + + .. math :: F_z = \frac{\phi^z - \cos(\pi z) \phi^{-z}}{\sqrt 5} + + The Fibonacci polynomials are defined by `F_1(x) = 1`, + `F_2(x) = x`, and `F_n(x) = x*F_{n-1}(x) + F_{n-2}(x)` for `n > 2`. + For all positive integers `n`, `F_n(1) = F_n`. 
+ + * ``fibonacci(n)`` gives the `n^{th}` Fibonacci number, `F_n` + * ``fibonacci(n, x)`` gives the `n^{th}` Fibonacci polynomial in `x`, `F_n(x)` + + Examples + ======== + + >>> from sympy import fibonacci, Symbol + + >>> [fibonacci(x) for x in range(11)] + [0, 1, 1, 2, 3, 5, 8, 13, 21, 34, 55] + >>> fibonacci(5, Symbol('t')) + t**4 + 3*t**2 + 1 + + See Also + ======== + + bell, bernoulli, catalan, euler, harmonic, lucas, genocchi, partition, tribonacci + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Fibonacci_number + .. [2] https://mathworld.wolfram.com/FibonacciNumber.html + + """ + + @staticmethod + def _fib(n): + return _ifib(n) + + @staticmethod + @recurrence_memo([None, S.One, _sym]) + def _fibpoly(n, prev): + return (prev[-2] + _sym*prev[-1]).expand() + + @classmethod + def eval(cls, n, sym=None): + if n is S.Infinity: + return S.Infinity + + if n.is_Integer: + if sym is None: + n = int(n) + if n < 0: + return S.NegativeOne**(n + 1) * fibonacci(-n) + else: + return Integer(cls._fib(n)) + else: + if n < 1: + raise ValueError("Fibonacci polynomials are defined " + "only for positive integer indices.") + return cls._fibpoly(n).subs(_sym, sym) + + def _eval_rewrite_as_tractable(self, n, **kwargs): + from sympy.functions import sqrt, cos + return (S.GoldenRatio**n - cos(S.Pi*n)/S.GoldenRatio**n)/sqrt(5) + + def _eval_rewrite_as_sqrt(self, n, **kwargs): + from sympy.functions.elementary.miscellaneous import sqrt + return 2**(-n)*sqrt(5)*((1 + sqrt(5))**n - (-sqrt(5) + 1)**n) / 5 + + def _eval_rewrite_as_GoldenRatio(self,n, **kwargs): + return (S.GoldenRatio**n - 1/(-S.GoldenRatio)**n)/(2*S.GoldenRatio-1) + + +#----------------------------------------------------------------------------# +# # +# Lucas numbers # +# # +#----------------------------------------------------------------------------# + + +class lucas(DefinedFunction): + """ + Lucas numbers + + Lucas numbers satisfy a recurrence relation similar to that of + the Fibonacci sequence, in which each term is the sum of the + preceding two. They are generated by choosing the initial + values `L_0 = 2` and `L_1 = 1`. + + * ``lucas(n)`` gives the `n^{th}` Lucas number + + Examples + ======== + + >>> from sympy import lucas + + >>> [lucas(x) for x in range(11)] + [2, 1, 3, 4, 7, 11, 18, 29, 47, 76, 123] + + See Also + ======== + + bell, bernoulli, catalan, euler, fibonacci, harmonic, genocchi, partition, tribonacci + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Lucas_number + .. [2] https://mathworld.wolfram.com/LucasNumber.html + + """ + + @classmethod + def eval(cls, n): + if n is S.Infinity: + return S.Infinity + + if n.is_Integer: + return fibonacci(n + 1) + fibonacci(n - 1) + + def _eval_rewrite_as_sqrt(self, n, **kwargs): + from sympy.functions.elementary.miscellaneous import sqrt + return 2**(-n)*((1 + sqrt(5))**n + (-sqrt(5) + 1)**n) + + +#----------------------------------------------------------------------------# +# # +# Tribonacci numbers # +# # +#----------------------------------------------------------------------------# + + +class tribonacci(DefinedFunction): + r""" + Tribonacci numbers / Tribonacci polynomials + + The Tribonacci numbers are the integer sequence defined by the + initial terms `T_0 = 0`, `T_1 = 1`, `T_2 = 1` and the three-term + recurrence relation `T_n = T_{n-1} + T_{n-2} + T_{n-3}`. + + The Tribonacci polynomials are defined by `T_0(x) = 0`, `T_1(x) = 1`, + `T_2(x) = x^2`, and `T_n(x) = x^2 T_{n-1}(x) + x T_{n-2}(x) + T_{n-3}(x)` + for `n > 2`. 
For all positive integers `n`, `T_n(1) = T_n`. + + * ``tribonacci(n)`` gives the `n^{th}` Tribonacci number, `T_n` + * ``tribonacci(n, x)`` gives the `n^{th}` Tribonacci polynomial in `x`, `T_n(x)` + + Examples + ======== + + >>> from sympy import tribonacci, Symbol + + >>> [tribonacci(x) for x in range(11)] + [0, 1, 1, 2, 4, 7, 13, 24, 44, 81, 149] + >>> tribonacci(5, Symbol('t')) + t**8 + 3*t**5 + 3*t**2 + + See Also + ======== + + bell, bernoulli, catalan, euler, fibonacci, harmonic, lucas, genocchi, partition + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Generalizations_of_Fibonacci_numbers#Tribonacci_numbers + .. [2] https://mathworld.wolfram.com/TribonacciNumber.html + .. [3] https://oeis.org/A000073 + + """ + + @staticmethod + @recurrence_memo([S.Zero, S.One, S.One]) + def _trib(n, prev): + return (prev[-3] + prev[-2] + prev[-1]) + + @staticmethod + @recurrence_memo([S.Zero, S.One, _sym**2]) + def _tribpoly(n, prev): + return (prev[-3] + _sym*prev[-2] + _sym**2*prev[-1]).expand() + + @classmethod + def eval(cls, n, sym=None): + if n is S.Infinity: + return S.Infinity + + if n.is_Integer: + n = int(n) + if n < 0: + raise ValueError("Tribonacci polynomials are defined " + "only for non-negative integer indices.") + if sym is None: + return Integer(cls._trib(n)) + else: + return cls._tribpoly(n).subs(_sym, sym) + + def _eval_rewrite_as_sqrt(self, n, **kwargs): + from sympy.functions.elementary.miscellaneous import cbrt, sqrt + w = (-1 + S.ImaginaryUnit * sqrt(3)) / 2 + a = (1 + cbrt(19 + 3*sqrt(33)) + cbrt(19 - 3*sqrt(33))) / 3 + b = (1 + w*cbrt(19 + 3*sqrt(33)) + w**2*cbrt(19 - 3*sqrt(33))) / 3 + c = (1 + w**2*cbrt(19 + 3*sqrt(33)) + w*cbrt(19 - 3*sqrt(33))) / 3 + Tn = (a**(n + 1)/((a - b)*(a - c)) + + b**(n + 1)/((b - a)*(b - c)) + + c**(n + 1)/((c - a)*(c - b))) + return Tn + + def _eval_rewrite_as_TribonacciConstant(self, n, **kwargs): + from sympy.functions.elementary.integers import floor + from sympy.functions.elementary.miscellaneous import cbrt, sqrt + b = cbrt(586 + 102*sqrt(33)) + Tn = 3 * b * S.TribonacciConstant**n / (b**2 - 2*b + 4) + return floor(Tn + S.Half) + + +#----------------------------------------------------------------------------# +# # +# Bernoulli numbers # +# # +#----------------------------------------------------------------------------# + + +class bernoulli(DefinedFunction): + r""" + Bernoulli numbers / Bernoulli polynomials / Bernoulli function + + The Bernoulli numbers are a sequence of rational numbers + defined by `B_0 = 1` and the recursive relation (`n > 0`): + + .. math :: n+1 = \sum_{k=0}^n \binom{n+1}{k} B_k + + They are also commonly defined by their exponential generating + function, which is `\frac{x}{1 - e^{-x}}`. For odd indices > 1, + the Bernoulli numbers are zero. + + The Bernoulli polynomials satisfy the analogous formula: + + .. math :: B_n(x) = \sum_{k=0}^n (-1)^k \binom{n}{k} B_k x^{n-k} + + Bernoulli numbers and Bernoulli polynomials are related as + `B_n(1) = B_n`. + + The generalized Bernoulli function `\operatorname{B}(s, a)` + is defined for any complex `s` and `a`, except where `a` is a + nonpositive integer and `s` is not a nonnegative integer. It is + an entire function of `s` for fixed `a`, related to the Hurwitz + zeta function by + + .. math:: \operatorname{B}(s, a) = \begin{cases} + -s \zeta(1-s, a) & s \ne 0 \\ 1 & s = 0 \end{cases} + + When `s` is a nonnegative integer this function reduces to the + Bernoulli polynomials: `\operatorname{B}(n, x) = B_n(x)`. 
When + `a` is omitted it is assumed to be 1, yielding the (ordinary) + Bernoulli function which interpolates the Bernoulli numbers and is + related to the Riemann zeta function. + + We compute Bernoulli numbers using Ramanujan's formula: + + .. math :: B_n = \frac{A(n) - S(n)}{\binom{n+3}{n}} + + where: + + .. math :: A(n) = \begin{cases} \frac{n+3}{3} & + n \equiv 0\ \text{or}\ 2 \pmod{6} \\ + -\frac{n+3}{6} & n \equiv 4 \pmod{6} \end{cases} + + and: + + .. math :: S(n) = \sum_{k=1}^{[n/6]} \binom{n+3}{n-6k} B_{n-6k} + + This formula is similar to the sum given in the definition, but + cuts `\frac{2}{3}` of the terms. For Bernoulli polynomials, we use + Appell sequences. + + For `n` a nonnegative integer and `s`, `a`, `x` arbitrary complex numbers, + + * ``bernoulli(n)`` gives the nth Bernoulli number, `B_n` + * ``bernoulli(s)`` gives the Bernoulli function `\operatorname{B}(s)` + * ``bernoulli(n, x)`` gives the nth Bernoulli polynomial in `x`, `B_n(x)` + * ``bernoulli(s, a)`` gives the generalized Bernoulli function + `\operatorname{B}(s, a)` + + .. versionchanged:: 1.12 + ``bernoulli(1)`` gives `+\frac{1}{2}` instead of `-\frac{1}{2}`. + This choice of value confers several theoretical advantages [5]_, + including the extension to complex parameters described above + which this function now implements. The previous behavior, defined + only for nonnegative integers `n`, can be obtained with + ``(-1)**n*bernoulli(n)``. + + Examples + ======== + + >>> from sympy import bernoulli + >>> from sympy.abc import x + >>> [bernoulli(n) for n in range(11)] + [1, 1/2, 1/6, 0, -1/30, 0, 1/42, 0, -1/30, 0, 5/66] + >>> bernoulli(1000001) + 0 + >>> bernoulli(3, x) + x**3 - 3*x**2/2 + x/2 + + See Also + ======== + + andre, bell, catalan, euler, fibonacci, harmonic, lucas, genocchi, + partition, tribonacci, sympy.polys.appellseqs.bernoulli_poly + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Bernoulli_number + .. [2] https://en.wikipedia.org/wiki/Bernoulli_polynomial + .. [3] https://mathworld.wolfram.com/BernoulliNumber.html + .. [4] https://mathworld.wolfram.com/BernoulliPolynomial.html + .. [5] Peter Luschny, "The Bernoulli Manifesto", + https://luschny.de/math/zeta/The-Bernoulli-Manifesto.html + .. 
[6] Peter Luschny, "An introduction to the Bernoulli function", + https://arxiv.org/abs/2009.06743 + + """ + + args: tuple[Integer] + + # Calculates B_n for positive even n + @staticmethod + def _calc_bernoulli(n): + s = 0 + a = int(binomial(n + 3, n - 6)) + for j in range(1, n//6 + 1): + s += a * bernoulli(n - 6*j) + # Avoid computing each binomial coefficient from scratch + a *= _product(n - 6 - 6*j + 1, n - 6*j) + a //= _product(6*j + 4, 6*j + 9) + if n % 6 == 4: + s = -Rational(n + 3, 6) - s + else: + s = Rational(n + 3, 3) - s + return s / binomial(n + 3, n) + + # We implement a specialized memoization scheme to handle each + # case modulo 6 separately + _cache = {0: S.One, 1: Rational(1, 2), 2: Rational(1, 6), 4: Rational(-1, 30)} + _highest = {0: 0, 1: 1, 2: 2, 4: 4} + + @classmethod + def eval(cls, n, x=None): + if x is S.One: + return cls(n) + elif n.is_zero: + return S.One + elif n.is_integer is False or n.is_nonnegative is False: + if x is not None and x.is_Integer and x.is_nonpositive: + return S.NaN + return + # Bernoulli numbers + elif x is None: + if n is S.One: + return S.Half + elif n.is_odd and (n-1).is_positive: + return S.Zero + elif n.is_Number: + n = int(n) + # Use mpmath for enormous Bernoulli numbers + if n > 500: + p, q = mp.bernfrac(n) + return Rational(int(p), int(q)) + case = n % 6 + highest_cached = cls._highest[case] + if n <= highest_cached: + return cls._cache[n] + # To avoid excessive recursion when, say, bernoulli(1000) is + # requested, calculate and cache the entire sequence ... B_988, + # B_994, B_1000 in increasing order + for i in range(highest_cached + 6, n + 6, 6): + b = cls._calc_bernoulli(i) + cls._cache[i] = b + cls._highest[case] = i + return b + # Bernoulli polynomials + elif n.is_Number: + return bernoulli_poly(n, x) + + def _eval_rewrite_as_zeta(self, n, x=1, **kwargs): + from sympy.functions.special.zeta_functions import zeta + return Piecewise((1, Eq(n, 0)), (-n * zeta(1-n, x), True)) + + def _eval_evalf(self, prec): + if not all(x.is_number for x in self.args): + return + n = self.args[0]._to_mpmath(prec) + x = (self.args[1] if len(self.args) > 1 else S.One)._to_mpmath(prec) + with workprec(prec): + if n == 0: + res = mp.mpf(1) + elif n == 1: + res = x - mp.mpf(0.5) + elif mp.isint(n) and n >= 0: + res = mp.bernoulli(n) if x == 1 else mp.bernpoly(n, x) + else: + res = -n * mp.zeta(1-n, x) + return Expr._from_mpmath(res, prec) + + +#----------------------------------------------------------------------------# +# # +# Bell numbers # +# # +#----------------------------------------------------------------------------# + + +class bell(DefinedFunction): + r""" + Bell numbers / Bell polynomials + + The Bell numbers satisfy `B_0 = 1` and + + .. math:: B_n = \sum_{k=0}^{n-1} \binom{n-1}{k} B_k. + + They are also given by: + + .. math:: B_n = \frac{1}{e} \sum_{k=0}^{\infty} \frac{k^n}{k!}. + + The Bell polynomials are given by `B_0(x) = 1` and + + .. math:: B_n(x) = x \sum_{k=1}^{n-1} \binom{n-1}{k-1} B_{k-1}(x). + + The second kind of Bell polynomials (are sometimes called "partial" Bell + polynomials or incomplete Bell polynomials) are defined as + + .. math:: B_{n,k}(x_1, x_2,\dotsc x_{n-k+1}) = + \sum_{j_1+j_2+j_2+\dotsb=k \atop j_1+2j_2+3j_2+\dotsb=n} + \frac{n!}{j_1!j_2!\dotsb j_{n-k+1}!} + \left(\frac{x_1}{1!} \right)^{j_1} + \left(\frac{x_2}{2!} \right)^{j_2} \dotsb + \left(\frac{x_{n-k+1}}{(n-k+1)!} \right) ^{j_{n-k+1}}. + + * ``bell(n)`` gives the `n^{th}` Bell number, `B_n`. + * ``bell(n, x)`` gives the `n^{th}` Bell polynomial, `B_n(x)`. 
+ * ``bell(n, k, (x1, x2, ...))`` gives Bell polynomials of the second kind, + `B_{n,k}(x_1, x_2, \dotsc, x_{n-k+1})`. + + Notes + ===== + + Not to be confused with Bernoulli numbers and Bernoulli polynomials, + which use the same notation. + + Examples + ======== + + >>> from sympy import bell, Symbol, symbols + + >>> [bell(n) for n in range(11)] + [1, 1, 2, 5, 15, 52, 203, 877, 4140, 21147, 115975] + >>> bell(30) + 846749014511809332450147 + >>> bell(4, Symbol('t')) + t**4 + 6*t**3 + 7*t**2 + t + >>> bell(6, 2, symbols('x:6')[1:]) + 6*x1*x5 + 15*x2*x4 + 10*x3**2 + + See Also + ======== + + bernoulli, catalan, euler, fibonacci, harmonic, lucas, genocchi, partition, tribonacci + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Bell_number + .. [2] https://mathworld.wolfram.com/BellNumber.html + .. [3] https://mathworld.wolfram.com/BellPolynomial.html + + """ + + @staticmethod + @recurrence_memo([1, 1]) + def _bell(n, prev): + s = 1 + a = 1 + for k in range(1, n): + a = a * (n - k) // k + s += a * prev[k] + return s + + @staticmethod + @recurrence_memo([S.One, _sym]) + def _bell_poly(n, prev): + s = 1 + a = 1 + for k in range(2, n + 1): + a = a * (n - k + 1) // (k - 1) + s += a * prev[k - 1] + return expand_mul(_sym * s) + + @staticmethod + def _bell_incomplete_poly(n, k, symbols): + r""" + The second kind of Bell polynomials (incomplete Bell polynomials). + + Calculated by recurrence formula: + + .. math:: B_{n,k}(x_1, x_2, \dotsc, x_{n-k+1}) = + \sum_{m=1}^{n-k+1} + \x_m \binom{n-1}{m-1} B_{n-m,k-1}(x_1, x_2, \dotsc, x_{n-m-k}) + + where + `B_{0,0} = 1;` + `B_{n,0} = 0; for n \ge 1` + `B_{0,k} = 0; for k \ge 1` + + """ + if (n == 0) and (k == 0): + return S.One + elif (n == 0) or (k == 0): + return S.Zero + s = S.Zero + a = S.One + for m in range(1, n - k + 2): + s += a * bell._bell_incomplete_poly( + n - m, k - 1, symbols) * symbols[m - 1] + a = a * (n - m) / m + return expand_mul(s) + + @classmethod + def eval(cls, n, k_sym=None, symbols=None): + if n is S.Infinity: + if k_sym is None: + return S.Infinity + else: + raise ValueError("Bell polynomial is not defined") + + if n.is_negative or n.is_integer is False: + raise ValueError("a non-negative integer expected") + + if n.is_Integer and n.is_nonnegative: + if k_sym is None: + return Integer(cls._bell(int(n))) + elif symbols is None: + return cls._bell_poly(int(n)).subs(_sym, k_sym) + else: + r = cls._bell_incomplete_poly(int(n), int(k_sym), symbols) + return r + + def _eval_rewrite_as_Sum(self, n, k_sym=None, symbols=None, **kwargs): + from sympy.concrete.summations import Sum + if (k_sym is not None) or (symbols is not None): + return self + + # Dobinski's formula + if not n.is_nonnegative: + return self + k = Dummy('k', integer=True, nonnegative=True) + return 1 / E * Sum(k**n / factorial(k), (k, 0, S.Infinity)) + + +#----------------------------------------------------------------------------# +# # +# Harmonic numbers # +# # +#----------------------------------------------------------------------------# + + +class harmonic(DefinedFunction): + r""" + Harmonic numbers + + The nth harmonic number is given by `\operatorname{H}_{n} = + 1 + \frac{1}{2} + \frac{1}{3} + \ldots + \frac{1}{n}`. + + More generally: + + .. math:: \operatorname{H}_{n,m} = \sum_{k=1}^{n} \frac{1}{k^m} + + As `n \rightarrow \infty`, `\operatorname{H}_{n,m} \rightarrow \zeta(m)`, + the Riemann zeta function. 
+ + * ``harmonic(n)`` gives the nth harmonic number, `\operatorname{H}_n` + + * ``harmonic(n, m)`` gives the nth generalized harmonic number + of order `m`, `\operatorname{H}_{n,m}`, where + ``harmonic(n) == harmonic(n, 1)`` + + This function can be extended to complex `n` and `m` where `n` is not a + negative integer or `m` is a nonpositive integer as + + .. math:: \operatorname{H}_{n,m} = \begin{cases} \zeta(m) - \zeta(m, n+1) + & m \ne 1 \\ \psi(n+1) + \gamma & m = 1 \end{cases} + + Examples + ======== + + >>> from sympy import harmonic, oo + + >>> [harmonic(n) for n in range(6)] + [0, 1, 3/2, 11/6, 25/12, 137/60] + >>> [harmonic(n, 2) for n in range(6)] + [0, 1, 5/4, 49/36, 205/144, 5269/3600] + >>> harmonic(oo, 2) + pi**2/6 + + >>> from sympy import Symbol, Sum + >>> n = Symbol("n") + + >>> harmonic(n).rewrite(Sum) + Sum(1/_k, (_k, 1, n)) + + We can evaluate harmonic numbers for all integral and positive + rational arguments: + + >>> from sympy import S, expand_func, simplify + >>> harmonic(8) + 761/280 + >>> harmonic(11) + 83711/27720 + + >>> H = harmonic(1/S(3)) + >>> H + harmonic(1/3) + >>> He = expand_func(H) + >>> He + -log(6) - sqrt(3)*pi/6 + 2*Sum(log(sin(_k*pi/3))*cos(2*_k*pi/3), (_k, 1, 1)) + + 3*Sum(1/(3*_k + 1), (_k, 0, 0)) + >>> He.doit() + -log(6) - sqrt(3)*pi/6 - log(sqrt(3)/2) + 3 + >>> H = harmonic(25/S(7)) + >>> He = simplify(expand_func(H).doit()) + >>> He + log(sin(2*pi/7)**(2*cos(16*pi/7))/(14*sin(pi/7)**(2*cos(pi/7))*cos(pi/14)**(2*sin(pi/14)))) + pi*tan(pi/14)/2 + 30247/9900 + >>> He.n(40) + 1.983697455232980674869851942390639915940 + >>> harmonic(25/S(7)).n(40) + 1.983697455232980674869851942390639915940 + + We can rewrite harmonic numbers in terms of polygamma functions: + + >>> from sympy import digamma, polygamma + >>> m = Symbol("m", integer=True, positive=True) + + >>> harmonic(n).rewrite(digamma) + polygamma(0, n + 1) + EulerGamma + + >>> harmonic(n).rewrite(polygamma) + polygamma(0, n + 1) + EulerGamma + + >>> harmonic(n,3).rewrite(polygamma) + polygamma(2, n + 1)/2 + zeta(3) + + >>> simplify(harmonic(n,m).rewrite(polygamma)) + Piecewise((polygamma(0, n + 1) + EulerGamma, Eq(m, 1)), + (-(-1)**m*polygamma(m - 1, n + 1)/factorial(m - 1) + zeta(m), True)) + + Integer offsets in the argument can be pulled out: + + >>> from sympy import expand_func + + >>> expand_func(harmonic(n+4)) + harmonic(n) + 1/(n + 4) + 1/(n + 3) + 1/(n + 2) + 1/(n + 1) + + >>> expand_func(harmonic(n-4)) + harmonic(n) - 1/(n - 1) - 1/(n - 2) - 1/(n - 3) - 1/n + + Some limits can be computed as well: + + >>> from sympy import limit, oo + + >>> limit(harmonic(n), n, oo) + oo + + >>> limit(harmonic(n, 2), n, oo) + pi**2/6 + + >>> limit(harmonic(n, 3), n, oo) + zeta(3) + + For `m > 1`, `H_{n,m}` tends to `\zeta(m)` in the limit of infinite `n`: + + >>> m = Symbol("m", positive=True) + >>> limit(harmonic(n, m+1), n, oo) + zeta(m + 1) + + See Also + ======== + + bell, bernoulli, catalan, euler, fibonacci, lucas, genocchi, partition, tribonacci + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Harmonic_number + .. [2] https://functions.wolfram.com/GammaBetaErf/HarmonicNumber/ + .. [3] https://functions.wolfram.com/GammaBetaErf/HarmonicNumber2/ + + """ + + # This prevents redundant recalculations and speeds up harmonic number computations. 
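+ # A rough sketch of how the cache is used: each order m gets its own
+ # recurrence_memo-backed function of n, created lazily in eval(), so
+ # harmonic(5, 2) fills the m=2 table once with
+ #   1 + 1/4 + 1/9 + 1/16 + 1/25 = 5269/3600
+ # and later calls with the same order simply extend or reuse that table.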
+ harmonic_cache: dict[Integer, Callable[[int], Rational]] = {} + + @classmethod + def eval(cls, n, m=None): + from sympy.functions.special.zeta_functions import zeta + if m is S.One: + return cls(n) + if m is None: + m = S.One + if n.is_zero: + return S.Zero + elif m.is_zero: + return n + elif n is S.Infinity: + if m.is_negative: + return S.NaN + elif is_le(m, S.One): + return S.Infinity + elif is_gt(m, S.One): + return zeta(m) + elif m.is_Integer and m.is_nonpositive: + return (bernoulli(1-m, n+1) - bernoulli(1-m)) / (1-m) + elif n.is_Integer: + if n.is_negative and (m.is_integer is False or m.is_nonpositive is False): + return S.ComplexInfinity if m is S.One else S.NaN + if n.is_nonnegative: + if m.is_Integer: + if m not in cls.harmonic_cache: + @recurrence_memo([0]) + def f(n, prev): + return prev[-1] + S.One / n**m + cls.harmonic_cache[m] = f + return cls.harmonic_cache[m](int(n)) + return Add(*(k**(-m) for k in range(1, int(n) + 1))) + + def _eval_rewrite_as_polygamma(self, n, m=S.One, **kwargs): + from sympy.functions.special.gamma_functions import gamma, polygamma + if m.is_integer and m.is_positive: + return Piecewise((polygamma(0, n+1) + S.EulerGamma, Eq(m, 1)), + (S.NegativeOne**m * (polygamma(m-1, 1) - polygamma(m-1, n+1)) / + gamma(m), True)) + + def _eval_rewrite_as_digamma(self, n, m=1, **kwargs): + from sympy.functions.special.gamma_functions import polygamma + return self.rewrite(polygamma) + + def _eval_rewrite_as_trigamma(self, n, m=1, **kwargs): + from sympy.functions.special.gamma_functions import polygamma + return self.rewrite(polygamma) + + def _eval_rewrite_as_Sum(self, n, m=None, **kwargs): + from sympy.concrete.summations import Sum + k = Dummy("k", integer=True) + if m is None: + m = S.One + return Sum(k**(-m), (k, 1, n)) + + def _eval_rewrite_as_zeta(self, n, m=S.One, **kwargs): + from sympy.functions.special.zeta_functions import zeta + from sympy.functions.special.gamma_functions import digamma + return Piecewise((digamma(n + 1) + S.EulerGamma, Eq(m, 1)), + (zeta(m) - zeta(m, n+1), True)) + + def _eval_expand_func(self, **hints): + from sympy.concrete.summations import Sum + n = self.args[0] + m = self.args[1] if len(self.args) == 2 else 1 + + if m == S.One: + if n.is_Add: + off = n.args[0] + nnew = n - off + if off.is_Integer and off.is_positive: + result = [S.One/(nnew + i) for i in range(off, 0, -1)] + [harmonic(nnew)] + return Add(*result) + elif off.is_Integer and off.is_negative: + result = [-S.One/(nnew + i) for i in range(0, off, -1)] + [harmonic(nnew)] + return Add(*result) + + if n.is_Rational: + # Expansions for harmonic numbers at general rational arguments (u + p/q) + # Split n as u + p/q with p < q + p, q = n.as_numer_denom() + u = p // q + p = p - u * q + if u.is_nonnegative and p.is_positive and q.is_positive and p < q: + from sympy.functions.elementary.exponential import log + from sympy.functions.elementary.integers import floor + from sympy.functions.elementary.trigonometric import sin, cos, cot + k = Dummy("k") + t1 = q * Sum(1 / (q * k + p), (k, 0, u)) + t2 = 2 * Sum(cos((2 * pi * p * k) / S(q)) * + log(sin((pi * k) / S(q))), + (k, 1, floor((q - 1) / S(2)))) + t3 = (pi / 2) * cot((pi * p) / q) + log(2 * q) + return t1 + t2 - t3 + + return self + + def _eval_rewrite_as_tractable(self, n, m=1, limitvar=None, **kwargs): + from sympy.functions.special.zeta_functions import zeta + from sympy.functions.special.gamma_functions import polygamma + pg = self.rewrite(polygamma) + if not isinstance(pg, harmonic): + return pg.rewrite("tractable", 
deep=True) + arg = m - S.One + if arg.is_nonzero: + return (zeta(m) - zeta(m, n+1)).rewrite("tractable", deep=True) + + def _eval_evalf(self, prec): + if not all(x.is_number for x in self.args): + return + n = self.args[0]._to_mpmath(prec) + m = (self.args[1] if len(self.args) > 1 else S.One)._to_mpmath(prec) + if mp.isint(n) and n < 0: + return S.NaN + with workprec(prec): + if m == 1: + res = mp.harmonic(n) + else: + res = mp.zeta(m) - mp.zeta(m, n+1) + return Expr._from_mpmath(res, prec) + + def fdiff(self, argindex=1): + from sympy.functions.special.zeta_functions import zeta + if len(self.args) == 2: + n, m = self.args + else: + n, m = self.args + (1,) + if argindex == 1: + return m * zeta(m+1, n+1) + else: + raise ArgumentIndexError + + +#----------------------------------------------------------------------------# +# # +# Euler numbers # +# # +#----------------------------------------------------------------------------# + + +class euler(DefinedFunction): + r""" + Euler numbers / Euler polynomials / Euler function + + The Euler numbers are given by: + + .. math:: E_{2n} = I \sum_{k=1}^{2n+1} \sum_{j=0}^k \binom{k}{j} + \frac{(-1)^j (k-2j)^{2n+1}}{2^k I^k k} + + .. math:: E_{2n+1} = 0 + + Euler numbers and Euler polynomials are related by + + .. math:: E_n = 2^n E_n\left(\frac{1}{2}\right). + + We compute symbolic Euler polynomials using Appell sequences, + but numerical evaluation of the Euler polynomial is computed + more efficiently (and more accurately) using the mpmath library. + + The Euler polynomials are special cases of the generalized Euler function, + related to the Genocchi function as + + .. math:: \operatorname{E}(s, a) = -\frac{\operatorname{G}(s+1, a)}{s+1} + + with the limit of `\psi\left(\frac{a+1}{2}\right) - \psi\left(\frac{a}{2}\right)` + being taken when `s = -1`. The (ordinary) Euler function interpolating + the Euler numbers is then obtained as + `\operatorname{E}(s) = 2^s \operatorname{E}\left(s, \frac{1}{2}\right)`. + + * ``euler(n)`` gives the nth Euler number `E_n`. + * ``euler(s)`` gives the Euler function `\operatorname{E}(s)`. + * ``euler(n, x)`` gives the nth Euler polynomial `E_n(x)`. + * ``euler(s, a)`` gives the generalized Euler function `\operatorname{E}(s, a)`. + + Examples + ======== + + >>> from sympy import euler, Symbol, S + >>> [euler(n) for n in range(10)] + [1, 0, -1, 0, 5, 0, -61, 0, 1385, 0] + >>> [2**n*euler(n,1) for n in range(10)] + [1, 1, 0, -2, 0, 16, 0, -272, 0, 7936] + >>> n = Symbol("n") + >>> euler(n + 2*n) + euler(3*n) + + >>> x = Symbol("x") + >>> euler(n, x) + euler(n, x) + + >>> euler(0, x) + 1 + >>> euler(1, x) + x - 1/2 + >>> euler(2, x) + x**2 - x + >>> euler(3, x) + x**3 - 3*x**2/2 + 1/4 + >>> euler(4, x) + x**4 - 2*x**3 + x + + >>> euler(12, S.Half) + 2702765/4096 + >>> euler(12) + 2702765 + + See Also + ======== + + andre, bell, bernoulli, catalan, fibonacci, harmonic, lucas, genocchi, + partition, tribonacci, sympy.polys.appellseqs.euler_poly + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Euler_numbers + .. [2] https://mathworld.wolfram.com/EulerNumber.html + .. [3] https://en.wikipedia.org/wiki/Alternating_permutation + .. 
[4] https://mathworld.wolfram.com/AlternatingPermutation.html + + """ + + @classmethod + def eval(cls, n, x=None): + if n.is_zero: + return S.One + elif n is S.NegativeOne: + if x is None: + return S.Pi/2 + from sympy.functions.special.gamma_functions import digamma + return digamma((x+1)/2) - digamma(x/2) + elif n.is_integer is False or n.is_nonnegative is False: + return + # Euler numbers + elif x is None: + if n.is_odd and n.is_positive: + return S.Zero + elif n.is_Number: + from mpmath import mp + n = n._to_mpmath(mp.prec) + res = mp.eulernum(n, exact=True) + return Integer(res) + # Euler polynomials + elif n.is_Number: + from sympy.core.evalf import pure_complex + n = int(n) + reim = pure_complex(x, or_real=True) + if reim and all(a.is_Float or a.is_Integer for a in reim) \ + and any(a.is_Float for a in reim): + from mpmath import mp + prec = min([a._prec for a in reim if a.is_Float]) + with workprec(prec): + res = mp.eulerpoly(n, x) + return Expr._from_mpmath(res, prec) + return euler_poly(n, x) + + def _eval_rewrite_as_Sum(self, n, x=None, **kwargs): + from sympy.concrete.summations import Sum + if x is None and n.is_even: + k = Dummy("k", integer=True) + j = Dummy("j", integer=True) + n = n / 2 + Em = (S.ImaginaryUnit * Sum(Sum(binomial(k, j) * (S.NegativeOne**j * + (k - 2*j)**(2*n + 1)) / + (2**k*S.ImaginaryUnit**k * k), (j, 0, k)), (k, 1, 2*n + 1))) + return Em + if x: + k = Dummy("k", integer=True) + return Sum(binomial(n, k)*euler(k)/2**k*(x - S.Half)**(n - k), (k, 0, n)) + + def _eval_rewrite_as_genocchi(self, n, x=None, **kwargs): + if x is None: + return Piecewise((S.Pi/2, Eq(n, -1)), + (-2**n * genocchi(n+1, S.Half) / (n+1), True)) + from sympy.functions.special.gamma_functions import digamma + return Piecewise((digamma((x+1)/2) - digamma(x/2), Eq(n, -1)), + (-genocchi(n+1, x) / (n+1), True)) + + def _eval_evalf(self, prec): + if not all(i.is_number for i in self.args): + return + from mpmath import mp + m, x = (self.args[0], None) if len(self.args) == 1 else self.args + m = m._to_mpmath(prec) + if x is not None: + x = x._to_mpmath(prec) + with workprec(prec): + if mp.isint(m) and m >= 0: + res = mp.eulernum(m) if x is None else mp.eulerpoly(m, x) + else: + if m == -1: + res = mp.pi if x is None else mp.digamma((x+1)/2) - mp.digamma(x/2) + else: + y = 0.5 if x is None else x + res = 2 * (mp.zeta(-m, y) - 2**(m+1) * mp.zeta(-m, (y+1)/2)) + if x is None: + res *= 2**m + return Expr._from_mpmath(res, prec) + + +#----------------------------------------------------------------------------# +# # +# Catalan numbers # +# # +#----------------------------------------------------------------------------# + + +class catalan(DefinedFunction): + r""" + Catalan numbers + + The `n^{th}` catalan number is given by: + + .. math :: C_n = \frac{1}{n+1} \binom{2n}{n} + + * ``catalan(n)`` gives the `n^{th}` Catalan number, `C_n` + + Examples + ======== + + >>> from sympy import (Symbol, binomial, gamma, hyper, + ... 
catalan, diff, combsimp, Rational, I) + + >>> [catalan(i) for i in range(1,10)] + [1, 2, 5, 14, 42, 132, 429, 1430, 4862] + + >>> n = Symbol("n", integer=True) + + >>> catalan(n) + catalan(n) + + Catalan numbers can be transformed into several other, identical + expressions involving other mathematical functions + + >>> catalan(n).rewrite(binomial) + binomial(2*n, n)/(n + 1) + + >>> catalan(n).rewrite(gamma) + 4**n*gamma(n + 1/2)/(sqrt(pi)*gamma(n + 2)) + + >>> catalan(n).rewrite(hyper) + hyper((-n, 1 - n), (2,), 1) + + For some non-integer values of n we can get closed form + expressions by rewriting in terms of gamma functions: + + >>> catalan(Rational(1, 2)).rewrite(gamma) + 8/(3*pi) + + We can differentiate the Catalan numbers C(n) interpreted as a + continuous real function in n: + + >>> diff(catalan(n), n) + (polygamma(0, n + 1/2) - polygamma(0, n + 2) + log(4))*catalan(n) + + As a more advanced example consider the following ratio + between consecutive numbers: + + >>> combsimp((catalan(n + 1)/catalan(n)).rewrite(binomial)) + 2*(2*n + 1)/(n + 2) + + The Catalan numbers can be generalized to complex numbers: + + >>> catalan(I).rewrite(gamma) + 4**I*gamma(1/2 + I)/(sqrt(pi)*gamma(2 + I)) + + and evaluated with arbitrary precision: + + >>> catalan(I).evalf(20) + 0.39764993382373624267 - 0.020884341620842555705*I + + See Also + ======== + + andre, bell, bernoulli, euler, fibonacci, harmonic, lucas, genocchi, + partition, tribonacci, sympy.functions.combinatorial.factorials.binomial + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Catalan_number + .. [2] https://mathworld.wolfram.com/CatalanNumber.html + .. [3] https://functions.wolfram.com/GammaBetaErf/CatalanNumber/ + .. [4] http://geometer.org/mathcircles/catalan.pdf + + """ + + @classmethod + def eval(cls, n): + from sympy.functions.special.gamma_functions import gamma + if (n.is_Integer and n.is_nonnegative) or \ + (n.is_noninteger and n.is_negative): + return 4**n*gamma(n + S.Half)/(gamma(S.Half)*gamma(n + 2)) + + if (n.is_integer and n.is_negative): + if (n + 1).is_negative: + return S.Zero + if (n + 1).is_zero: + return Rational(-1, 2) + + def fdiff(self, argindex=1): + from sympy.functions.elementary.exponential import log + from sympy.functions.special.gamma_functions import polygamma + n = self.args[0] + return catalan(n)*(polygamma(0, n + S.Half) - polygamma(0, n + 2) + log(4)) + + def _eval_rewrite_as_binomial(self, n, **kwargs): + return binomial(2*n, n)/(n + 1) + + def _eval_rewrite_as_factorial(self, n, **kwargs): + return factorial(2*n) / (factorial(n+1) * factorial(n)) + + def _eval_rewrite_as_gamma(self, n, piecewise=True, **kwargs): + from sympy.functions.special.gamma_functions import gamma + # The gamma function allows to generalize Catalan numbers to complex n + return 4**n*gamma(n + S.Half)/(gamma(S.Half)*gamma(n + 2)) + + def _eval_rewrite_as_hyper(self, n, **kwargs): + from sympy.functions.special.hyper import hyper + return hyper([1 - n, -n], [2], 1) + + def _eval_rewrite_as_Product(self, n, **kwargs): + from sympy.concrete.products import Product + if not (n.is_integer and n.is_nonnegative): + return self + k = Dummy('k', integer=True, positive=True) + return Product((n + k) / k, (k, 2, n)) + + def _eval_is_integer(self): + if self.args[0].is_integer and self.args[0].is_nonnegative: + return True + + def _eval_is_positive(self): + if self.args[0].is_nonnegative: + return True + + def _eval_is_composite(self): + if self.args[0].is_integer and (self.args[0] - 3).is_positive: + return True + + 
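+ # Sanity check for the compositeness test above (illustration only): the only
+ # prime Catalan numbers are C_2 = 2 and C_3 = 5, and every C_n with n >= 4 is
+ # composite, e.g. [catalan(n) for n in range(4, 8)] == [14, 42, 132, 429].
+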
def _eval_evalf(self, prec): + from sympy.functions.special.gamma_functions import gamma + if self.args[0].is_number: + return self.rewrite(gamma)._eval_evalf(prec) + + +#----------------------------------------------------------------------------# +# # +# Genocchi numbers # +# # +#----------------------------------------------------------------------------# + + +class genocchi(DefinedFunction): + r""" + Genocchi numbers / Genocchi polynomials / Genocchi function + + The Genocchi numbers are a sequence of integers `G_n` that satisfy the + relation: + + .. math:: \frac{-2t}{1 + e^{-t}} = \sum_{n=0}^\infty \frac{G_n t^n}{n!} + + They are related to the Bernoulli numbers by + + .. math:: G_n = 2 (1 - 2^n) B_n + + and generalize like the Bernoulli numbers to the Genocchi polynomials and + function as + + .. math:: \operatorname{G}(s, a) = 2 \left(\operatorname{B}(s, a) - + 2^s \operatorname{B}\left(s, \frac{a+1}{2}\right)\right) + + .. versionchanged:: 1.12 + ``genocchi(1)`` gives `-1` instead of `1`. + + Examples + ======== + + >>> from sympy import genocchi, Symbol + >>> [genocchi(n) for n in range(9)] + [0, -1, -1, 0, 1, 0, -3, 0, 17] + >>> n = Symbol('n', integer=True, positive=True) + >>> genocchi(2*n + 1) + 0 + >>> x = Symbol('x') + >>> genocchi(4, x) + -4*x**3 + 6*x**2 - 1 + + See Also + ======== + + bell, bernoulli, catalan, euler, fibonacci, harmonic, lucas, partition, tribonacci + sympy.polys.appellseqs.genocchi_poly + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Genocchi_number + .. [2] https://mathworld.wolfram.com/GenocchiNumber.html + .. [3] Peter Luschny, "An introduction to the Bernoulli function", + https://arxiv.org/abs/2009.06743 + + """ + + @classmethod + def eval(cls, n, x=None): + if x is S.One: + return cls(n) + elif n.is_integer is False or n.is_nonnegative is False: + return + # Genocchi numbers + elif x is None: + if n.is_odd and (n-1).is_positive: + return S.Zero + elif n.is_Number: + return 2 * (1-S(2)**n) * bernoulli(n) + # Genocchi polynomials + elif n.is_Number: + return genocchi_poly(n, x) + + def _eval_rewrite_as_bernoulli(self, n, x=1, **kwargs): + if x == 1 and n.is_integer and n.is_nonnegative: + return 2 * (1-S(2)**n) * bernoulli(n) + return 2 * (bernoulli(n, x) - 2**n * bernoulli(n, (x+1) / 2)) + + def _eval_rewrite_as_dirichlet_eta(self, n, x=1, **kwargs): + from sympy.functions.special.zeta_functions import dirichlet_eta + return -2*n * dirichlet_eta(1-n, x) + + def _eval_is_integer(self): + if len(self.args) > 1 and self.args[1] != 1: + return + n = self.args[0] + if n.is_integer and n.is_nonnegative: + return True + + def _eval_is_negative(self): + if len(self.args) > 1 and self.args[1] != 1: + return + n = self.args[0] + if n.is_integer and n.is_nonnegative: + if n.is_odd: + return fuzzy_not((n-1).is_positive) + return (n/2).is_odd + + def _eval_is_positive(self): + if len(self.args) > 1 and self.args[1] != 1: + return + n = self.args[0] + if n.is_integer and n.is_nonnegative: + if n.is_zero or n.is_odd: + return False + return (n/2).is_even + + def _eval_is_even(self): + if len(self.args) > 1 and self.args[1] != 1: + return + n = self.args[0] + if n.is_integer and n.is_nonnegative: + if n.is_even: + return n.is_zero + return (n-1).is_positive + + def _eval_is_odd(self): + if len(self.args) > 1 and self.args[1] != 1: + return + n = self.args[0] + if n.is_integer and n.is_nonnegative: + if n.is_even: + return fuzzy_not(n.is_zero) + return fuzzy_not((n-1).is_positive) + + def _eval_is_prime(self): + if len(self.args) > 1 and 
self.args[1] != 1: + return + n = self.args[0] + # only G_6 = -3 and G_8 = 17 are prime, + # but SymPy does not consider negatives as prime + # so only n=8 is tested + return (n-8).is_zero + + def _eval_evalf(self, prec): + if all(i.is_number for i in self.args): + return self.rewrite(bernoulli)._eval_evalf(prec) + + +#----------------------------------------------------------------------------# +# # +# Andre numbers # +# # +#----------------------------------------------------------------------------# + + +class andre(DefinedFunction): + r""" + Andre numbers / Andre function + + The Andre number `\mathcal{A}_n` is Luschny's name for half the number of + *alternating permutations* on `n` elements, where a permutation is alternating + if adjacent elements alternately compare "greater" and "smaller" going from + left to right. For example, `2 < 3 > 1 < 4` is an alternating permutation. + + This sequence is A000111 in the OEIS, which assigns the names *up/down numbers* + and *Euler zigzag numbers*. It satisfies a recurrence relation similar to that + for the Catalan numbers, with `\mathcal{A}_0 = 1` and + + .. math:: 2 \mathcal{A}_{n+1} = \sum_{k=0}^n \binom{n}{k} \mathcal{A}_k \mathcal{A}_{n-k} + + The Bernoulli and Euler numbers are signed transformations of the odd- and + even-indexed elements of this sequence respectively: + + .. math :: \operatorname{B}_{2k} = \frac{2k \mathcal{A}_{2k-1}}{(-4)^k - (-16)^k} + + .. math :: \operatorname{E}_{2k} = (-1)^k \mathcal{A}_{2k} + + Like the Bernoulli and Euler numbers, the Andre numbers are interpolated by the + entire Andre function: + + .. math :: \mathcal{A}(s) = (-i)^{s+1} \operatorname{Li}_{-s}(i) + + i^{s+1} \operatorname{Li}_{-s}(-i) = \\ \frac{2 \Gamma(s+1)}{(2\pi)^{s+1}} + (\zeta(s+1, 1/4) - \zeta(s+1, 3/4) \cos{\pi s}) + + Examples + ======== + + >>> from sympy import andre, euler, bernoulli + >>> [andre(n) for n in range(11)] + [1, 1, 1, 2, 5, 16, 61, 272, 1385, 7936, 50521] + >>> [(-1)**k * andre(2*k) for k in range(7)] + [1, -1, 5, -61, 1385, -50521, 2702765] + >>> [euler(2*k) for k in range(7)] + [1, -1, 5, -61, 1385, -50521, 2702765] + >>> [andre(2*k-1) * (2*k) / ((-4)**k - (-16)**k) for k in range(1, 8)] + [1/6, -1/30, 1/42, -1/30, 5/66, -691/2730, 7/6] + >>> [bernoulli(2*k) for k in range(1, 8)] + [1/6, -1/30, 1/42, -1/30, 5/66, -691/2730, 7/6] + + See Also + ======== + + bernoulli, catalan, euler, sympy.polys.appellseqs.andre_poly + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Alternating_permutation + .. [2] https://mathworld.wolfram.com/EulerZigzagNumber.html + .. 
[3] Peter Luschny, "An introduction to the Bernoulli function", + https://arxiv.org/abs/2009.06743 + """ + + @classmethod + def eval(cls, n): + if n is S.NaN: + return S.NaN + elif n is S.Infinity: + return S.Infinity + if n.is_zero: + return S.One + elif n == -1: + return -log(2) + elif n == -2: + return -2*S.Catalan + elif n.is_Integer: + if n.is_nonnegative and n.is_even: + return abs(euler(n)) + elif n.is_odd: + from sympy.functions.special.zeta_functions import zeta + m = -n-1 + return I**m * Rational(1-2**m, 4**m) * zeta(-n) + + def _eval_rewrite_as_zeta(self, s, **kwargs): + from sympy.functions.elementary.trigonometric import cos + from sympy.functions.special.gamma_functions import gamma + from sympy.functions.special.zeta_functions import zeta + return 2 * gamma(s+1) / (2*pi)**(s+1) * \ + (zeta(s+1, S.One/4) - cos(pi*s) * zeta(s+1, S(3)/4)) + + def _eval_rewrite_as_polylog(self, s, **kwargs): + from sympy.functions.special.zeta_functions import polylog + return (-I)**(s+1) * polylog(-s, I) + I**(s+1) * polylog(-s, -I) + + def _eval_is_integer(self): + n = self.args[0] + if n.is_integer and n.is_nonnegative: + return True + + def _eval_is_positive(self): + if self.args[0].is_nonnegative: + return True + + def _eval_evalf(self, prec): + if not self.args[0].is_number: + return + s = self.args[0]._to_mpmath(prec+12) + with workprec(prec+12): + sp, cp = mp.sinpi(s/2), mp.cospi(s/2) + res = 2*mp.dirichlet(-s, (-sp, cp, sp, -cp)) + return Expr._from_mpmath(res, prec) + + +#----------------------------------------------------------------------------# +# # +# Partition numbers # +# # +#----------------------------------------------------------------------------# + +class partition(DefinedFunction): + r""" + Partition numbers + + The Partition numbers are a sequence of integers `p_n` that represent the + number of distinct ways of representing `n` as a sum of natural numbers + (with order irrelevant). The generating function for `p_n` is given by: + + .. math:: \sum_{n=0}^\infty p_n x^n = \prod_{k=1}^\infty (1 - x^k)^{-1} + + Examples + ======== + + >>> from sympy import partition, Symbol + >>> [partition(n) for n in range(9)] + [1, 1, 2, 3, 5, 7, 11, 15, 22] + >>> n = Symbol('n', integer=True, negative=True) + >>> partition(n) + 0 + + See Also + ======== + + bell, bernoulli, catalan, euler, fibonacci, harmonic, lucas, genocchi, tribonacci + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Partition_(number_theory%29 + .. [2] https://en.wikipedia.org/wiki/Pentagonal_number_theorem + + """ + is_integer = True + is_nonnegative = True + + @classmethod + def eval(cls, n): + if n.is_integer is False: + raise TypeError("n should be an integer") + if n.is_negative is True: + return S.Zero + if n.is_zero is True or n is S.One: + return S.One + if n.is_Integer is True: + return S(_partition(as_int(n))) + + def _eval_is_positive(self): + if self.args[0].is_nonnegative is True: + return True + + def _eval_Mod(self, q): + # Ramanujan's congruences + n = self.args[0] + for p, rem in [(5, 4), (7, 5), (11, 6)]: + if q == p and n % q == rem: + return S.Zero + + +class divisor_sigma(DefinedFunction): + r""" + Calculate the divisor function `\sigma_k(n)` for positive integer n + + ``divisor_sigma(n, k)`` is equal to ``sum([x**k for x in divisors(n)])`` + + If n's prime factorization is: + + .. math :: + n = \prod_{i=1}^\omega p_i^{m_i}, + + then + + .. math :: + \sigma_k(n) = \prod_{i=1}^\omega (1+p_i^k+p_i^{2k}+\cdots + + p_i^{m_ik}). 
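+
+ For instance, `12 = 2^2 \cdot 3` gives
+
+ .. math ::
+ \sigma_2(12) = (1 + 2^2 + 2^4)(1 + 3^2) = 21 \cdot 10 = 210,
+
+ which matches the doctest below.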
+ + Examples + ======== + + >>> from sympy.functions.combinatorial.numbers import divisor_sigma + >>> divisor_sigma(18, 0) + 6 + >>> divisor_sigma(39, 1) + 56 + >>> divisor_sigma(12, 2) + 210 + >>> divisor_sigma(37) + 38 + + See Also + ======== + + sympy.ntheory.factor_.divisor_count, totient, sympy.ntheory.factor_.divisors, sympy.ntheory.factor_.factorint + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Divisor_function + + """ + is_integer = True + is_positive = True + + @classmethod + def eval(cls, n, k=S.One): + if n.is_integer is False: + raise TypeError("n should be an integer") + if n.is_positive is False: + raise ValueError("n should be a positive integer") + if k.is_integer is False: + raise TypeError("k should be an integer") + if k.is_nonnegative is False: + raise ValueError("k should be a nonnegative integer") + if n.is_prime is True: + return 1 + n**k + if n is S.One: + return S.One + if n.is_Integer is True: + if k.is_zero is True: + return Mul(*[e + 1 for e in factorint(n).values()]) + if k.is_Integer is True: + return S(_divisor_sigma(as_int(n), as_int(k))) + if k.is_zero is False: + return Mul(*[cancel((p**(k*(e + 1)) - 1) / (p**k - 1)) for p, e in factorint(n).items()]) + + +class udivisor_sigma(DefinedFunction): + r""" + Calculate the unitary divisor function `\sigma_k^*(n)` for positive integer n + + ``udivisor_sigma(n, k)`` is equal to ``sum([x**k for x in udivisors(n)])`` + + If n's prime factorization is: + + .. math :: + n = \prod_{i=1}^\omega p_i^{m_i}, + + then + + .. math :: + \sigma_k^*(n) = \prod_{i=1}^\omega (1+ p_i^{m_ik}). + + Parameters + ========== + + k : power of divisors in the sum + + for k = 0, 1: + ``udivisor_sigma(n, 0)`` is equal to ``udivisor_count(n)`` + ``udivisor_sigma(n, 1)`` is equal to ``sum(udivisors(n))`` + + Default for k is 1. + + Examples + ======== + + >>> from sympy.functions.combinatorial.numbers import udivisor_sigma + >>> udivisor_sigma(18, 0) + 4 + >>> udivisor_sigma(74, 1) + 114 + >>> udivisor_sigma(36, 3) + 47450 + >>> udivisor_sigma(111) + 152 + + See Also + ======== + + sympy.ntheory.factor_.divisor_count, totient, sympy.ntheory.factor_.divisors, + sympy.ntheory.factor_.udivisors, sympy.ntheory.factor_.udivisor_count, divisor_sigma, + sympy.ntheory.factor_.factorint + + References + ========== + + .. [1] https://mathworld.wolfram.com/UnitaryDivisorFunction.html + + """ + is_integer = True + is_positive = True + + @classmethod + def eval(cls, n, k=S.One): + if n.is_integer is False: + raise TypeError("n should be an integer") + if n.is_positive is False: + raise ValueError("n should be a positive integer") + if k.is_integer is False: + raise TypeError("k should be an integer") + if k.is_nonnegative is False: + raise ValueError("k should be a nonnegative integer") + if n.is_prime is True: + return 1 + n**k + if n.is_Integer: + return Mul(*[1+p**(k*e) for p, e in factorint(n).items()]) + + +class legendre_symbol(DefinedFunction): + r""" + Returns the Legendre symbol `(a / p)`. + + For an integer ``a`` and an odd prime ``p``, the Legendre symbol is + defined as + + .. 
math :: + \genfrac(){}{}{a}{p} = \begin{cases} + 0 & \text{if } p \text{ divides } a\\ + 1 & \text{if } a \text{ is a quadratic residue modulo } p\\ + -1 & \text{if } a \text{ is a quadratic nonresidue modulo } p + \end{cases} + + Examples + ======== + + >>> from sympy.functions.combinatorial.numbers import legendre_symbol + >>> [legendre_symbol(i, 7) for i in range(7)] + [0, 1, 1, -1, 1, -1, -1] + >>> sorted(set([i**2 % 7 for i in range(7)])) + [0, 1, 2, 4] + + See Also + ======== + + sympy.ntheory.residue_ntheory.is_quad_residue, jacobi_symbol + + """ + is_integer = True + is_prime = False + + @classmethod + def eval(cls, a, p): + if a.is_integer is False: + raise TypeError("a should be an integer") + if p.is_integer is False: + raise TypeError("p should be an integer") + if p.is_prime is False or p.is_odd is False: + raise ValueError("p should be an odd prime integer") + if (a % p).is_zero is True: + return S.Zero + if a is S.One: + return S.One + if a.is_Integer is True and p.is_Integer is True: + return S(legendre(as_int(a), as_int(p))) + + +class jacobi_symbol(DefinedFunction): + r""" + Returns the Jacobi symbol `(m / n)`. + + For any integer ``m`` and any positive odd integer ``n`` the Jacobi symbol + is defined as the product of the Legendre symbols corresponding to the + prime factors of ``n``: + + .. math :: + \genfrac(){}{}{m}{n} = + \genfrac(){}{}{m}{p^{1}}^{\alpha_1} + \genfrac(){}{}{m}{p^{2}}^{\alpha_2} + ... + \genfrac(){}{}{m}{p^{k}}^{\alpha_k} + \text{ where } n = + p_1^{\alpha_1} + p_2^{\alpha_2} + ... + p_k^{\alpha_k} + + Like the Legendre symbol, if the Jacobi symbol `\genfrac(){}{}{m}{n} = -1` + then ``m`` is a quadratic nonresidue modulo ``n``. + + But, unlike the Legendre symbol, if the Jacobi symbol + `\genfrac(){}{}{m}{n} = 1` then ``m`` may or may not be a quadratic residue + modulo ``n``. + + Examples + ======== + + >>> from sympy.functions.combinatorial.numbers import jacobi_symbol, legendre_symbol + >>> from sympy import S + >>> jacobi_symbol(45, 77) + -1 + >>> jacobi_symbol(60, 121) + 1 + + The relationship between the ``jacobi_symbol`` and ``legendre_symbol`` can + be demonstrated as follows: + + >>> L = legendre_symbol + >>> S(45).factors() + {3: 2, 5: 1} + >>> jacobi_symbol(7, 45) == L(7, 3)**2 * L(7, 5)**1 + True + + See Also + ======== + + sympy.ntheory.residue_ntheory.is_quad_residue, legendre_symbol + + """ + is_integer = True + is_prime = False + + @classmethod + def eval(cls, m, n): + if m.is_integer is False: + raise TypeError("m should be an integer") + if n.is_integer is False: + raise TypeError("n should be an integer") + if n.is_positive is False or n.is_odd is False: + raise ValueError("n should be an odd positive integer") + if m is S.One or n is S.One: + return S.One + if (m % n).is_zero is True: + return S.Zero + if m.is_Integer is True and n.is_Integer is True: + return S(jacobi(as_int(m), as_int(n))) + + +class kronecker_symbol(DefinedFunction): + r""" + Returns the Kronecker symbol `(a / n)`. + + Examples + ======== + + >>> from sympy.functions.combinatorial.numbers import kronecker_symbol + >>> kronecker_symbol(45, 77) + -1 + >>> kronecker_symbol(13, -120) + 1 + + See Also + ======== + + jacobi_symbol, legendre_symbol + + References + ========== + + .. 
[1] https://en.wikipedia.org/wiki/Kronecker_symbol + + """ + is_integer = True + is_prime = False + + @classmethod + def eval(cls, a, n): + if a.is_integer is False: + raise TypeError("a should be an integer") + if n.is_integer is False: + raise TypeError("n should be an integer") + if a is S.One or n is S.One: + return S.One + if a.is_Integer is True and n.is_Integer is True: + return S(kronecker(as_int(a), as_int(n))) + + +class mobius(DefinedFunction): + """ + Mobius function maps natural number to {-1, 0, 1} + + It is defined as follows: + 1) `1` if `n = 1`. + 2) `0` if `n` has a squared prime factor. + 3) `(-1)^k` if `n` is a square-free positive integer with `k` + number of prime factors. + + It is an important multiplicative function in number theory + and combinatorics. It has applications in mathematical series, + algebraic number theory and also physics (Fermion operator has very + concrete realization with Mobius Function model). + + Examples + ======== + + >>> from sympy.functions.combinatorial.numbers import mobius + >>> mobius(13*7) + 1 + >>> mobius(1) + 1 + >>> mobius(13*7*5) + -1 + >>> mobius(13**2) + 0 + + Even in the case of a symbol, if it clearly contains a squared prime factor, it will be zero. + + >>> from sympy import Symbol + >>> n = Symbol("n", integer=True, positive=True) + >>> mobius(4*n) + 0 + >>> mobius(n**2) + 0 + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/M%C3%B6bius_function + .. [2] Thomas Koshy "Elementary Number Theory with Applications" + .. [3] https://oeis.org/A008683 + + """ + is_integer = True + is_prime = False + + @classmethod + def eval(cls, n): + if n.is_integer is False: + raise TypeError("n should be an integer") + if n.is_positive is False: + raise ValueError("n should be a positive integer") + if n.is_prime is True: + return S.NegativeOne + if n is S.One: + return S.One + result = None + for m, e in (_.as_base_exp() for _ in Mul.make_args(n)): + if m.is_integer is True and m.is_positive is True and \ + e.is_integer is True and e.is_positive is True: + lt = is_lt(S.One, e) # 1 < e + if lt is True: + result = S.Zero + elif m.is_Integer is True: + factors = factorint(m) + if any(v > 1 for v in factors.values()): + result = S.Zero + elif lt is False: + s = S.NegativeOne if len(factors) % 2 else S.One + if result is None: + result = s + else: + result *= s + else: + return + return result + + +class primenu(DefinedFunction): + r""" + Calculate the number of distinct prime factors for a positive integer n. + + If n's prime factorization is: + + .. math :: + n = \prod_{i=1}^k p_i^{m_i}, + + then ``primenu(n)`` or `\nu(n)` is: + + .. math :: + \nu(n) = k. + + Examples + ======== + + >>> from sympy.functions.combinatorial.numbers import primenu + >>> primenu(1) + 0 + >>> primenu(30) + 3 + + See Also + ======== + + sympy.ntheory.factor_.factorint + + References + ========== + + .. [1] https://mathworld.wolfram.com/PrimeFactor.html + .. [2] https://oeis.org/A001221 + + """ + is_integer = True + is_nonnegative = True + + @classmethod + def eval(cls, n): + if n.is_integer is False: + raise TypeError("n should be an integer") + if n.is_positive is False: + raise ValueError("n should be a positive integer") + if n.is_prime is True: + return S.One + if n is S.One: + return S.Zero + if n.is_Integer is True: + return S(len(factorint(n))) + + +class primeomega(DefinedFunction): + r""" + Calculate the number of prime factors counting multiplicities for a + positive integer n. + + If n's prime factorization is: + + .. 
math :: + n = \prod_{i=1}^k p_i^{m_i}, + + then ``primeomega(n)`` or `\Omega(n)` is: + + .. math :: + \Omega(n) = \sum_{i=1}^k m_i. + + Examples + ======== + + >>> from sympy.functions.combinatorial.numbers import primeomega + >>> primeomega(1) + 0 + >>> primeomega(20) + 3 + + See Also + ======== + + sympy.ntheory.factor_.factorint + + References + ========== + + .. [1] https://mathworld.wolfram.com/PrimeFactor.html + .. [2] https://oeis.org/A001222 + + """ + is_integer = True + is_nonnegative = True + + @classmethod + def eval(cls, n): + if n.is_integer is False: + raise TypeError("n should be an integer") + if n.is_positive is False: + raise ValueError("n should be a positive integer") + if n.is_prime is True: + return S.One + if n is S.One: + return S.Zero + if n.is_Integer is True: + return S(sum(factorint(n).values())) + + +class totient(DefinedFunction): + r""" + Calculate the Euler totient function phi(n) + + ``totient(n)`` or `\phi(n)` is the number of positive integers `\leq` n + that are relatively prime to n. + + Examples + ======== + + >>> from sympy.functions.combinatorial.numbers import totient + >>> totient(1) + 1 + >>> totient(25) + 20 + >>> totient(45) == totient(5)*totient(9) + True + + See Also + ======== + + sympy.ntheory.factor_.divisor_count + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Euler%27s_totient_function + .. [2] https://mathworld.wolfram.com/TotientFunction.html + .. [3] https://oeis.org/A000010 + + """ + is_integer = True + is_positive = True + + @classmethod + def eval(cls, n): + if n.is_integer is False: + raise TypeError("n should be an integer") + if n.is_positive is False: + raise ValueError("n should be a positive integer") + if n is S.One: + return S.One + if n.is_prime is True: + return n - 1 + if isinstance(n, Dict): + return S(prod(p**(k-1)*(p-1) for p, k in n.items())) + if n.is_Integer is True: + return S(prod(p**(k-1)*(p-1) for p, k in factorint(n).items())) + + +class reduced_totient(DefinedFunction): + r""" + Calculate the Carmichael reduced totient function lambda(n) + + ``reduced_totient(n)`` or `\lambda(n)` is the smallest m > 0 such that + `k^m \equiv 1 \mod n` for all k relatively prime to n. + + Examples + ======== + + >>> from sympy.functions.combinatorial.numbers import reduced_totient + >>> reduced_totient(1) + 1 + >>> reduced_totient(8) + 2 + >>> reduced_totient(30) + 4 + + See Also + ======== + + totient + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Carmichael_function + .. [2] https://mathworld.wolfram.com/CarmichaelFunction.html + .. [3] https://oeis.org/A002322 + + """ + is_integer = True + is_positive = True + + @classmethod + def eval(cls, n): + if n.is_integer is False: + raise TypeError("n should be an integer") + if n.is_positive is False: + raise ValueError("n should be a positive integer") + if n is S.One: + return S.One + if n.is_prime is True: + return n - 1 + if isinstance(n, Dict): + t = 1 + if 2 in n: + t = (1 << (n[2] - 2)) if 2 < n[2] else n[2] + return S(lcm(int(t), *(int(p-1)*int(p)**int(k-1) for p, k in n.items() if p != 2))) + if n.is_Integer is True: + n, t = remove(int(n), 2) + if not t: + t = 1 + elif 2 < t: + t = 1 << (t - 2) + return S(lcm(t, *((p-1)*p**(k-1) for p, k in factorint(n).items()))) + + +class primepi(DefinedFunction): + r""" Represents the prime counting function pi(n) = the number + of prime numbers less than or equal to n. 
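+
+ The count is zero for any real argument below 2 (the smallest prime); for
+ example ``primepi(1)`` and ``primepi(-10)`` both evaluate to 0.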
+ + Examples + ======== + + >>> from sympy.functions.combinatorial.numbers import primepi + >>> from sympy import prime, prevprime, isprime + >>> primepi(25) + 9 + + So there are 9 primes less than or equal to 25. Is 25 prime? + + >>> isprime(25) + False + + It is not. So the first prime less than 25 must be the + 9th prime: + + >>> prevprime(25) == prime(9) + True + + See Also + ======== + + sympy.ntheory.primetest.isprime : Test if n is prime + sympy.ntheory.generate.primerange : Generate all primes in a given range + sympy.ntheory.generate.prime : Return the nth prime + + References + ========== + + .. [1] https://oeis.org/A000720 + + """ + is_integer = True + is_nonnegative = True + + @classmethod + def eval(cls, n): + if n is S.Infinity: + return S.Infinity + if n is S.NegativeInfinity: + return S.Zero + if n.is_real is False: + raise TypeError("n should be a real") + if is_lt(n, S(2)) is True: + return S.Zero + try: + n = int(n) + except TypeError: + return + return S(_primepi(n)) + + +####################################################################### +### +### Functions for enumerating partitions, permutations and combinations +### +####################################################################### + + +class _MultisetHistogram(tuple): + __slots__ = () + + +_N = -1 +_ITEMS = -2 +_M = slice(None, _ITEMS) + + +def _multiset_histogram(n): + """Return tuple used in permutation and combination counting. Input + is a dictionary giving items with counts as values or a sequence of + items (which need not be sorted). + + The data is stored in a class deriving from tuple so it is easily + recognized and so it can be converted easily to a list. + """ + if isinstance(n, dict): # item: count + if not all(isinstance(v, int) and v >= 0 for v in n.values()): + raise ValueError + tot = sum(n.values()) + items = sum(1 for k in n if n[k] > 0) + return _MultisetHistogram([n[k] for k in n if n[k] > 0] + [items, tot]) + else: + n = list(n) + s = set(n) + lens = len(s) + lenn = len(n) + if lens == lenn: + n = [1]*lenn + [lenn, lenn] + return _MultisetHistogram(n) + m = dict(zip(s, range(lens))) + d = dict(zip(range(lens), (0,)*lens)) + for i in n: + d[m[i]] += 1 + return _multiset_histogram(d) + + +def nP(n, k=None, replacement=False): + """Return the number of permutations of ``n`` items taken ``k`` at a time. + + Possible values for ``n``: + + integer - set of length ``n`` + + sequence - converted to a multiset internally + + multiset - {element: multiplicity} + + If ``k`` is None then the total of all permutations of length 0 + through the number of items represented by ``n`` will be returned. + + If ``replacement`` is True then a given item can appear more than once + in the ``k`` items. (For example, for 'ab' permutations of 2 would + include 'aa', 'ab', 'ba' and 'bb'.) The multiplicity of elements in + ``n`` is ignored when ``replacement`` is True but the total number + of elements is considered since no element can appear more times than + the number of elements in ``n``. 
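+
+ As a quick illustration of that rule, ``'aab'`` has only two distinct items,
+ so ``nP('aab', 2, replacement=True)`` gives `2^2 = 4`, even though
+ ``nP('aab', 2)`` is only 3.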
+ + Examples + ======== + + >>> from sympy.functions.combinatorial.numbers import nP + >>> from sympy.utilities.iterables import multiset_permutations, multiset + >>> nP(3, 2) + 6 + >>> nP('abc', 2) == nP(multiset('abc'), 2) == 6 + True + >>> nP('aab', 2) + 3 + >>> nP([1, 2, 2], 2) + 3 + >>> [nP(3, i) for i in range(4)] + [1, 3, 6, 6] + >>> nP(3) == sum(_) + True + + When ``replacement`` is True, each item can have multiplicity + equal to the length represented by ``n``: + + >>> nP('aabc', replacement=True) + 121 + >>> [len(list(multiset_permutations('aaaabbbbcccc', i))) for i in range(5)] + [1, 3, 9, 27, 81] + >>> sum(_) + 121 + + See Also + ======== + sympy.utilities.iterables.multiset_permutations + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Permutation + + """ + try: + n = as_int(n) + except ValueError: + return Integer(_nP(_multiset_histogram(n), k, replacement)) + return Integer(_nP(n, k, replacement)) + + +@cacheit +def _nP(n, k=None, replacement=False): + + if k == 0: + return 1 + if isinstance(n, SYMPY_INTS): # n different items + # assert n >= 0 + if k is None: + return sum(_nP(n, i, replacement) for i in range(n + 1)) + elif replacement: + return n**k + elif k > n: + return 0 + elif k == n: + return factorial(k) + elif k == 1: + return n + else: + # assert k >= 0 + return _product(n - k + 1, n) + elif isinstance(n, _MultisetHistogram): + if k is None: + return sum(_nP(n, i, replacement) for i in range(n[_N] + 1)) + elif replacement: + return n[_ITEMS]**k + elif k == n[_N]: + return factorial(k)/prod([factorial(i) for i in n[_M] if i > 1]) + elif k > n[_N]: + return 0 + elif k == 1: + return n[_ITEMS] + else: + # assert k >= 0 + tot = 0 + n = list(n) + for i in range(len(n[_M])): + if not n[i]: + continue + n[_N] -= 1 + if n[i] == 1: + n[i] = 0 + n[_ITEMS] -= 1 + tot += _nP(_MultisetHistogram(n), k - 1) + n[_ITEMS] += 1 + n[i] = 1 + else: + n[i] -= 1 + tot += _nP(_MultisetHistogram(n), k - 1) + n[i] += 1 + n[_N] += 1 + return tot + + +@cacheit +def _AOP_product(n): + """for n = (m1, m2, .., mk) return the coefficients of the polynomial, + prod(sum(x**i for i in range(nj + 1)) for nj in n); i.e. the coefficients + of the product of AOPs (all-one polynomials) or order given in n. The + resulting coefficient corresponding to x**r is the number of r-length + combinations of sum(n) elements with multiplicities given in n. + The coefficients are given as a default dictionary (so if a query is made + for a key that is not present, 0 will be returned). + + Examples + ======== + + >>> from sympy.functions.combinatorial.numbers import _AOP_product + >>> from sympy.abc import x + >>> n = (2, 2, 3) # e.g. aabbccc + >>> prod = ((x**2 + x + 1)*(x**2 + x + 1)*(x**3 + x**2 + x + 1)).expand() + >>> c = _AOP_product(n); dict(c) + {0: 1, 1: 3, 2: 6, 3: 8, 4: 8, 5: 6, 6: 3, 7: 1} + >>> [c[i] for i in range(8)] == [prod.coeff(x, i) for i in range(8)] + True + + The generating poly used here is the same as that listed in + https://tinyurl.com/cep849r, but in a refactored form. 
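+
+ As a cross-check, the coefficient of ``x**3`` above agrees with the number of
+ size-3 combinations of the multiset 'aabbccc':
+
+ >>> from sympy.functions.combinatorial.numbers import nC
+ >>> nC('aabbccc', 3)
+ 8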
+ + """ + + n = list(n) + ord = sum(n) + need = (ord + 2)//2 + rv = [1]*(n.pop() + 1) + rv.extend((0,) * (need - len(rv))) + rv = rv[:need] + while n: + ni = n.pop() + N = ni + 1 + was = rv[:] + for i in range(1, min(N, len(rv))): + rv[i] += rv[i - 1] + for i in range(N, need): + rv[i] += rv[i - 1] - was[i - N] + rev = list(reversed(rv)) + if ord % 2: + rv = rv + rev + else: + rv[-1:] = rev + d = defaultdict(int) + for i, r in enumerate(rv): + d[i] = r + return d + + +def nC(n, k=None, replacement=False): + """Return the number of combinations of ``n`` items taken ``k`` at a time. + + Possible values for ``n``: + + integer - set of length ``n`` + + sequence - converted to a multiset internally + + multiset - {element: multiplicity} + + If ``k`` is None then the total of all combinations of length 0 + through the number of items represented in ``n`` will be returned. + + If ``replacement`` is True then a given item can appear more than once + in the ``k`` items. (For example, for 'ab' sets of 2 would include 'aa', + 'ab', and 'bb'.) The multiplicity of elements in ``n`` is ignored when + ``replacement`` is True but the total number of elements is considered + since no element can appear more times than the number of elements in + ``n``. + + Examples + ======== + + >>> from sympy.functions.combinatorial.numbers import nC + >>> from sympy.utilities.iterables import multiset_combinations + >>> nC(3, 2) + 3 + >>> nC('abc', 2) + 3 + >>> nC('aab', 2) + 2 + + When ``replacement`` is True, each item can have multiplicity + equal to the length represented by ``n``: + + >>> nC('aabc', replacement=True) + 35 + >>> [len(list(multiset_combinations('aaaabbbbcccc', i))) for i in range(5)] + [1, 3, 6, 10, 15] + >>> sum(_) + 35 + + If there are ``k`` items with multiplicities ``m_1, m_2, ..., m_k`` + then the total of all combinations of length 0 through ``k`` is the + product, ``(m_1 + 1)*(m_2 + 1)*...*(m_k + 1)``. When the multiplicity + of each item is 1 (i.e., k unique items) then there are 2**k + combinations. For example, if there are 4 unique items, the total number + of combinations is 16: + + >>> sum(nC(4, i) for i in range(5)) + 16 + + See Also + ======== + + sympy.utilities.iterables.multiset_combinations + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Combination + .. 
[2] https://tinyurl.com/cep849r + + """ + + if isinstance(n, SYMPY_INTS): + if k is None: + if not replacement: + return 2**n + return sum(nC(n, i, replacement) for i in range(n + 1)) + if k < 0: + raise ValueError("k cannot be negative") + if replacement: + return binomial(n + k - 1, k) + return binomial(n, k) + if isinstance(n, _MultisetHistogram): + N = n[_N] + if k is None: + if not replacement: + return prod(m + 1 for m in n[_M]) + return sum(nC(n, i, replacement) for i in range(N + 1)) + elif replacement: + return nC(n[_ITEMS], k, replacement) + # assert k >= 0 + elif k in (1, N - 1): + return n[_ITEMS] + elif k in (0, N): + return 1 + return _AOP_product(tuple(n[_M]))[k] + else: + return nC(_multiset_histogram(n), k, replacement) + + +def _eval_stirling1(n, k): + if n == k == 0: + return S.One + if 0 in (n, k): + return S.Zero + + # some special values + if n == k: + return S.One + elif k == n - 1: + return binomial(n, 2) + elif k == n - 2: + return (3*n - 1)*binomial(n, 3)/4 + elif k == n - 3: + return binomial(n, 2)*binomial(n, 4) + + return _stirling1(n, k) + + +@cacheit +def _stirling1(n, k): + row = [0, 1]+[0]*(k-1) # for n = 1 + for i in range(2, n+1): + for j in range(min(k,i), 0, -1): + row[j] = (i-1) * row[j] + row[j-1] + return Integer(row[k]) + + +def _eval_stirling2(n, k): + if n == k == 0: + return S.One + if 0 in (n, k): + return S.Zero + + # some special values + if n == k: + return S.One + elif k == n - 1: + return binomial(n, 2) + elif k == 1: + return S.One + elif k == 2: + return Integer(2**(n - 1) - 1) + + return _stirling2(n, k) + + +@cacheit +def _stirling2(n, k): + row = [0, 1]+[0]*(k-1) # for n = 1 + for i in range(2, n+1): + for j in range(min(k,i), 0, -1): + row[j] = j * row[j] + row[j-1] + return Integer(row[k]) + + +def stirling(n, k, d=None, kind=2, signed=False): + r"""Return Stirling number $S(n, k)$ of the first or second (default) kind. + + The sum of all Stirling numbers of the second kind for $k = 1$ + through $n$ is ``bell(n)``. The recurrence relationship for these numbers + is: + + .. math :: {0 \brace 0} = 1; {n \brace 0} = {0 \brace k} = 0; + + .. math :: {{n+1} \brace k} = j {n \brace k} + {n \brace {k-1}} + + where $j$ is: + $n$ for Stirling numbers of the first kind, + $-n$ for signed Stirling numbers of the first kind, + $k$ for Stirling numbers of the second kind. + + The first kind of Stirling number counts the number of permutations of + ``n`` distinct items that have ``k`` cycles; the second kind counts the + ways in which ``n`` distinct items can be partitioned into ``k`` parts. + If ``d`` is given, the "reduced Stirling number of the second kind" is + returned: $S^{d}(n, k) = S(n - d + 1, k - d + 1)$ with $n \ge k \ge d$. + (This counts the ways to partition $n$ consecutive integers into $k$ + groups with no pairwise difference less than $d$. See example below.) + + To obtain the signed Stirling numbers of the first kind, use keyword + ``signed=True``. Using this keyword automatically sets ``kind`` to 1. 
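+
+ As a small worked instance of the second-kind recurrence above,
+
+ .. math :: {4 \brace 2} = 2 {3 \brace 2} + {3 \brace 1} = 2 \cdot 3 + 1 = 7.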
+ + Examples + ======== + + >>> from sympy.functions.combinatorial.numbers import stirling, bell + >>> from sympy.combinatorics import Permutation + >>> from sympy.utilities.iterables import multiset_partitions, permutations + + First kind (unsigned by default): + + >>> [stirling(6, i, kind=1) for i in range(7)] + [0, 120, 274, 225, 85, 15, 1] + >>> perms = list(permutations(range(4))) + >>> [sum(Permutation(p).cycles == i for p in perms) for i in range(5)] + [0, 6, 11, 6, 1] + >>> [stirling(4, i, kind=1) for i in range(5)] + [0, 6, 11, 6, 1] + + First kind (signed): + + >>> [stirling(4, i, signed=True) for i in range(5)] + [0, -6, 11, -6, 1] + + Second kind: + + >>> [stirling(10, i) for i in range(12)] + [0, 1, 511, 9330, 34105, 42525, 22827, 5880, 750, 45, 1, 0] + >>> sum(_) == bell(10) + True + >>> len(list(multiset_partitions(range(4), 2))) == stirling(4, 2) + True + + Reduced second kind: + + >>> from sympy import subsets, oo + >>> def delta(p): + ... if len(p) == 1: + ... return oo + ... return min(abs(i[0] - i[1]) for i in subsets(p, 2)) + >>> parts = multiset_partitions(range(5), 3) + >>> d = 2 + >>> sum(1 for p in parts if all(delta(i) >= d for i in p)) + 7 + >>> stirling(5, 3, 2) + 7 + + See Also + ======== + sympy.utilities.iterables.multiset_partitions + + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Stirling_numbers_of_the_first_kind + .. [2] https://en.wikipedia.org/wiki/Stirling_numbers_of_the_second_kind + + """ + # TODO: make this a class like bell() + + n = as_int(n) + k = as_int(k) + if n < 0: + raise ValueError('n must be nonnegative') + if k > n: + return S.Zero + if d: + # assert k >= d + # kind is ignored -- only kind=2 is supported + return _eval_stirling2(n - d + 1, k - d + 1) + elif signed: + # kind is ignored -- only kind=1 is supported + return S.NegativeOne**(n - k)*_eval_stirling1(n, k) + + if kind == 1: + return _eval_stirling1(n, k) + elif kind == 2: + return _eval_stirling2(n, k) + else: + raise ValueError('kind must be 1 or 2, not %s' % k) + + +@cacheit +def _nT(n, k): + """Return the partitions of ``n`` items into ``k`` parts. This + is used by ``nT`` for the case when ``n`` is an integer.""" + # really quick exits + if k > n or k < 0: + return 0 + if k in (1, n): + return 1 + if k == 0: + return 0 + # exits that could be done below but this is quicker + if k == 2: + return n//2 + d = n - k + if d <= 3: + return d + # quick exit + if 3*k >= n: # or, equivalently, 2*k >= d + # all the information needed in this case + # will be in the cache needed to calculate + # partition(d), so... + # update cache + tot = _partition_rec(d) + # and correct for values not needed + if d - k > 0: + tot -= sum(_partition_rec.fetch_item(slice(d - k))) + return tot + # regular exit + # nT(n, k) = Sum(nT(n - k, m), (m, 1, k)); + # calculate needed nT(i, j) values + p = [1]*d + for i in range(2, k + 1): + for m in range(i + 1, d): + p[m] += p[m - i] + d -= 1 + # if p[0] were appended to the end of p then the last + # k values of p are the nT(n, j) values for 0 < j < k in reverse + # order p[-1] = nT(n, 1), p[-2] = nT(n, 2), etc.... Instead of + # putting the 1 from p[0] there, however, it is simply added to + # the sum below which is valid for 1 < k <= n//2 + return (1 + sum(p[1 - k:])) + + +def nT(n, k=None): + """Return the number of ``k``-sized partitions of ``n`` items. 
+ + Possible values for ``n``: + + integer - ``n`` identical items + + sequence - converted to a multiset internally + + multiset - {element: multiplicity} + + Note: the convention for ``nT`` is different than that of ``nC`` and + ``nP`` in that + here an integer indicates ``n`` *identical* items instead of a set of + length ``n``; this is in keeping with the ``partitions`` function which + treats its integer-``n`` input like a list of ``n`` 1s. One can use + ``range(n)`` for ``n`` to indicate ``n`` distinct items. + + If ``k`` is None then the total number of ways to partition the elements + represented in ``n`` will be returned. + + Examples + ======== + + >>> from sympy.functions.combinatorial.numbers import nT + + Partitions of the given multiset: + + >>> [nT('aabbc', i) for i in range(1, 7)] + [1, 8, 11, 5, 1, 0] + >>> nT('aabbc') == sum(_) + True + + >>> [nT("mississippi", i) for i in range(1, 12)] + [1, 74, 609, 1521, 1768, 1224, 579, 197, 50, 9, 1] + + Partitions when all items are identical: + + >>> [nT(5, i) for i in range(1, 6)] + [1, 2, 2, 1, 1] + >>> nT('1'*5) == sum(_) + True + + When all items are different: + + >>> [nT(range(5), i) for i in range(1, 6)] + [1, 15, 25, 10, 1] + >>> nT(range(5)) == sum(_) + True + + Partitions of an integer expressed as a sum of positive integers: + + >>> from sympy import partition + >>> partition(4) + 5 + >>> nT(4, 1) + nT(4, 2) + nT(4, 3) + nT(4, 4) + 5 + >>> nT('1'*4) + 5 + + See Also + ======== + sympy.utilities.iterables.partitions + sympy.utilities.iterables.multiset_partitions + sympy.functions.combinatorial.numbers.partition + + References + ========== + + .. [1] https://web.archive.org/web/20210507012732/https://teaching.csse.uwa.edu.au/units/CITS7209/partition.pdf + + """ + + if isinstance(n, SYMPY_INTS): + # n identical items + if k is None: + return partition(n) + if isinstance(k, SYMPY_INTS): + n = as_int(n) + k = as_int(k) + return Integer(_nT(n, k)) + if not isinstance(n, _MultisetHistogram): + try: + # if n contains hashable items there is some + # quick handling that can be done + u = len(set(n)) + if u <= 1: + return nT(len(n), k) + elif u == len(n): + n = range(u) + raise TypeError + except TypeError: + n = _multiset_histogram(n) + N = n[_N] + if k is None and N == 1: + return 1 + if k in (1, N): + return 1 + if k == 2 or N == 2 and k is None: + m, r = divmod(N, 2) + rv = sum(nC(n, i) for i in range(1, m + 1)) + if not r: + rv -= nC(n, m)//2 + if k is None: + rv += 1 # for k == 1 + return rv + if N == n[_ITEMS]: + # all distinct + if k is None: + return bell(N) + return stirling(N, k) + m = MultisetPartitionTraverser() + if k is None: + return m.count_partitions(n[_M]) + # MultisetPartitionTraverser does not have a range-limited count + # method, so need to enumerate and count + tot = 0 + for discard in m.enum_range(n[_M], k-1, k): + tot += 1 + return tot + + +#-----------------------------------------------------------------------------# +# # +# Motzkin numbers # +# # +#-----------------------------------------------------------------------------# + + +class motzkin(DefinedFunction): + """ + The nth Motzkin number is the number + of ways of drawing non-intersecting chords + between n points on a circle (not necessarily touching + every point by a chord). The Motzkin numbers are named + after Theodore Motzkin and have diverse applications + in geometry, combinatorics and number theory. 
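+
+ For example, four points on a circle admit `M_4 = 9` such drawings: the empty
+ drawing, the six single chords (four sides and two diagonals), and the two
+ ways of placing a pair of disjoint non-crossing chords.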
+ + Motzkin numbers are the integer sequence defined by the + initial terms `M_0 = 1`, `M_1 = 1` and the two-term recurrence relation + `M_n = \frac{2*n + 1}{n + 2} * M_{n-1} + \frac{3n - 3}{n + 2} * M_{n-2}`. + + + Examples + ======== + + >>> from sympy import motzkin + + >>> motzkin.is_motzkin(5) + False + >>> motzkin.find_motzkin_numbers_in_range(2,300) + [2, 4, 9, 21, 51, 127] + >>> motzkin.find_motzkin_numbers_in_range(2,900) + [2, 4, 9, 21, 51, 127, 323, 835] + >>> motzkin.find_first_n_motzkins(10) + [1, 1, 2, 4, 9, 21, 51, 127, 323, 835] + + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Motzkin_number + .. [2] https://mathworld.wolfram.com/MotzkinNumber.html + + """ + + @staticmethod + def is_motzkin(n): + try: + n = as_int(n) + except ValueError: + return False + if n > 0: + if n in (1, 2): + return True + + tn1 = 1 + tn = 2 + i = 3 + while tn < n: + a = ((2*i + 1)*tn + (3*i - 3)*tn1)/(i + 2) + i += 1 + tn1 = tn + tn = a + + if tn == n: + return True + else: + return False + + else: + return False + + @staticmethod + def find_motzkin_numbers_in_range(x, y): + if 0 <= x <= y: + motzkins = [] + if x <= 1 <= y: + motzkins.append(1) + tn1 = 1 + tn = 2 + i = 3 + while tn <= y: + if tn >= x: + motzkins.append(tn) + a = ((2*i + 1)*tn + (3*i - 3)*tn1)/(i + 2) + i += 1 + tn1 = tn + tn = int(a) + + return motzkins + + else: + raise ValueError('The provided range is not valid. This condition should satisfy x <= y') + + @staticmethod + def find_first_n_motzkins(n): + try: + n = as_int(n) + except ValueError: + raise ValueError('The provided number must be a positive integer') + if n < 0: + raise ValueError('The provided number must be a positive integer') + motzkins = [1] + if n >= 1: + motzkins.append(1) + tn1 = 1 + tn = 2 + i = 3 + while i <= n: + motzkins.append(tn) + a = ((2*i + 1)*tn + (3*i - 3)*tn1)/(i + 2) + i += 1 + tn1 = tn + tn = int(a) + + return motzkins + + @staticmethod + @recurrence_memo([S.One, S.One]) + def _motzkin(n, prev): + return ((2*n + 1)*prev[-1] + (3*n - 3)*prev[-2]) // (n + 2) + + @classmethod + def eval(cls, n): + try: + n = as_int(n) + except ValueError: + raise ValueError('The provided number must be a positive integer') + if n < 0: + raise ValueError('The provided number must be a positive integer') + return Integer(cls._motzkin(n - 1)) + + +def nD(i=None, brute=None, *, n=None, m=None): + """return the number of derangements for: ``n`` unique items, ``i`` + items (as a sequence or multiset), or multiplicities, ``m`` given + as a sequence or multiset. + + Examples + ======== + + >>> from sympy.utilities.iterables import generate_derangements as enum + >>> from sympy.functions.combinatorial.numbers import nD + + A derangement ``d`` of sequence ``s`` has all ``d[i] != s[i]``: + + >>> set([''.join(i) for i in enum('abc')]) + {'bca', 'cab'} + >>> nD('abc') + 2 + + Input as iterable or dictionary (multiset form) is accepted: + + >>> assert nD([1, 2, 2, 3, 3, 3]) == nD({1: 1, 2: 2, 3: 3}) + + By default, a brute-force enumeration and count of multiset permutations + is only done if there are fewer than 9 elements. There may be cases when + there is high multiplicity with few unique elements that will benefit + from a brute-force enumeration, too. For this reason, the `brute` + keyword (default None) is provided. When False, the brute-force + enumeration will never be used. When True, it will always be used. 
+ + >>> nD('1111222233', brute=True) + 44 + + For convenience, one may specify ``n`` distinct items using the + ``n`` keyword: + + >>> assert nD(n=3) == nD('abc') == 2 + + Since the number of derangments depends on the multiplicity of the + elements and not the elements themselves, it may be more convenient + to give a list or multiset of multiplicities using keyword ``m``: + + >>> assert nD('abc') == nD(m=(1,1,1)) == nD(m={1:3}) == 2 + + """ + from sympy.integrals.integrals import integrate + from sympy.functions.special.polynomials import laguerre + from sympy.abc import x + def ok(x): + if not isinstance(x, SYMPY_INTS): + raise TypeError('expecting integer values') + if x < 0: + raise ValueError('value must not be negative') + return True + + if (i, n, m).count(None) != 2: + raise ValueError('enter only 1 of i, n, or m') + if i is not None: + if isinstance(i, SYMPY_INTS): + raise TypeError('items must be a list or dictionary') + if not i: + return S.Zero + if type(i) is not dict: + s = list(i) + ms = multiset(s) + elif type(i) is dict: + all(ok(_) for _ in i.values()) + ms = {k: v for k, v in i.items() if v} + s = None + if not ms: + return S.Zero + N = sum(ms.values()) + counts = multiset(ms.values()) + nkey = len(ms) + elif n is not None: + ok(n) + if not n: + return S.Zero + return subfactorial(n) + elif m is not None: + if isinstance(m, dict): + all(ok(i) and ok(j) for i, j in m.items()) + counts = {k: v for k, v in m.items() if k*v} + elif iterable(m) or isinstance(m, str): + m = list(m) + all(ok(i) for i in m) + counts = multiset([i for i in m if i]) + else: + raise TypeError('expecting iterable') + if not counts: + return S.Zero + N = sum(k*v for k, v in counts.items()) + nkey = sum(counts.values()) + s = None + big = int(max(counts)) + if big == 1: # no repetition + return subfactorial(nkey) + nval = len(counts) + if big*2 > N: + return S.Zero + if big*2 == N: + if nkey == 2 and nval == 1: + return S.One # aaabbb + if nkey - 1 == big: # one element repeated + return factorial(big) # e.g. 
abc part of abcddd + if N < 9 and brute is None or brute: + # for all possibilities, this was found to be faster + if s is None: + s = [] + i = 0 + for m, v in counts.items(): + for j in range(v): + s.extend([i]*m) + i += 1 + return Integer(sum(1 for i in multiset_derangements(s))) + from sympy.functions.elementary.exponential import exp + return Integer(abs(integrate(exp(-x)*Mul(*[ + laguerre(i, x)**m for i, m in counts.items()]), (x, 0, oo)))) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/elementary/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/elementary/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..78034e72ef2ed722c3ae685a87cf4df618a982b0 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/elementary/__init__.py @@ -0,0 +1 @@ +# Stub __init__.py for sympy.functions.elementary diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/elementary/_trigonometric_special.py b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/elementary/_trigonometric_special.py new file mode 100644 index 0000000000000000000000000000000000000000..fdf8c9d06241b46e791afe76836ea33e6d4fb1c8 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/elementary/_trigonometric_special.py @@ -0,0 +1,261 @@ +r"""A module for special angle formulas for trigonometric functions + +TODO +==== + +This module should be developed in the future to contain direct square root +representation of + +.. math + F(\frac{n}{m} \pi) + +for every + +- $m \in \{ 3, 5, 17, 257, 65537 \}$ +- $n \in \mathbb{N}$, $0 \le n < m$ +- $F \in \{\sin, \cos, \tan, \csc, \sec, \cot\}$ + +Without multi-step rewrites +(e.g. $\tan \to \cos/\sin \to \cos/\sqrt \to \ sqrt$) +or using chebyshev identities +(e.g. $\cos \to \cos + \cos^2 + \cdots \to \sqrt{} + \sqrt{}^2 + \cdots $), +which are trivial to implement in sympy, +and had used to give overly complicated expressions. + +The reference can be found below, if anyone may need help implementing them. + +References +========== + +.. [*] Gottlieb, Christian. (1999). The Simple and straightforward construction + of the regular 257-gon. The Mathematical Intelligencer. 21. 31-37. + 10.1007/BF03024829. +.. [*] https://resources.wolframcloud.com/FunctionRepository/resources/Cos2PiOverFermatPrime +""" +from __future__ import annotations +from typing import Callable +from functools import reduce +from sympy.core.expr import Expr +from sympy.core.singleton import S +from sympy.core.intfunc import igcdex +from sympy.core.numbers import Integer +from sympy.functions.elementary.miscellaneous import sqrt +from sympy.core.cache import cacheit + + +def migcdex(*x: int) -> tuple[tuple[int, ...], int]: + r"""Compute extended gcd for multiple integers. + + Explanation + =========== + + Given the integers $x_1, \cdots, x_n$ and + an extended gcd for multiple arguments are defined as a solution + $(y_1, \cdots, y_n), g$ for the diophantine equation + $x_1 y_1 + \cdots + x_n y_n = g$ such that + $g = \gcd(x_1, \cdots, x_n)$. 
+ + Examples + ======== + + >>> from sympy.functions.elementary._trigonometric_special import migcdex + >>> migcdex() + ((), 0) + >>> migcdex(4) + ((1,), 4) + >>> migcdex(4, 6) + ((-1, 1), 2) + >>> migcdex(6, 10, 15) + ((1, 1, -1), 1) + """ + if not x: + return (), 0 + + if len(x) == 1: + return (1,), x[0] + + if len(x) == 2: + u, v, h = igcdex(x[0], x[1]) + return (u, v), h + + y, g = migcdex(*x[1:]) + u, v, h = igcdex(x[0], g) + return (u, *(v * i for i in y)), h + + +def ipartfrac(*denoms: int) -> tuple[int, ...]: + r"""Compute the partial fraction decomposition. + + Explanation + =========== + + Given a rational number $\frac{1}{q_1 \cdots q_n}$ where all + $q_1, \cdots, q_n$ are pairwise coprime, + + A partial fraction decomposition is defined as + + .. math:: + \frac{1}{q_1 \cdots q_n} = \frac{p_1}{q_1} + \cdots + \frac{p_n}{q_n} + + And it can be derived from solving the following diophantine equation for + the $p_1, \cdots, p_n$ + + .. math:: + 1 = p_1 \prod_{i \ne 1}q_i + \cdots + p_n \prod_{i \ne n}q_i + + Where $q_1, \cdots, q_n$ being pairwise coprime implies + $\gcd(\prod_{i \ne 1}q_i, \cdots, \prod_{i \ne n}q_i) = 1$, + which guarantees the existence of the solution. + + It is sufficient to compute partial fraction decomposition only + for numerator $1$ because partial fraction decomposition for any + $\frac{n}{q_1 \cdots q_n}$ can be easily computed by multiplying + the result by $n$ afterwards. + + Parameters + ========== + + denoms : int + The pairwise coprime integer denominators $q_i$ which defines the + rational number $\frac{1}{q_1 \cdots q_n}$ + + Returns + ======= + + tuple[int, ...] + The list of numerators which semantically corresponds to $p_i$ of the + partial fraction decomposition + $\frac{1}{q_1 \cdots q_n} = \frac{p_1}{q_1} + \cdots + \frac{p_n}{q_n}$ + + Examples + ======== + + >>> from sympy import Rational, Mul + >>> from sympy.functions.elementary._trigonometric_special import ipartfrac + + >>> denoms = 2, 3, 5 + >>> numers = ipartfrac(2, 3, 5) + >>> numers + (1, 7, -14) + + >>> Rational(1, Mul(*denoms)) + 1/30 + >>> out = 0 + >>> for n, d in zip(numers, denoms): + ... out += Rational(n, d) + >>> out + 1/30 + """ + if not denoms: + return () + + def mul(x: int, y: int) -> int: + return x * y + + denom = reduce(mul, denoms) + a = [denom // x for x in denoms] + h, _ = migcdex(*a) + return h + + +def fermat_coords(n: int) -> list[int] | None: + """If n can be factored in terms of Fermat primes with + multiplicity of each being 1, return those primes, else + None + """ + primes = [] + for p in [3, 5, 17, 257, 65537]: + quotient, remainder = divmod(n, p) + if remainder == 0: + n = quotient + primes.append(p) + if n == 1: + return primes + return None + + +@cacheit +def cos_3() -> Expr: + r"""Computes $\cos \frac{\pi}{3}$ in square roots""" + return S.Half + + +@cacheit +def cos_5() -> Expr: + r"""Computes $\cos \frac{\pi}{5}$ in square roots""" + return (sqrt(5) + 1) / 4 + + +@cacheit +def cos_17() -> Expr: + r"""Computes $\cos \frac{\pi}{17}$ in square roots""" + return sqrt( + (15 + sqrt(17)) / 32 + sqrt(2) * (sqrt(17 - sqrt(17)) + + sqrt(sqrt(2) * (-8 * sqrt(17 + sqrt(17)) - (1 - sqrt(17)) + * sqrt(17 - sqrt(17))) + 6 * sqrt(17) + 34)) / 32) + + +@cacheit +def cos_257() -> Expr: + r"""Computes $\cos \frac{\pi}{257}$ in square roots + + References + ========== + + .. [*] https://math.stackexchange.com/questions/516142/how-does-cos2-pi-257-look-like-in-real-radicals + .. 
[*] https://r-knott.surrey.ac.uk/Fibonacci/simpleTrig.html + """ + def f1(a: Expr, b: Expr) -> tuple[Expr, Expr]: + return (a + sqrt(a**2 + b)) / 2, (a - sqrt(a**2 + b)) / 2 + + def f2(a: Expr, b: Expr) -> Expr: + return (a - sqrt(a**2 + b))/2 + + t1, t2 = f1(S.NegativeOne, Integer(256)) + z1, z3 = f1(t1, Integer(64)) + z2, z4 = f1(t2, Integer(64)) + y1, y5 = f1(z1, 4*(5 + t1 + 2*z1)) + y6, y2 = f1(z2, 4*(5 + t2 + 2*z2)) + y3, y7 = f1(z3, 4*(5 + t1 + 2*z3)) + y8, y4 = f1(z4, 4*(5 + t2 + 2*z4)) + x1, x9 = f1(y1, -4*(t1 + y1 + y3 + 2*y6)) + x2, x10 = f1(y2, -4*(t2 + y2 + y4 + 2*y7)) + x3, x11 = f1(y3, -4*(t1 + y3 + y5 + 2*y8)) + x4, x12 = f1(y4, -4*(t2 + y4 + y6 + 2*y1)) + x5, x13 = f1(y5, -4*(t1 + y5 + y7 + 2*y2)) + x6, x14 = f1(y6, -4*(t2 + y6 + y8 + 2*y3)) + x15, x7 = f1(y7, -4*(t1 + y7 + y1 + 2*y4)) + x8, x16 = f1(y8, -4*(t2 + y8 + y2 + 2*y5)) + v1 = f2(x1, -4*(x1 + x2 + x3 + x6)) + v2 = f2(x2, -4*(x2 + x3 + x4 + x7)) + v3 = f2(x8, -4*(x8 + x9 + x10 + x13)) + v4 = f2(x9, -4*(x9 + x10 + x11 + x14)) + v5 = f2(x10, -4*(x10 + x11 + x12 + x15)) + v6 = f2(x16, -4*(x16 + x1 + x2 + x5)) + u1 = -f2(-v1, -4*(v2 + v3)) + u2 = -f2(-v4, -4*(v5 + v6)) + w1 = -2*f2(-u1, -4*u2) + return sqrt(sqrt(2)*sqrt(w1 + 4)/8 + S.Half) + + +def cos_table() -> dict[int, Callable[[], Expr]]: + r"""Lazily evaluated table for $\cos \frac{\pi}{n}$ in square roots for + $n \in \{3, 5, 17, 257, 65537\}$. + + Notes + ===== + + 65537 is the only other known Fermat prime and it is nearly impossible to + build in the current SymPy due to performance issues. + + References + ========== + + https://r-knott.surrey.ac.uk/Fibonacci/simpleTrig.html + """ + return { + 3: cos_3, + 5: cos_5, + 17: cos_17, + 257: cos_257 + } diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/elementary/complexes.py b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/elementary/complexes.py new file mode 100644 index 0000000000000000000000000000000000000000..dd837e4e242057050370f38c4b4e9c26aa5d06c9 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/elementary/complexes.py @@ -0,0 +1,1492 @@ +from __future__ import annotations + +from sympy.core import S, Add, Mul, sympify, Symbol, Dummy, Basic +from sympy.core.expr import Expr +from sympy.core.exprtools import factor_terms +from sympy.core.function import (DefinedFunction, Derivative, ArgumentIndexError, + AppliedUndef, expand_mul, PoleError) +from sympy.core.logic import fuzzy_not, fuzzy_or +from sympy.core.numbers import pi, I, oo +from sympy.core.power import Pow +from sympy.core.relational import Eq +from sympy.functions.elementary.miscellaneous import sqrt +from sympy.functions.elementary.piecewise import Piecewise + +############################################################################### +######################### REAL and IMAGINARY PARTS ############################ +############################################################################### + + +class re(DefinedFunction): + """ + Returns real part of expression. This function performs only + elementary analysis and so it will fail to decompose properly + more complicated expressions. If completely simplified result + is needed then use ``Basic.as_real_imag()`` or perform complex + expansion on instance of this function. 
+ + Examples + ======== + + >>> from sympy import re, im, I, E, symbols + >>> x, y = symbols('x y', real=True) + >>> re(2*E) + 2*E + >>> re(2*I + 17) + 17 + >>> re(2*I) + 0 + >>> re(im(x) + x*I + 2) + 2 + >>> re(5 + I + 2) + 7 + + Parameters + ========== + + arg : Expr + Real or complex expression. + + Returns + ======= + + expr : Expr + Real part of expression. + + See Also + ======== + + im + """ + + args: tuple[Expr] + + is_extended_real = True + unbranched = True # implicitly works on the projection to C + _singularities = True # non-holomorphic + + @classmethod + def eval(cls, arg): + if arg is S.NaN: + return S.NaN + elif arg is S.ComplexInfinity: + return S.NaN + elif arg.is_extended_real: + return arg + elif arg.is_imaginary or (I*arg).is_extended_real: + return S.Zero + elif arg.is_Matrix: + return arg.as_real_imag()[0] + elif arg.is_Function and isinstance(arg, conjugate): + return re(arg.args[0]) + else: + + included, reverted, excluded = [], [], [] + args = Add.make_args(arg) + for term in args: + coeff = term.as_coefficient(I) + + if coeff is not None: + if not coeff.is_extended_real: + reverted.append(coeff) + elif not term.has(I) and term.is_extended_real: + excluded.append(term) + else: + # Try to do some advanced expansion. If + # impossible, don't try to do re(arg) again + # (because this is what we are trying to do now). + real_imag = term.as_real_imag(ignore=arg) + if real_imag: + excluded.append(real_imag[0]) + else: + included.append(term) + + if len(args) != len(included): + a, b, c = (Add(*xs) for xs in [included, reverted, excluded]) + + return cls(a) - im(b) + c + + def as_real_imag(self, deep=True, **hints): + """ + Returns the real number with a zero imaginary part. + + """ + return (self, S.Zero) + + def _eval_derivative(self, x): + if x.is_extended_real or self.args[0].is_extended_real: + return re(Derivative(self.args[0], x, evaluate=True)) + if x.is_imaginary or self.args[0].is_imaginary: + return -I \ + * im(Derivative(self.args[0], x, evaluate=True)) + + def _eval_rewrite_as_im(self, arg, **kwargs): + return self.args[0] - I*im(self.args[0]) + + def _eval_is_algebraic(self): + return self.args[0].is_algebraic + + def _eval_is_zero(self): + # is_imaginary implies nonzero + return fuzzy_or([self.args[0].is_imaginary, self.args[0].is_zero]) + + def _eval_is_finite(self): + if self.args[0].is_finite: + return True + + def _eval_is_complex(self): + if self.args[0].is_finite: + return True + + +class im(DefinedFunction): + """ + Returns imaginary part of expression. This function performs only + elementary analysis and so it will fail to decompose properly more + complicated expressions. If completely simplified result is needed then + use ``Basic.as_real_imag()`` or perform complex expansion on instance of + this function. + + Examples + ======== + + >>> from sympy import re, im, E, I + >>> from sympy.abc import x, y + >>> im(2*E) + 0 + >>> im(2*I + 17) + 2 + >>> im(x*I) + re(x) + >>> im(re(x) + y) + im(y) + >>> im(2 + 3*I) + 3 + + Parameters + ========== + + arg : Expr + Real or complex expression. + + Returns + ======= + + expr : Expr + Imaginary part of expression. 
+ + See Also + ======== + + re + """ + + args: tuple[Expr] + + is_extended_real = True + unbranched = True # implicitly works on the projection to C + _singularities = True # non-holomorphic + + @classmethod + def eval(cls, arg): + if arg is S.NaN: + return S.NaN + elif arg is S.ComplexInfinity: + return S.NaN + elif arg.is_extended_real: + return S.Zero + elif arg.is_imaginary or (I*arg).is_extended_real: + return -I * arg + elif arg.is_Matrix: + return arg.as_real_imag()[1] + elif arg.is_Function and isinstance(arg, conjugate): + return -im(arg.args[0]) + else: + included, reverted, excluded = [], [], [] + args = Add.make_args(arg) + for term in args: + coeff = term.as_coefficient(I) + + if coeff is not None: + if not coeff.is_extended_real: + reverted.append(coeff) + else: + excluded.append(coeff) + elif term.has(I) or not term.is_extended_real: + # Try to do some advanced expansion. If + # impossible, don't try to do im(arg) again + # (because this is what we are trying to do now). + real_imag = term.as_real_imag(ignore=arg) + if real_imag: + excluded.append(real_imag[1]) + else: + included.append(term) + + if len(args) != len(included): + a, b, c = (Add(*xs) for xs in [included, reverted, excluded]) + + return cls(a) + re(b) + c + + def as_real_imag(self, deep=True, **hints): + """ + Return the imaginary part with a zero real part. + + """ + return (self, S.Zero) + + def _eval_derivative(self, x): + if x.is_extended_real or self.args[0].is_extended_real: + return im(Derivative(self.args[0], x, evaluate=True)) + if x.is_imaginary or self.args[0].is_imaginary: + return -I \ + * re(Derivative(self.args[0], x, evaluate=True)) + + def _eval_rewrite_as_re(self, arg, **kwargs): + return -I*(self.args[0] - re(self.args[0])) + + def _eval_is_algebraic(self): + return self.args[0].is_algebraic + + def _eval_is_zero(self): + return self.args[0].is_extended_real + + def _eval_is_finite(self): + if self.args[0].is_finite: + return True + + def _eval_is_complex(self): + if self.args[0].is_finite: + return True + +############################################################################### +############### SIGN, ABSOLUTE VALUE, ARGUMENT and CONJUGATION ################ +############################################################################### + +class sign(DefinedFunction): + """ + Returns the complex sign of an expression: + + Explanation + =========== + + If the expression is real the sign will be: + + * $1$ if expression is positive + * $0$ if expression is equal to zero + * $-1$ if expression is negative + + If the expression is imaginary the sign will be: + + * $I$ if im(expression) is positive + * $-I$ if im(expression) is negative + + Otherwise an unevaluated expression will be returned. When evaluated, the + result (in general) will be ``cos(arg(expr)) + I*sin(arg(expr))``. + + Examples + ======== + + >>> from sympy import sign, I + + >>> sign(-1) + -1 + >>> sign(0) + 0 + >>> sign(-3*I) + -I + >>> sign(1 + I) + sign(1 + I) + >>> _.evalf() + 0.707106781186548 + 0.707106781186548*I + + Parameters + ========== + + arg : Expr + Real or imaginary expression. + + Returns + ======= + + expr : Expr + Complex sign of expression. 
+ + See Also + ======== + + Abs, conjugate + """ + + is_complex = True + _singularities = True + + def doit(self, **hints): + s = super().doit() + if s == self and self.args[0].is_zero is False: + return self.args[0] / Abs(self.args[0]) + return s + + @classmethod + def eval(cls, arg): + # handle what we can + if arg.is_Mul: + c, args = arg.as_coeff_mul() + unk = [] + s = sign(c) + for a in args: + if a.is_extended_negative: + s = -s + elif a.is_extended_positive: + pass + else: + if a.is_imaginary: + ai = im(a) + if ai.is_comparable: # i.e. a = I*real + s *= I + if ai.is_extended_negative: + # can't use sign(ai) here since ai might not be + # a Number + s = -s + else: + unk.append(a) + else: + unk.append(a) + if c is S.One and len(unk) == len(args): + return None + return s * cls(arg._new_rawargs(*unk)) + if arg is S.NaN: + return S.NaN + if arg.is_zero: # it may be an Expr that is zero + return S.Zero + if arg.is_extended_positive: + return S.One + if arg.is_extended_negative: + return S.NegativeOne + if arg.is_Function: + if isinstance(arg, sign): + return arg + if arg.is_imaginary: + if arg.is_Pow and arg.exp is S.Half: + # we catch this because non-trivial sqrt args are not expanded + # e.g. sqrt(1-sqrt(2)) --x--> to I*sqrt(sqrt(2) - 1) + return I + arg2 = -I * arg + if arg2.is_extended_positive: + return I + if arg2.is_extended_negative: + return -I + + def _eval_Abs(self): + if fuzzy_not(self.args[0].is_zero): + return S.One + + def _eval_conjugate(self): + return sign(conjugate(self.args[0])) + + def _eval_derivative(self, x): + if self.args[0].is_extended_real: + from sympy.functions.special.delta_functions import DiracDelta + return 2 * Derivative(self.args[0], x, evaluate=True) \ + * DiracDelta(self.args[0]) + elif self.args[0].is_imaginary: + from sympy.functions.special.delta_functions import DiracDelta + return 2 * Derivative(self.args[0], x, evaluate=True) \ + * DiracDelta(-I * self.args[0]) + + def _eval_is_nonnegative(self): + if self.args[0].is_nonnegative: + return True + + def _eval_is_nonpositive(self): + if self.args[0].is_nonpositive: + return True + + def _eval_is_imaginary(self): + return self.args[0].is_imaginary + + def _eval_is_integer(self): + return self.args[0].is_extended_real + + def _eval_is_zero(self): + return self.args[0].is_zero + + def _eval_power(self, other): + if ( + fuzzy_not(self.args[0].is_zero) and + other.is_integer and + other.is_even + ): + return S.One + + def _eval_nseries(self, x, n, logx, cdir=0): + arg0 = self.args[0] + x0 = arg0.subs(x, 0) + if x0 != 0: + return self.func(x0) + if cdir != 0: + cdir = arg0.dir(x, cdir) + return -S.One if re(cdir) < 0 else S.One + + def _eval_rewrite_as_Piecewise(self, arg, **kwargs): + if arg.is_extended_real: + return Piecewise((1, arg > 0), (-1, arg < 0), (0, True)) + + def _eval_rewrite_as_Heaviside(self, arg, **kwargs): + from sympy.functions.special.delta_functions import Heaviside + if arg.is_extended_real: + return Heaviside(arg) * 2 - 1 + + def _eval_rewrite_as_Abs(self, arg, **kwargs): + return Piecewise((0, Eq(arg, 0)), (arg / Abs(arg), True)) + + def _eval_simplify(self, **kwargs): + return self.func(factor_terms(self.args[0])) # XXX include doit? + + +class Abs(DefinedFunction): + """ + Return the absolute value of the argument. + + Explanation + =========== + + This is an extension of the built-in function ``abs()`` to accept symbolic + values. If you pass a SymPy expression to the built-in ``abs()``, it will + pass it automatically to ``Abs()``. 
+ + Examples + ======== + + >>> from sympy import Abs, Symbol, S, I + >>> Abs(-1) + 1 + >>> x = Symbol('x', real=True) + >>> Abs(-x) + Abs(x) + >>> Abs(x**2) + x**2 + >>> abs(-x) # The Python built-in + Abs(x) + >>> Abs(3*x + 2*I) + sqrt(9*x**2 + 4) + >>> Abs(8*I) + 8 + + Note that the Python built-in will return either an Expr or int depending on + the argument:: + + >>> type(abs(-1)) + <... 'int'> + >>> type(abs(S.NegativeOne)) + + + Abs will always return a SymPy object. + + Parameters + ========== + + arg : Expr + Real or complex expression. + + Returns + ======= + + expr : Expr + Absolute value returned can be an expression or integer depending on + input arg. + + See Also + ======== + + sign, conjugate + """ + + args: tuple[Expr] + + is_extended_real = True + is_extended_negative = False + is_extended_nonnegative = True + unbranched = True + _singularities = True # non-holomorphic + + def fdiff(self, argindex=1): + """ + Get the first derivative of the argument to Abs(). + + """ + if argindex == 1: + return sign(self.args[0]) + else: + raise ArgumentIndexError(self, argindex) + + @classmethod + def eval(cls, arg): + from sympy.simplify.simplify import signsimp + + if hasattr(arg, '_eval_Abs'): + obj = arg._eval_Abs() + if obj is not None: + return obj + if not isinstance(arg, Expr): + raise TypeError("Bad argument type for Abs(): %s" % type(arg)) + + # handle what we can + arg = signsimp(arg, evaluate=False) + n, d = arg.as_numer_denom() + if d.free_symbols and not n.free_symbols: + return cls(n)/cls(d) + + if arg.is_Mul: + known = [] + unk = [] + for t in arg.args: + if t.is_Pow and t.exp.is_integer and t.exp.is_negative: + bnew = cls(t.base) + if isinstance(bnew, cls): + unk.append(t) + else: + known.append(Pow(bnew, t.exp)) + else: + tnew = cls(t) + if isinstance(tnew, cls): + unk.append(t) + else: + known.append(tnew) + known = Mul(*known) + unk = cls(Mul(*unk), evaluate=False) if unk else S.One + return known*unk + if arg is S.NaN: + return S.NaN + if arg is S.ComplexInfinity: + return oo + from sympy.functions.elementary.exponential import exp, log + + if arg.is_Pow: + base, exponent = arg.as_base_exp() + if base.is_extended_real: + if exponent.is_integer: + if exponent.is_even: + return arg + if base is S.NegativeOne: + return S.One + return Abs(base)**exponent + if base.is_extended_nonnegative: + return base**re(exponent) + if base.is_extended_negative: + return (-base)**re(exponent)*exp(-pi*im(exponent)) + return + elif not base.has(Symbol): # complex base + # express base**exponent as exp(exponent*log(base)) + a, b = log(base).as_real_imag() + z = a + I*b + return exp(re(exponent*z)) + if isinstance(arg, exp): + return exp(re(arg.args[0])) + if isinstance(arg, AppliedUndef): + if arg.is_positive: + return arg + elif arg.is_negative: + return -arg + return + if arg.is_Add and arg.has(oo, S.NegativeInfinity): + if any(a.is_infinite for a in arg.as_real_imag()): + return oo + if arg.is_zero: + return S.Zero + if arg.is_extended_nonnegative: + return arg + if arg.is_extended_nonpositive: + return -arg + if arg.is_imaginary: + arg2 = -I * arg + if arg2.is_extended_nonnegative: + return arg2 + if arg.is_extended_real: + return + # reject result if all new conjugates are just wrappers around + # an expression that was already in the arg + conj = signsimp(arg.conjugate(), evaluate=False) + new_conj = conj.atoms(conjugate) - arg.atoms(conjugate) + if new_conj and all(arg.has(i.args[0]) for i in new_conj): + return + if arg != conj and arg != -conj: + ignore = arg.atoms(Abs) + 
abs_free_arg = arg.xreplace({i: Dummy(real=True) for i in ignore}) + unk = [a for a in abs_free_arg.free_symbols if a.is_extended_real is None] + if not unk or not all(conj.has(conjugate(u)) for u in unk): + return sqrt(expand_mul(arg*conj)) + + def _eval_is_real(self): + if self.args[0].is_finite: + return True + + def _eval_is_integer(self): + if self.args[0].is_extended_real: + return self.args[0].is_integer + + def _eval_is_extended_nonzero(self): + return fuzzy_not(self._args[0].is_zero) + + def _eval_is_zero(self): + return self._args[0].is_zero + + def _eval_is_extended_positive(self): + return fuzzy_not(self._args[0].is_zero) + + def _eval_is_rational(self): + if self.args[0].is_extended_real: + return self.args[0].is_rational + + def _eval_is_even(self): + if self.args[0].is_extended_real: + return self.args[0].is_even + + def _eval_is_odd(self): + if self.args[0].is_extended_real: + return self.args[0].is_odd + + def _eval_is_algebraic(self): + return self.args[0].is_algebraic + + def _eval_power(self, exponent): + if self.args[0].is_extended_real and exponent.is_integer: + if exponent.is_even: + return self.args[0]**exponent + elif exponent is not S.NegativeOne and exponent.is_Integer: + return self.args[0]**(exponent - 1)*self + return + + def _eval_nseries(self, x, n, logx, cdir=0): + from sympy.functions.elementary.exponential import log + direction = self.args[0].leadterm(x)[0] + if direction.has(log(x)): + direction = direction.subs(log(x), logx) + s = self.args[0]._eval_nseries(x, n=n, logx=logx) + return (sign(direction)*s).expand() + + def _eval_derivative(self, x): + if self.args[0].is_extended_real or self.args[0].is_imaginary: + return Derivative(self.args[0], x, evaluate=True) \ + * sign(conjugate(self.args[0])) + rv = (re(self.args[0]) * Derivative(re(self.args[0]), x, + evaluate=True) + im(self.args[0]) * Derivative(im(self.args[0]), + x, evaluate=True)) / Abs(self.args[0]) + return rv.rewrite(sign) + + def _eval_rewrite_as_Heaviside(self, arg, **kwargs): + # Note this only holds for real arg (since Heaviside is not defined + # for complex arguments). + from sympy.functions.special.delta_functions import Heaviside + if arg.is_extended_real: + return arg*(Heaviside(arg) - Heaviside(-arg)) + + def _eval_rewrite_as_Piecewise(self, arg, **kwargs): + if arg.is_extended_real: + return Piecewise((arg, arg >= 0), (-arg, True)) + elif arg.is_imaginary: + return Piecewise((I*arg, I*arg >= 0), (-I*arg, True)) + + def _eval_rewrite_as_sign(self, arg, **kwargs): + return arg/sign(arg) + + def _eval_rewrite_as_conjugate(self, arg, **kwargs): + return sqrt(arg*conjugate(arg)) + + +class arg(DefinedFunction): + r""" + Returns the argument (in radians) of a complex number. The argument is + evaluated in consistent convention with ``atan2`` where the branch-cut is + taken along the negative real axis and ``arg(z)`` is in the interval + $(-\pi,\pi]$. For a positive number, the argument is always 0; the + argument of a negative number is $\pi$; and the argument of 0 + is undefined and returns ``nan``. So the ``arg`` function will never nest + greater than 3 levels since at the 4th application, the result must be + nan; for a real number, nan is returned on the 3rd application. 
+ + Examples + ======== + + >>> from sympy import arg, I, sqrt, Dummy + >>> from sympy.abc import x + >>> arg(2.0) + 0 + >>> arg(I) + pi/2 + >>> arg(sqrt(2) + I*sqrt(2)) + pi/4 + >>> arg(sqrt(3)/2 + I/2) + pi/6 + >>> arg(4 + 3*I) + atan(3/4) + >>> arg(0.8 + 0.6*I) + 0.643501108793284 + >>> arg(arg(arg(arg(x)))) + nan + >>> real = Dummy(real=True) + >>> arg(arg(arg(real))) + nan + + Parameters + ========== + + arg : Expr + Real or complex expression. + + Returns + ======= + + value : Expr + Returns arc tangent of arg measured in radians. + + """ + + is_extended_real = True + is_real = True + is_finite = True + _singularities = True # non-holomorphic + + @classmethod + def eval(cls, arg): + a = arg + for i in range(3): + if isinstance(a, cls): + a = a.args[0] + else: + if i == 2 and a.is_extended_real: + return S.NaN + break + else: + return S.NaN + from sympy.functions.elementary.exponential import exp, exp_polar + if isinstance(arg, exp_polar): + return periodic_argument(arg, oo) + elif isinstance(arg, exp): + i_ = im(arg.args[0]) + if i_.is_comparable: + i_ %= 2*S.Pi + if i_ > S.Pi: + i_ -= 2*S.Pi + return i_ + + if not arg.is_Atom: + c, arg_ = factor_terms(arg).as_coeff_Mul() + if arg_.is_Mul: + arg_ = Mul(*[a if (sign(a) not in (-1, 1)) else + sign(a) for a in arg_.args]) + arg_ = sign(c)*arg_ + else: + arg_ = arg + if any(i.is_extended_positive is None for i in arg_.atoms(AppliedUndef)): + return + from sympy.functions.elementary.trigonometric import atan2 + x, y = arg_.as_real_imag() + rv = atan2(y, x) + if rv.is_number: + return rv + if arg_ != arg: + return cls(arg_, evaluate=False) + + def _eval_derivative(self, t): + x, y = self.args[0].as_real_imag() + return (x * Derivative(y, t, evaluate=True) - y * + Derivative(x, t, evaluate=True)) / (x**2 + y**2) + + def _eval_rewrite_as_atan2(self, arg, **kwargs): + from sympy.functions.elementary.trigonometric import atan2 + x, y = self.args[0].as_real_imag() + return atan2(y, x) + + def _eval_as_leading_term(self, x, logx, cdir): + arg0 = self.args[0] + t = Dummy('t', positive=True) + if cdir == 0: + cdir = 1 + z = arg0.subs(x, cdir*t) + if z.is_positive: + return S.Zero + elif z.is_negative: + return S.Pi + else: + raise PoleError("Cannot expand %s around 0" % (self)) + + def _eval_nseries(self, x, n, logx, cdir=0): + from sympy.series.order import Order + if n <= 0: + return Order(1) + return self._eval_as_leading_term(x, logx=logx, cdir=cdir) + + +class conjugate(DefinedFunction): + """ + Returns the *complex conjugate* [1]_ of an argument. + In mathematics, the complex conjugate of a complex number + is given by changing the sign of the imaginary part. + + Thus, the conjugate of the complex number + :math:`a + ib` (where $a$ and $b$ are real numbers) is :math:`a - ib` + + Examples + ======== + + >>> from sympy import conjugate, I + >>> conjugate(2) + 2 + >>> conjugate(I) + -I + >>> conjugate(3 + 2*I) + 3 - 2*I + >>> conjugate(5 - I) + 5 + I + + Parameters + ========== + + arg : Expr + Real or complex expression. + + Returns + ======= + + arg : Expr + Complex conjugate of arg as real, imaginary or mixed expression. + + See Also + ======== + + sign, Abs + + References + ========== + + .. 
[1] https://en.wikipedia.org/wiki/Complex_conjugation + """ + _singularities = True # non-holomorphic + + @classmethod + def eval(cls, arg): + obj = arg._eval_conjugate() + if obj is not None: + return obj + + def inverse(self): + return conjugate + + def _eval_Abs(self): + return Abs(self.args[0], evaluate=True) + + def _eval_adjoint(self): + return transpose(self.args[0]) + + def _eval_conjugate(self): + return self.args[0] + + def _eval_derivative(self, x): + if x.is_real: + return conjugate(Derivative(self.args[0], x, evaluate=True)) + elif x.is_imaginary: + return -conjugate(Derivative(self.args[0], x, evaluate=True)) + + def _eval_transpose(self): + return adjoint(self.args[0]) + + def _eval_is_algebraic(self): + return self.args[0].is_algebraic + + +class transpose(DefinedFunction): + """ + Linear map transposition. + + Examples + ======== + + >>> from sympy import transpose, Matrix, MatrixSymbol + >>> A = MatrixSymbol('A', 25, 9) + >>> transpose(A) + A.T + >>> B = MatrixSymbol('B', 9, 22) + >>> transpose(B) + B.T + >>> transpose(A*B) + B.T*A.T + >>> M = Matrix([[4, 5], [2, 1], [90, 12]]) + >>> M + Matrix([ + [ 4, 5], + [ 2, 1], + [90, 12]]) + >>> transpose(M) + Matrix([ + [4, 2, 90], + [5, 1, 12]]) + + Parameters + ========== + + arg : Matrix + Matrix or matrix expression to take the transpose of. + + Returns + ======= + + value : Matrix + Transpose of arg. + + """ + + @classmethod + def eval(cls, arg): + obj = arg._eval_transpose() + if obj is not None: + return obj + + def _eval_adjoint(self): + return conjugate(self.args[0]) + + def _eval_conjugate(self): + return adjoint(self.args[0]) + + def _eval_transpose(self): + return self.args[0] + + +class adjoint(DefinedFunction): + """ + Conjugate transpose or Hermite conjugation. + + Examples + ======== + + >>> from sympy import adjoint, MatrixSymbol + >>> A = MatrixSymbol('A', 10, 5) + >>> adjoint(A) + Adjoint(A) + + Parameters + ========== + + arg : Matrix + Matrix or matrix expression to take the adjoint of. + + Returns + ======= + + value : Matrix + Represents the conjugate transpose or Hermite + conjugation of arg. + + """ + + @classmethod + def eval(cls, arg): + obj = arg._eval_adjoint() + if obj is not None: + return obj + obj = arg._eval_transpose() + if obj is not None: + return conjugate(obj) + + def _eval_adjoint(self): + return self.args[0] + + def _eval_conjugate(self): + return transpose(self.args[0]) + + def _eval_transpose(self): + return conjugate(self.args[0]) + + def _latex(self, printer, exp=None, *args): + arg = printer._print(self.args[0]) + tex = r'%s^{\dagger}' % arg + if exp: + tex = r'\left(%s\right)^{%s}' % (tex, exp) + return tex + + def _pretty(self, printer, *args): + from sympy.printing.pretty.stringpict import prettyForm + pform = printer._print(self.args[0], *args) + if printer._use_unicode: + pform = pform**prettyForm('\N{DAGGER}') + else: + pform = pform**prettyForm('+') + return pform + +############################################################################### +############### HANDLING OF POLAR NUMBERS ##################################### +############################################################################### + + +class polar_lift(DefinedFunction): + """ + Lift argument to the Riemann surface of the logarithm, using the + standard branch. 
+ + Examples + ======== + + >>> from sympy import Symbol, polar_lift, I + >>> p = Symbol('p', polar=True) + >>> x = Symbol('x') + >>> polar_lift(4) + 4*exp_polar(0) + >>> polar_lift(-4) + 4*exp_polar(I*pi) + >>> polar_lift(-I) + exp_polar(-I*pi/2) + >>> polar_lift(I + 2) + polar_lift(2 + I) + + >>> polar_lift(4*x) + 4*polar_lift(x) + >>> polar_lift(4*p) + 4*p + + Parameters + ========== + + arg : Expr + Real or complex expression. + + See Also + ======== + + sympy.functions.elementary.exponential.exp_polar + periodic_argument + """ + + is_polar = True + is_comparable = False # Cannot be evalf'd. + + @classmethod + def eval(cls, arg): + from sympy.functions.elementary.complexes import arg as argument + if arg.is_number: + ar = argument(arg) + # In general we want to affirm that something is known, + # e.g. `not ar.has(argument) and not ar.has(atan)` + # but for now we will just be more restrictive and + # see that it has evaluated to one of the known values. + if ar in (0, pi/2, -pi/2, pi): + from sympy.functions.elementary.exponential import exp_polar + return exp_polar(I*ar)*abs(arg) + + if arg.is_Mul: + args = arg.args + else: + args = [arg] + included = [] + excluded = [] + positive = [] + for arg in args: + if arg.is_polar: + included += [arg] + elif arg.is_positive: + positive += [arg] + else: + excluded += [arg] + if len(excluded) < len(args): + if excluded: + return Mul(*(included + positive))*polar_lift(Mul(*excluded)) + elif included: + return Mul(*(included + positive)) + else: + from sympy.functions.elementary.exponential import exp_polar + return Mul(*positive)*exp_polar(0) + + def _eval_evalf(self, prec): + """ Careful! any evalf of polar numbers is flaky """ + return self.args[0]._eval_evalf(prec) + + def _eval_Abs(self): + return Abs(self.args[0], evaluate=True) + + +class periodic_argument(DefinedFunction): + r""" + Represent the argument on a quotient of the Riemann surface of the + logarithm. That is, given a period $P$, always return a value in + $(-P/2, P/2]$, by using $\exp(PI) = 1$. + + Examples + ======== + + >>> from sympy import exp_polar, periodic_argument + >>> from sympy import I, pi + >>> periodic_argument(exp_polar(10*I*pi), 2*pi) + 0 + >>> periodic_argument(exp_polar(5*I*pi), 4*pi) + pi + >>> from sympy import exp_polar, periodic_argument + >>> from sympy import I, pi + >>> periodic_argument(exp_polar(5*I*pi), 2*pi) + pi + >>> periodic_argument(exp_polar(5*I*pi), 3*pi) + -pi + >>> periodic_argument(exp_polar(5*I*pi), pi) + 0 + + Parameters + ========== + + ar : Expr + A polar number. + + period : Expr + The period $P$. + + See Also + ======== + + sympy.functions.elementary.exponential.exp_polar + polar_lift : Lift argument to the Riemann surface of the logarithm + principal_branch + """ + + @classmethod + def _getunbranched(cls, ar): + from sympy.functions.elementary.exponential import exp_polar, log + if ar.is_Mul: + args = ar.args + else: + args = [ar] + unbranched = 0 + for a in args: + if not a.is_polar: + unbranched += arg(a) + elif isinstance(a, exp_polar): + unbranched += a.exp.as_real_imag()[1] + elif a.is_Pow: + re, im = a.exp.as_real_imag() + unbranched += re*unbranched_argument( + a.base) + im*log(abs(a.base)) + elif isinstance(a, polar_lift): + unbranched += arg(a.args[0]) + else: + return None + return unbranched + + @classmethod + def eval(cls, ar, period): + # Our strategy is to evaluate the argument on the Riemann surface of the + # logarithm, and then reduce. 
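+        # For instance, in the doctest above, periodic_argument(exp_polar(5*I*pi), 2*pi)
+        # first gives the unbranched argument 5*pi, and the ceiling reduction below
+        # subtracts ceiling(5*pi/(2*pi) - 1/2)*2*pi = 4*pi, leaving pi.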
+ # NOTE evidently this means it is a rather bad idea to use this with + # period != 2*pi and non-polar numbers. + if not period.is_extended_positive: + return None + if period == oo and isinstance(ar, principal_branch): + return periodic_argument(*ar.args) + if isinstance(ar, polar_lift) and period >= 2*pi: + return periodic_argument(ar.args[0], period) + if ar.is_Mul: + newargs = [x for x in ar.args if not x.is_positive] + if len(newargs) != len(ar.args): + return periodic_argument(Mul(*newargs), period) + unbranched = cls._getunbranched(ar) + if unbranched is None: + return None + from sympy.functions.elementary.trigonometric import atan, atan2 + if unbranched.has(periodic_argument, atan2, atan): + return None + if period == oo: + return unbranched + if period != oo: + from sympy.functions.elementary.integers import ceiling + n = ceiling(unbranched/period - S.Half)*period + if not n.has(ceiling): + return unbranched - n + + def _eval_evalf(self, prec): + z, period = self.args + if period == oo: + unbranched = periodic_argument._getunbranched(z) + if unbranched is None: + return self + return unbranched._eval_evalf(prec) + ub = periodic_argument(z, oo)._eval_evalf(prec) + from sympy.functions.elementary.integers import ceiling + return (ub - ceiling(ub/period - S.Half)*period)._eval_evalf(prec) + + +def unbranched_argument(arg): + ''' + Returns periodic argument of arg with period as infinity. + + Examples + ======== + + >>> from sympy import exp_polar, unbranched_argument + >>> from sympy import I, pi + >>> unbranched_argument(exp_polar(15*I*pi)) + 15*pi + >>> unbranched_argument(exp_polar(7*I*pi)) + 7*pi + + See also + ======== + + periodic_argument + ''' + return periodic_argument(arg, oo) + + +class principal_branch(DefinedFunction): + """ + Represent a polar number reduced to its principal branch on a quotient + of the Riemann surface of the logarithm. + + Explanation + =========== + + This is a function of two arguments. The first argument is a polar + number `z`, and the second one a positive real number or infinity, `p`. + The result is ``z mod exp_polar(I*p)``. + + Examples + ======== + + >>> from sympy import exp_polar, principal_branch, oo, I, pi + >>> from sympy.abc import z + >>> principal_branch(z, oo) + z + >>> principal_branch(exp_polar(2*pi*I)*3, 2*pi) + 3*exp_polar(0) + >>> principal_branch(exp_polar(2*pi*I)*3*z, 2*pi) + 3*principal_branch(z, 2*pi) + + Parameters + ========== + + x : Expr + A polar number. + + period : Expr + Positive real number or infinity. 
+ + See Also + ======== + + sympy.functions.elementary.exponential.exp_polar + polar_lift : Lift argument to the Riemann surface of the logarithm + periodic_argument + """ + + is_polar = True + is_comparable = False # cannot always be evalf'd + + @classmethod + def eval(self, x, period): + from sympy.functions.elementary.exponential import exp_polar + if isinstance(x, polar_lift): + return principal_branch(x.args[0], period) + if period == oo: + return x + ub = periodic_argument(x, oo) + barg = periodic_argument(x, period) + if ub != barg and not ub.has(periodic_argument) \ + and not barg.has(periodic_argument): + pl = polar_lift(x) + + def mr(expr): + if not isinstance(expr, Symbol): + return polar_lift(expr) + return expr + pl = pl.replace(polar_lift, mr) + # Recompute unbranched argument + ub = periodic_argument(pl, oo) + if not pl.has(polar_lift): + if ub != barg: + res = exp_polar(I*(barg - ub))*pl + else: + res = pl + if not res.is_polar and not res.has(exp_polar): + res *= exp_polar(0) + return res + + if not x.free_symbols: + c, m = x, () + else: + c, m = x.as_coeff_mul(*x.free_symbols) + others = [] + for y in m: + if y.is_positive: + c *= y + else: + others += [y] + m = tuple(others) + arg = periodic_argument(c, period) + if arg.has(periodic_argument): + return None + if arg.is_number and (unbranched_argument(c) != arg or + (arg == 0 and m != () and c != 1)): + if arg == 0: + return abs(c)*principal_branch(Mul(*m), period) + return principal_branch(exp_polar(I*arg)*Mul(*m), period)*abs(c) + if arg.is_number and ((abs(arg) < period/2) == True or arg == period/2) \ + and m == (): + return exp_polar(arg*I)*abs(c) + + def _eval_evalf(self, prec): + z, period = self.args + p = periodic_argument(z, period)._eval_evalf(prec) + if abs(p) > pi or p == -pi: + return self # Cannot evalf for this argument. + from sympy.functions.elementary.exponential import exp + return (abs(z)*exp(I*p))._eval_evalf(prec) + + +def _polarify(eq, lift, pause=False): + from sympy.integrals.integrals import Integral + if eq.is_polar: + return eq + if eq.is_number and not pause: + return polar_lift(eq) + if isinstance(eq, Symbol) and not pause and lift: + return polar_lift(eq) + elif eq.is_Atom: + return eq + elif eq.is_Add: + r = eq.func(*[_polarify(arg, lift, pause=True) for arg in eq.args]) + if lift: + return polar_lift(r) + return r + elif eq.is_Pow and eq.base == S.Exp1: + return eq.func(S.Exp1, _polarify(eq.exp, lift, pause=False)) + elif eq.is_Function: + return eq.func(*[_polarify(arg, lift, pause=False) for arg in eq.args]) + elif isinstance(eq, Integral): + # Don't lift the integration variable + func = _polarify(eq.function, lift, pause=pause) + limits = [] + for limit in eq.args[1:]: + var = _polarify(limit[0], lift=False, pause=pause) + rest = _polarify(limit[1:], lift=lift, pause=pause) + limits.append((var,) + rest) + return Integral(*((func,) + tuple(limits))) + else: + return eq.func(*[_polarify(arg, lift, pause=pause) + if isinstance(arg, Expr) else arg for arg in eq.args]) + + +def polarify(eq, subs=True, lift=False): + """ + Turn all numbers in eq into their polar equivalents (under the standard + choice of argument). + + Note that no attempt is made to guess a formal convention of adding + polar numbers, expressions like $1 + x$ will generally not be altered. + + Note also that this function does not promote ``exp(x)`` to ``exp_polar(x)``. 
+ + If ``subs`` is ``True``, all symbols which are not already polar will be + substituted for polar dummies; in this case the function behaves much + like :func:`~.posify`. + + If ``lift`` is ``True``, both addition statements and non-polar symbols are + changed to their ``polar_lift()``ed versions. + Note that ``lift=True`` implies ``subs=False``. + + Examples + ======== + + >>> from sympy import polarify, sin, I + >>> from sympy.abc import x, y + >>> expr = (-x)**y + >>> expr.expand() + (-x)**y + >>> polarify(expr) + ((_x*exp_polar(I*pi))**_y, {_x: x, _y: y}) + >>> polarify(expr)[0].expand() + _x**_y*exp_polar(_y*I*pi) + >>> polarify(x, lift=True) + polar_lift(x) + >>> polarify(x*(1+y), lift=True) + polar_lift(x)*polar_lift(y + 1) + + Adds are treated carefully: + + >>> polarify(1 + sin((1 + I)*x)) + (sin(_x*polar_lift(1 + I)) + 1, {_x: x}) + """ + if lift: + subs = False + eq = _polarify(sympify(eq), lift) + if not subs: + return eq + reps = {s: Dummy(s.name, polar=True) for s in eq.free_symbols} + eq = eq.subs(reps) + return eq, {r: s for s, r in reps.items()} + + +def _unpolarify(eq, exponents_only, pause=False): + if not isinstance(eq, Basic) or eq.is_Atom: + return eq + + if not pause: + from sympy.functions.elementary.exponential import exp, exp_polar + if isinstance(eq, exp_polar): + return exp(_unpolarify(eq.exp, exponents_only)) + if isinstance(eq, principal_branch) and eq.args[1] == 2*pi: + return _unpolarify(eq.args[0], exponents_only) + if ( + eq.is_Add or eq.is_Mul or eq.is_Boolean or + eq.is_Relational and ( + eq.rel_op in ('==', '!=') and 0 in eq.args or + eq.rel_op not in ('==', '!=')) + ): + return eq.func(*[_unpolarify(x, exponents_only) for x in eq.args]) + if isinstance(eq, polar_lift): + return _unpolarify(eq.args[0], exponents_only) + + if eq.is_Pow: + expo = _unpolarify(eq.exp, exponents_only) + base = _unpolarify(eq.base, exponents_only, + not (expo.is_integer and not pause)) + return base**expo + + if eq.is_Function and getattr(eq.func, 'unbranched', False): + return eq.func(*[_unpolarify(x, exponents_only, exponents_only) + for x in eq.args]) + + return eq.func(*[_unpolarify(x, exponents_only, True) for x in eq.args]) + + +def unpolarify(eq, subs=None, exponents_only=False): + """ + If `p` denotes the projection from the Riemann surface of the logarithm to + the complex line, return a simplified version `eq'` of `eq` such that + `p(eq') = p(eq)`. + Also apply the substitution subs in the end. (This is a convenience, since + ``unpolarify``, in a certain sense, undoes :func:`polarify`.) + + Examples + ======== + + >>> from sympy import unpolarify, polar_lift, sin, I + >>> unpolarify(polar_lift(I + 2)) + 2 + I + >>> unpolarify(sin(polar_lift(I + 7))) + sin(7 + I) + """ + if isinstance(eq, bool): + return eq + + eq = sympify(eq) + if subs is not None: + return unpolarify(eq.subs(subs)) + changed = True + pause = False + if exponents_only: + pause = True + while changed: + changed = False + res = _unpolarify(eq, exponents_only, pause) + if res != eq: + changed = True + eq = res + if isinstance(res, bool): + return res + # Finally, replacing Exp(0) by 1 is always correct. + # So is polar_lift(0) -> 0. 
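+    # ("Exp(0)" here is the polar exp_polar(0): its projection to the complex line
+    # is 1, and polar_lift(0) projects to 0, so the substitution below leaves p(eq)
+    # unchanged.)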
+ from sympy.functions.elementary.exponential import exp_polar + return res.subs({exp_polar(0): 1, polar_lift(0): 0}) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/elementary/exponential.py b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/elementary/exponential.py new file mode 100644 index 0000000000000000000000000000000000000000..2bb0333cb34a35a96248c12a4640e848986f2feb --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/elementary/exponential.py @@ -0,0 +1,1286 @@ +from __future__ import annotations +from itertools import product + +from sympy.core.add import Add +from sympy.core.cache import cacheit +from sympy.core.expr import Expr +from sympy.core.function import (DefinedFunction, ArgumentIndexError, expand_log, + expand_mul, FunctionClass, PoleError, expand_multinomial, expand_complex) +from sympy.core.logic import fuzzy_and, fuzzy_not, fuzzy_or +from sympy.core.mul import Mul +from sympy.core.numbers import Integer, Rational, pi, I +from sympy.core.parameters import global_parameters +from sympy.core.power import Pow +from sympy.core.singleton import S +from sympy.core.symbol import Wild, Dummy +from sympy.core.sympify import sympify +from sympy.functions.combinatorial.factorials import factorial +from sympy.functions.elementary.complexes import arg, unpolarify, im, re, Abs +from sympy.functions.elementary.miscellaneous import sqrt +from sympy.ntheory import multiplicity, perfect_power +from sympy.ntheory.factor_ import factorint + +# NOTE IMPORTANT +# The series expansion code in this file is an important part of the gruntz +# algorithm for determining limits. _eval_nseries has to return a generalized +# power series with coefficients in C(log(x), log). +# In more detail, the result of _eval_nseries(self, x, n) must be +# c_0*x**e_0 + ... (finitely many terms) +# where e_i are numbers (not necessarily integers) and c_i involve only +# numbers, the function log, and log(x). [This also means it must not contain +# log(x(1+p)), this *has* to be expanded to log(x)+log(1+p) if x.is_positive and +# p.is_positive.] + + +class ExpBase(DefinedFunction): + + unbranched = True + _singularities = (S.ComplexInfinity,) + + @property + def kind(self): + return self.exp.kind + + def inverse(self, argindex=1): + """ + Returns the inverse function of ``exp(x)``. + """ + return log + + def as_numer_denom(self): + """ + Returns this with a positive exponent as a 2-tuple (a fraction). + + Examples + ======== + + >>> from sympy import exp + >>> from sympy.abc import x + >>> exp(-x).as_numer_denom() + (1, exp(x)) + >>> exp(x).as_numer_denom() + (exp(x), 1) + """ + # this should be the same as Pow.as_numer_denom wrt + # exponent handling + if not self.is_commutative: + return self, S.One + exp = self.exp + neg_exp = exp.is_negative + if not neg_exp and not (-exp).is_negative: + neg_exp = exp.could_extract_minus_sign() + if neg_exp: + return S.One, self.func(-exp) + return self, S.One + + @property + def exp(self): + """ + Returns the exponent of the function. + """ + return self.args[0] + + def as_base_exp(self): + """ + Returns the 2-tuple (base, exponent). 
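+
+        For example, with the ordinary (non-polar) exponential:
+
+        >>> from sympy import exp
+        >>> from sympy.abc import x
+        >>> exp(2*x).as_base_exp()
+        (E, 2*x)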
+ """ + return self.func(1), Mul(*self.args) + + def _eval_adjoint(self): + return self.func(self.exp.adjoint()) + + def _eval_conjugate(self): + return self.func(self.exp.conjugate()) + + def _eval_transpose(self): + return self.func(self.exp.transpose()) + + def _eval_is_finite(self): + arg = self.exp + if arg.is_infinite: + if arg.is_extended_negative: + return True + if arg.is_extended_positive: + return False + if arg.is_finite: + return True + + def _eval_is_rational(self): + s = self.func(*self.args) + if s.func == self.func: + z = s.exp.is_zero + if z: + return True + elif s.exp.is_rational and fuzzy_not(z): + return False + else: + return s.is_rational + + def _eval_is_zero(self): + return self.exp is S.NegativeInfinity + + def _eval_power(self, other): + """exp(arg)**e -> exp(arg*e) if assumptions allow it. + """ + b, e = self.as_base_exp() + return Pow._eval_power(Pow(b, e, evaluate=False), other) + + def _eval_expand_power_exp(self, **hints): + from sympy.concrete.products import Product + from sympy.concrete.summations import Sum + arg = self.args[0] + if arg.is_Add and arg.is_commutative: + return Mul.fromiter(self.func(x) for x in arg.args) + elif isinstance(arg, Sum) and arg.is_commutative: + return Product(self.func(arg.function), *arg.limits) + return self.func(arg) + + +class exp_polar(ExpBase): + r""" + Represent a *polar number* (see g-function Sphinx documentation). + + Explanation + =========== + + ``exp_polar`` represents the function + `Exp: \mathbb{C} \rightarrow \mathcal{S}`, sending the complex number + `z = a + bi` to the polar number `r = exp(a), \theta = b`. It is one of + the main functions to construct polar numbers. + + Examples + ======== + + >>> from sympy import exp_polar, pi, I, exp + + The main difference is that polar numbers do not "wrap around" at `2 \pi`: + + >>> exp(2*pi*I) + 1 + >>> exp_polar(2*pi*I) + exp_polar(2*I*pi) + + apart from that they behave mostly like classical complex numbers: + + >>> exp_polar(2)*exp_polar(3) + exp_polar(5) + + See Also + ======== + + sympy.simplify.powsimp.powsimp + polar_lift + periodic_argument + principal_branch + """ + + is_polar = True + is_comparable = False # cannot be evalf'd + + def _eval_Abs(self): # Abs is never a polar number + return exp(re(self.args[0])) + + def _eval_evalf(self, prec): + """ Careful! any evalf of polar numbers is flaky """ + i = im(self.args[0]) + try: + bad = (i <= -pi or i > pi) + except TypeError: + bad = True + if bad: + return self # cannot evalf for this argument + res = exp(self.args[0])._eval_evalf(prec) + if i > 0 and im(res) < 0: + # i ~ pi, but exp(I*i) evaluated to argument slightly bigger than pi + return re(res) + return res + + def _eval_power(self, other): + return self.func(self.args[0]*other) + + def _eval_is_extended_real(self): + if self.args[0].is_extended_real: + return True + + def as_base_exp(self): + # XXX exp_polar(0) is special! + if self.args[0] == 0: + return self, S.One + return ExpBase.as_base_exp(self) + + +class ExpMeta(FunctionClass): + def __instancecheck__(cls, instance): + if exp in instance.__class__.__mro__: + return True + return isinstance(instance, Pow) and instance.base is S.Exp1 + + +class exp(ExpBase, metaclass=ExpMeta): + """ + The exponential function, :math:`e^x`. 
+ + Examples + ======== + + >>> from sympy import exp, I, pi + >>> from sympy.abc import x + >>> exp(x) + exp(x) + >>> exp(x).diff(x) + exp(x) + >>> exp(I*pi) + -1 + + Parameters + ========== + + arg : Expr + + See Also + ======== + + log + """ + + def fdiff(self, argindex=1): + """ + Returns the first derivative of this function. + """ + if argindex == 1: + return self + else: + raise ArgumentIndexError(self, argindex) + + def _eval_refine(self, assumptions): + from sympy.assumptions import ask, Q + arg = self.args[0] + if arg.is_Mul: + Ioo = I*S.Infinity + if arg in [Ioo, -Ioo]: + return S.NaN + + coeff = arg.as_coefficient(pi*I) + if coeff: + if ask(Q.integer(2*coeff)): + if ask(Q.even(coeff)): + return S.One + elif ask(Q.odd(coeff)): + return S.NegativeOne + elif ask(Q.even(coeff + S.Half)): + return -I + elif ask(Q.odd(coeff + S.Half)): + return I + + @classmethod + def eval(cls, arg): + from sympy.calculus import AccumBounds + from sympy.matrices.matrixbase import MatrixBase + from sympy.sets.setexpr import SetExpr + from sympy.simplify.simplify import logcombine + if isinstance(arg, MatrixBase): + return arg.exp() + elif global_parameters.exp_is_pow: + return Pow(S.Exp1, arg) + elif arg.is_Number: + if arg is S.NaN: + return S.NaN + elif arg.is_zero: + return S.One + elif arg is S.One: + return S.Exp1 + elif arg is S.Infinity: + return S.Infinity + elif arg is S.NegativeInfinity: + return S.Zero + elif arg is S.ComplexInfinity: + return S.NaN + elif isinstance(arg, log): + return arg.args[0] + elif isinstance(arg, AccumBounds): + return AccumBounds(exp(arg.min), exp(arg.max)) + elif isinstance(arg, SetExpr): + return arg._eval_func(cls) + elif arg.is_Mul: + coeff = arg.as_coefficient(pi*I) + if coeff: + if (2*coeff).is_integer: + if coeff.is_even: + return S.One + elif coeff.is_odd: + return S.NegativeOne + elif (coeff + S.Half).is_even: + return -I + elif (coeff + S.Half).is_odd: + return I + elif coeff.is_Rational: + ncoeff = coeff % 2 # restrict to [0, 2pi) + if ncoeff > 1: # restrict to (-pi, pi] + ncoeff -= 2 + if ncoeff != coeff: + return cls(ncoeff*pi*I) + + # Warning: code in risch.py will be very sensitive to changes + # in this (see DifferentialExtension). + + # look for a single log factor + + coeff, terms = arg.as_coeff_Mul() + + # but it can't be multiplied by oo + if coeff in [S.NegativeInfinity, S.Infinity]: + if terms.is_number: + if coeff is S.NegativeInfinity: + terms = -terms + if re(terms).is_zero and terms is not S.Zero: + return S.NaN + if re(terms).is_positive and im(terms) is not S.Zero: + return S.ComplexInfinity + if re(terms).is_negative: + return S.Zero + return None + + coeffs, log_term = [coeff], None + for term in Mul.make_args(terms): + term_ = logcombine(term) + if isinstance(term_, log): + if log_term is None: + log_term = term_.args[0] + else: + return None + elif term.is_comparable: + coeffs.append(term) + else: + return None + + return log_term**Mul(*coeffs) if log_term else None + + elif arg.is_Add: + out = [] + add = [] + argchanged = False + for a in arg.args: + if a is S.One: + add.append(a) + continue + newa = cls(a) + if isinstance(newa, cls): + if newa.args[0] != a: + add.append(newa.args[0]) + argchanged = True + else: + add.append(a) + else: + out.append(newa) + if out or argchanged: + return Mul(*out)*cls(Add(*add), evaluate=False) + + if arg.is_zero: + return S.One + + @property + def base(self): + """ + Returns the base of the exponential function. 
+ """ + return S.Exp1 + + @staticmethod + @cacheit + def taylor_term(n, x, *previous_terms): + """ + Calculates the next term in the Taylor series expansion. + """ + if n < 0: + return S.Zero + if n == 0: + return S.One + x = sympify(x) + if previous_terms: + p = previous_terms[-1] + if p is not None: + return p * x / n + return x**n/factorial(n) + + def as_real_imag(self, deep=True, **hints): + """ + Returns this function as a 2-tuple representing a complex number. + + Examples + ======== + + >>> from sympy import exp, I + >>> from sympy.abc import x + >>> exp(x).as_real_imag() + (exp(re(x))*cos(im(x)), exp(re(x))*sin(im(x))) + >>> exp(1).as_real_imag() + (E, 0) + >>> exp(I).as_real_imag() + (cos(1), sin(1)) + >>> exp(1+I).as_real_imag() + (E*cos(1), E*sin(1)) + + See Also + ======== + + sympy.functions.elementary.complexes.re + sympy.functions.elementary.complexes.im + """ + from sympy.functions.elementary.trigonometric import cos, sin + re, im = self.args[0].as_real_imag() + if deep: + re = re.expand(deep, **hints) + im = im.expand(deep, **hints) + cos, sin = cos(im), sin(im) + return (exp(re)*cos, exp(re)*sin) + + def _eval_subs(self, old, new): + # keep processing of power-like args centralized in Pow + if old.is_Pow: # handle (exp(3*log(x))).subs(x**2, z) -> z**(3/2) + old = exp(old.exp*log(old.base)) + elif old is S.Exp1 and new.is_Function: + old = exp + if isinstance(old, exp) or old is S.Exp1: + f = lambda a: Pow(*a.as_base_exp(), evaluate=False) if ( + a.is_Pow or isinstance(a, exp)) else a + return Pow._eval_subs(f(self), f(old), new) + + if old is exp and not new.is_Function: + return new**self.exp._subs(old, new) + return super()._eval_subs(old, new) + + def _eval_is_extended_real(self): + if self.args[0].is_extended_real: + return True + elif self.args[0].is_imaginary: + arg2 = -S(2) * I * self.args[0] / pi + return arg2.is_even + + def _eval_is_complex(self): + def complex_extended_negative(arg): + yield arg.is_complex + yield arg.is_extended_negative + return fuzzy_or(complex_extended_negative(self.args[0])) + + def _eval_is_algebraic(self): + if (self.exp / pi / I).is_rational: + return True + if fuzzy_not(self.exp.is_zero): + if self.exp.is_algebraic: + return False + elif (self.exp / pi).is_rational: + return False + + def _eval_is_extended_positive(self): + if self.exp.is_extended_real: + return self.args[0] is not S.NegativeInfinity + elif self.exp.is_imaginary: + arg2 = -I * self.args[0] / pi + return arg2.is_even + + def _eval_nseries(self, x, n, logx, cdir=0): + # NOTE Please see the comment at the beginning of this file, labelled + # IMPORTANT. 
+ from sympy.functions.elementary.complexes import sign + from sympy.functions.elementary.integers import ceiling + from sympy.series.limits import limit + from sympy.series.order import Order + from sympy.simplify.powsimp import powsimp + arg = self.exp + arg_series = arg._eval_nseries(x, n=n, logx=logx) + if arg_series.is_Order: + return 1 + arg_series + arg0 = limit(arg_series.removeO(), x, 0) + if arg0 is S.NegativeInfinity: + return Order(x**n, x) + if arg0 is S.Infinity: + return self + if arg0.is_infinite: + raise PoleError("Cannot expand %s around 0" % (self)) + # checking for indecisiveness/ sign terms in arg0 + if any(isinstance(arg, sign) for arg in arg0.args): + return self + t = Dummy("t") + nterms = n + try: + cf = Order(arg.as_leading_term(x, logx=logx), x).getn() + except (NotImplementedError, PoleError): + cf = 0 + if cf and cf > 0: + nterms = ceiling(n/cf) + exp_series = exp(t)._taylor(t, nterms) + r = exp(arg0)*exp_series.subs(t, arg_series - arg0) + rep = {logx: log(x)} if logx is not None else {} + if r.subs(rep) == self: + return r + if cf and cf > 1: + r += Order((arg_series - arg0)**n, x)/x**((cf-1)*n) + else: + r += Order((arg_series - arg0)**n, x) + r = r.expand() + r = powsimp(r, deep=True, combine='exp') + # powsimp may introduce unexpanded (-1)**Rational; see PR #17201 + simplerat = lambda x: x.is_Rational and x.q in [3, 4, 6] + w = Wild('w', properties=[simplerat]) + r = r.replace(S.NegativeOne**w, expand_complex(S.NegativeOne**w)) + return r + + def _taylor(self, x, n): + l = [] + g = None + for i in range(n): + g = self.taylor_term(i, self.args[0], g) + g = g.nseries(x, n=n) + l.append(g.removeO()) + return Add(*l) + + def _eval_as_leading_term(self, x, logx, cdir): + from sympy.calculus.util import AccumBounds + arg = self.args[0].cancel().as_leading_term(x, logx=logx) + arg0 = arg.subs(x, 0) + if arg is S.NaN: + return S.NaN + if isinstance(arg0, AccumBounds): + # This check addresses a corner case involving AccumBounds. + # if isinstance(arg, AccumBounds) is True, then arg0 can either be 0, + # AccumBounds(-oo, 0) or AccumBounds(-oo, oo). + # Check out function: test_issue_18473() in test_exponential.py and + # test_limits.py for more information. + if re(cdir) < S.Zero: + return exp(-arg0) + return exp(arg0) + if arg0 is S.NaN: + arg0 = arg.limit(x, 0) + if arg0.is_infinite is False: + return exp(arg0) + raise PoleError("Cannot expand %s around 0" % (self)) + + def _eval_rewrite_as_sin(self, arg, **kwargs): + from sympy.functions.elementary.trigonometric import sin + return sin(I*arg + pi/2) - I*sin(I*arg) + + def _eval_rewrite_as_cos(self, arg, **kwargs): + from sympy.functions.elementary.trigonometric import cos + return cos(I*arg) + I*cos(I*arg + pi/2) + + def _eval_rewrite_as_tanh(self, arg, **kwargs): + from sympy.functions.elementary.hyperbolic import tanh + return (1 + tanh(arg/2))/(1 - tanh(arg/2)) + + def _eval_rewrite_as_sqrt(self, arg, **kwargs): + from sympy.functions.elementary.trigonometric import sin, cos + if arg.is_Mul: + coeff = arg.coeff(pi*I) + if coeff and coeff.is_number: + cosine, sine = cos(pi*coeff), sin(pi*coeff) + if not isinstance(cosine, cos) and not isinstance (sine, sin): + return cosine + I*sine + + def _eval_rewrite_as_Pow(self, arg, **kwargs): + if arg.is_Mul: + logs = [a for a in arg.args if isinstance(a, log) and len(a.args) == 1] + if logs: + return Pow(logs[0].args[0], arg.coeff(logs[0])) + + +def match_real_imag(expr): + r""" + Try to match expr with $a + Ib$ for real $a$ and $b$. 
+ + ``match_real_imag`` returns a tuple containing the real and imaginary + parts of expr or ``(None, None)`` if direct matching is not possible. Contrary + to :func:`~.re`, :func:`~.im``, and ``as_real_imag()``, this helper will not force things + by returning expressions themselves containing ``re()`` or ``im()`` and it + does not expand its argument either. + + """ + r_, i_ = expr.as_independent(I, as_Add=True) + if i_ == 0 and r_.is_real: + return (r_, i_) + i_ = i_.as_coefficient(I) + if i_ and i_.is_real and r_.is_real: + return (r_, i_) + else: + return (None, None) # simpler to check for than None + + +class log(DefinedFunction): + r""" + The natural logarithm function `\ln(x)` or `\log(x)`. + + Explanation + =========== + + Logarithms are taken with the natural base, `e`. To get + a logarithm of a different base ``b``, use ``log(x, b)``, + which is essentially short-hand for ``log(x)/log(b)``. + + ``log`` represents the principal branch of the natural + logarithm. As such it has a branch cut along the negative + real axis and returns values having a complex argument in + `(-\pi, \pi]`. + + Examples + ======== + + >>> from sympy import log, sqrt, S, I + >>> log(8, 2) + 3 + >>> log(S(8)/3, 2) + -log(3)/log(2) + 3 + >>> log(-1 + I*sqrt(3)) + log(2) + 2*I*pi/3 + + See Also + ======== + + exp + + """ + + args: tuple[Expr] + + _singularities = (S.Zero, S.ComplexInfinity) + + def fdiff(self, argindex=1): + """ + Returns the first derivative of the function. + """ + if argindex == 1: + return 1/self.args[0] + else: + raise ArgumentIndexError(self, argindex) + + def inverse(self, argindex=1): + r""" + Returns `e^x`, the inverse function of `\log(x)`. + """ + return exp + + @classmethod + def eval(cls, arg, base=None): + from sympy.calculus import AccumBounds + from sympy.sets.setexpr import SetExpr + + arg = sympify(arg) + + if base is not None: + base = sympify(base) + if base == 1: + if arg == 1: + return S.NaN + else: + return S.ComplexInfinity + try: + # handle extraction of powers of the base now + # or else expand_log in Mul would have to handle this + n = multiplicity(base, arg) + if n: + return n + log(arg / base**n) / log(base) + else: + return log(arg)/log(base) + except ValueError: + pass + if base is not S.Exp1: + return cls(arg)/cls(base) + else: + return cls(arg) + + if arg.is_Number: + if arg.is_zero: + return S.ComplexInfinity + elif arg is S.One: + return S.Zero + elif arg is S.Infinity: + return S.Infinity + elif arg is S.NegativeInfinity: + return S.Infinity + elif arg is S.NaN: + return S.NaN + elif arg.is_Rational and arg.p == 1: + return -cls(arg.q) + + if arg.is_Pow and arg.base is S.Exp1 and arg.exp.is_extended_real: + return arg.exp + if isinstance(arg, exp) and arg.exp.is_extended_real: + return arg.exp + elif isinstance(arg, exp) and arg.exp.is_number: + r_, i_ = match_real_imag(arg.exp) + if i_ and i_.is_comparable: + i_ %= 2*pi + if i_ > pi: + i_ -= 2*pi + return r_ + expand_mul(i_ * I, deep=False) + elif isinstance(arg, exp_polar): + return unpolarify(arg.exp) + elif isinstance(arg, AccumBounds): + if arg.min.is_positive: + return AccumBounds(log(arg.min), log(arg.max)) + elif arg.min.is_zero: + return AccumBounds(S.NegativeInfinity, log(arg.max)) + else: + return S.NaN + elif isinstance(arg, SetExpr): + return arg._eval_func(cls) + + if arg.is_number: + if arg.is_negative: + return pi * I + cls(-arg) + elif arg is S.ComplexInfinity: + return S.ComplexInfinity + elif arg is S.Exp1: + return S.One + + if arg.is_zero: + return S.ComplexInfinity + + # don't 
autoexpand Pow or Mul (see the issue 3351): + if not arg.is_Add: + coeff = arg.as_coefficient(I) + + if coeff is not None: + if coeff is S.Infinity: + return S.Infinity + elif coeff is S.NegativeInfinity: + return S.Infinity + elif coeff.is_Rational: + if coeff.is_nonnegative: + return pi * I * S.Half + cls(coeff) + else: + return -pi * I * S.Half + cls(-coeff) + + if arg.is_number and arg.is_algebraic: + # Match arg = coeff*(r_ + i_*I) with coeff>0, r_ and i_ real. + coeff, arg_ = arg.as_independent(I, as_Add=False) + if coeff.is_negative: + coeff *= -1 + arg_ *= -1 + arg_ = expand_mul(arg_, deep=False) + r_, i_ = arg_.as_independent(I, as_Add=True) + i_ = i_.as_coefficient(I) + if coeff.is_real and i_ and i_.is_real and r_.is_real: + if r_.is_zero: + if i_.is_positive: + return pi * I * S.Half + cls(coeff * i_) + elif i_.is_negative: + return -pi * I * S.Half + cls(coeff * -i_) + else: + from sympy.simplify import ratsimp + # Check for arguments involving rational multiples of pi + t = (i_/r_).cancel() + t1 = (-t).cancel() + atan_table = _log_atan_table() + if t in atan_table: + modulus = ratsimp(coeff * Abs(arg_)) + if r_.is_positive: + return cls(modulus) + I * atan_table[t] + else: + return cls(modulus) + I * (atan_table[t] - pi) + elif t1 in atan_table: + modulus = ratsimp(coeff * Abs(arg_)) + if r_.is_positive: + return cls(modulus) + I * (-atan_table[t1]) + else: + return cls(modulus) + I * (pi - atan_table[t1]) + + @staticmethod + @cacheit + def taylor_term(n, x, *previous_terms): # of log(1+x) + r""" + Returns the next term in the Taylor series expansion of `\log(1+x)`. + """ + from sympy.simplify.powsimp import powsimp + if n < 0: + return S.Zero + x = sympify(x) + if n == 0: + return x + if previous_terms: + p = previous_terms[-1] + if p is not None: + return powsimp((-n) * p * x / (n + 1), deep=True, combine='exp') + return (1 - 2*(n % 2)) * x**(n + 1)/(n + 1) + + def _eval_expand_log(self, deep=True, **hints): + from sympy.concrete import Sum, Product + force = hints.get('force', False) + factor = hints.get('factor', False) + if (len(self.args) == 2): + return expand_log(self.func(*self.args), deep=deep, force=force) + arg = self.args[0] + if arg.is_Integer: + # remove perfect powers + p = perfect_power(arg) + logarg = None + coeff = 1 + if p is not False: + arg, coeff = p + logarg = self.func(arg) + # expand as product of its prime factors if factor=True + if factor: + p = factorint(arg) + if arg not in p.keys(): + logarg = sum(n*log(val) for val, n in p.items()) + if logarg is not None: + return coeff*logarg + elif arg.is_Rational: + return log(arg.p) - log(arg.q) + elif arg.is_Mul: + expr = [] + nonpos = [] + for x in arg.args: + if force or x.is_positive or x.is_polar: + a = self.func(x) + if isinstance(a, log): + expr.append(self.func(x)._eval_expand_log(**hints)) + else: + expr.append(a) + elif x.is_negative: + a = self.func(-x) + expr.append(a) + nonpos.append(S.NegativeOne) + else: + nonpos.append(x) + return Add(*expr) + log(Mul(*nonpos)) + elif arg.is_Pow or isinstance(arg, exp): + if force or (arg.exp.is_extended_real and (arg.base.is_positive or ((arg.exp+1) + .is_positive and (arg.exp-1).is_nonpositive))) or arg.base.is_polar: + b = arg.base + e = arg.exp + a = self.func(b) + if isinstance(a, log): + return unpolarify(e) * a._eval_expand_log(**hints) + else: + return unpolarify(e) * a + elif isinstance(arg, Product): + if force or arg.function.is_positive: + return Sum(log(arg.function), *arg.limits) + + return self.func(arg) + + def _eval_simplify(self, 
**kwargs): + from sympy.simplify.simplify import expand_log, simplify, inversecombine + if len(self.args) == 2: # it's unevaluated + return simplify(self.func(*self.args), **kwargs) + + expr = self.func(simplify(self.args[0], **kwargs)) + if kwargs['inverse']: + expr = inversecombine(expr) + expr = expand_log(expr, deep=True) + return min([expr, self], key=kwargs['measure']) + + def as_real_imag(self, deep=True, **hints): + """ + Returns this function as a complex coordinate. + + Examples + ======== + + >>> from sympy import I, log + >>> from sympy.abc import x + >>> log(x).as_real_imag() + (log(Abs(x)), arg(x)) + >>> log(I).as_real_imag() + (0, pi/2) + >>> log(1 + I).as_real_imag() + (log(sqrt(2)), pi/4) + >>> log(I*x).as_real_imag() + (log(Abs(x)), arg(I*x)) + + """ + sarg = self.args[0] + if deep: + sarg = self.args[0].expand(deep, **hints) + sarg_abs = Abs(sarg) + if sarg_abs == sarg: + return self, S.Zero + sarg_arg = arg(sarg) + if hints.get('log', False): # Expand the log + hints['complex'] = False + return (log(sarg_abs).expand(deep, **hints), sarg_arg) + else: + return log(sarg_abs), sarg_arg + + def _eval_is_rational(self): + s = self.func(*self.args) + if s.func == self.func: + if (self.args[0] - 1).is_zero: + return True + if s.args[0].is_rational and fuzzy_not((self.args[0] - 1).is_zero): + return False + else: + return s.is_rational + + def _eval_is_algebraic(self): + s = self.func(*self.args) + if s.func == self.func: + if (self.args[0] - 1).is_zero: + return True + elif fuzzy_not((self.args[0] - 1).is_zero): + if self.args[0].is_algebraic: + return False + else: + return s.is_algebraic + + def _eval_is_extended_real(self): + return self.args[0].is_extended_positive + + def _eval_is_complex(self): + z = self.args[0] + return fuzzy_and([z.is_complex, fuzzy_not(z.is_zero)]) + + def _eval_is_finite(self): + arg = self.args[0] + if arg.is_zero: + return False + return arg.is_finite + + def _eval_is_extended_positive(self): + return (self.args[0] - 1).is_extended_positive + + def _eval_is_zero(self): + return (self.args[0] - 1).is_zero + + def _eval_is_extended_nonnegative(self): + return (self.args[0] - 1).is_extended_nonnegative + + def _eval_nseries(self, x, n, logx, cdir=0): + # NOTE Please see the comment at the beginning of this file, labelled + # IMPORTANT. + from sympy.series.order import Order + from sympy.simplify.simplify import logcombine + from sympy.core.symbol import Dummy + + if self.args[0] == x: + return log(x) if logx is None else logx + arg = self.args[0] + t = Dummy('t', positive=True) + if cdir == 0: + cdir = 1 + z = arg.subs(x, cdir*t) + + k, l = Wild("k"), Wild("l") + r = z.match(k*t**l) + if r is not None: + k, l = r[k], r[l] + if l != 0 and not l.has(t) and not k.has(t): + r = l*log(x) if logx is None else l*logx + r += log(k) - l*log(cdir) # XXX true regardless of assumptions? 
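+                # a small sketch of this fast path (assuming cdir == 1): for
+                # self == log(3*x**2) the match gives k == 3, l == 2, so
+                # r == 2*log(x) + log(3), already the full expansion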
+ return r + + def coeff_exp(term, x): + coeff, exp = S.One, S.Zero + for factor in Mul.make_args(term): + if factor.has(x): + base, exp = factor.as_base_exp() + if base != x: + try: + return term.leadterm(x) + except ValueError: + return term, S.Zero + else: + coeff *= factor + return coeff, exp + + # TODO new and probably slow + try: + a, b = z.leadterm(t, logx=logx, cdir=1) + except (ValueError, NotImplementedError, PoleError): + s = z._eval_nseries(t, n=n, logx=logx, cdir=1) + while s.is_Order: + n += 1 + s = z._eval_nseries(t, n=n, logx=logx, cdir=1) + try: + a, b = s.removeO().leadterm(t, cdir=1) + except ValueError: + a, b = s.removeO().as_leading_term(t, cdir=1), S.Zero + + p = (z/(a*t**b) - 1).cancel()._eval_nseries(t, n=n, logx=logx, cdir=1) + if p.has(exp): + p = logcombine(p) + if isinstance(p, Order): + n = p.getn() + _, d = coeff_exp(p, t) + logx = log(x) if logx is None else logx + + if not d.is_positive: + res = log(a) - b*log(cdir) + b*logx + _res = res + logflags = {"deep": True, "log": True, "mul": False, "power_exp": False, + "power_base": False, "multinomial": False, "basic": False, "force": True, + "factor": False} + expr = self.expand(**logflags) + if (not a.could_extract_minus_sign() and + logx.could_extract_minus_sign()): + _res = _res.subs(-logx, -log(x)).expand(**logflags) + else: + _res = _res.subs(logx, log(x)).expand(**logflags) + if _res == expr: + return res + return res + Order(x**n, x) + + def mul(d1, d2): + res = {} + for e1, e2 in product(d1, d2): + ex = e1 + e2 + if ex < n: + res[ex] = res.get(ex, S.Zero) + d1[e1]*d2[e2] + return res + + pterms = {} + + for term in Add.make_args(p.removeO()): + co1, e1 = coeff_exp(term, t) + pterms[e1] = pterms.get(e1, S.Zero) + co1 + + k = S.One + terms = {} + pk = pterms + + while k*d < n: + coeff = -S.NegativeOne**k/k + for ex in pk: + terms[ex] = terms.get(ex, S.Zero) + coeff*pk[ex] + pk = mul(pk, pterms) + k += S.One + + res = log(a) - b*log(cdir) + b*logx + for ex in terms: + res += terms[ex].cancel()*t**(ex) + + if a.is_negative and im(z) != 0: + from sympy.functions.special.delta_functions import Heaviside + for i, term in enumerate(z.lseries(t)): + if not term.is_real or i == 5: + break + if i < 5: + coeff, _ = term.as_coeff_exponent(t) + res += -2*I*pi*Heaviside(-im(coeff), 0) + + res = res.subs(t, x/cdir) + return res + Order(x**n, x) + + def _eval_as_leading_term(self, x, logx, cdir): + # NOTE + # Refer https://github.com/sympy/sympy/pull/23592 for more information + # on each of the following steps involved in this method. 
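+        # A minimal sketch of the expected outcomes (assuming standard SymPy
+        # semantics): log(1 + x).as_leading_term(x) reduces to x via STEP 3
+        # below, while log(x + x**2).as_leading_term(x) gives log(x) via STEP 4.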
+ arg0 = self.args[0].together() + + # STEP 1 + t = Dummy('t', positive=True) + if cdir == 0: + cdir = 1 + z = arg0.subs(x, cdir*t) + + # STEP 2 + try: + c, e = z.leadterm(t, logx=logx, cdir=1) + except ValueError: + arg = arg0.as_leading_term(x, logx=logx, cdir=cdir) + return log(arg) + if c.has(t): + c = c.subs(t, x/cdir) + if e != 0: + raise PoleError("Cannot expand %s around 0" % (self)) + return log(c) + + # STEP 3 + if c == S.One and e == S.Zero: + return (arg0 - S.One).as_leading_term(x, logx=logx) + + # STEP 4 + res = log(c) - e*log(cdir) + logx = log(x) if logx is None else logx + res += e*logx + + # STEP 5 + if c.is_negative and im(z) != 0: + from sympy.functions.special.delta_functions import Heaviside + for i, term in enumerate(z.lseries(t)): + if not term.is_real or i == 5: + break + if i < 5: + coeff, _ = term.as_coeff_exponent(t) + res += -2*I*pi*Heaviside(-im(coeff), 0) + return res + + +class LambertW(DefinedFunction): + r""" + The Lambert W function $W(z)$ is defined as the inverse + function of $w \exp(w)$ [1]_. + + Explanation + =========== + + In other words, the value of $W(z)$ is such that $z = W(z) \exp(W(z))$ + for any complex number $z$. The Lambert W function is a multivalued + function with infinitely many branches $W_k(z)$, indexed by + $k \in \mathbb{Z}$. Each branch gives a different solution $w$ + of the equation $z = w \exp(w)$. + + The Lambert W function has two partially real branches: the + principal branch ($k = 0$) is real for real $z > -1/e$, and the + $k = -1$ branch is real for $-1/e < z < 0$. All branches except + $k = 0$ have a logarithmic singularity at $z = 0$. + + Examples + ======== + + >>> from sympy import LambertW + >>> LambertW(1.2) + 0.635564016364870 + >>> LambertW(1.2, -1).n() + -1.34747534407696 - 4.41624341514535*I + >>> LambertW(-1).is_real + False + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Lambert_W_function + """ + _singularities = (-Pow(S.Exp1, -1, evaluate=False), S.ComplexInfinity) + + @classmethod + def eval(cls, x, k=None): + if k == S.Zero: + return cls(x) + elif k is None: + k = S.Zero + + if k.is_zero: + if x.is_zero: + return S.Zero + if x is S.Exp1: + return S.One + if x == -1/S.Exp1: + return S.NegativeOne + if x == -log(2)/2: + return -log(2) + if x == 2*log(2): + return log(2) + if x == -pi/2: + return I*pi/2 + if x == exp(1 + S.Exp1): + return S.Exp1 + if x is S.Infinity: + return S.Infinity + + if fuzzy_not(k.is_zero): + if x.is_zero: + return S.NegativeInfinity + if k is S.NegativeOne: + if x == -pi/2: + return -I*pi/2 + elif x == -1/S.Exp1: + return S.NegativeOne + elif x == -2*exp(-2): + return -Integer(2) + + def fdiff(self, argindex=1): + """ + Return the first derivative of this function. 
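+
+        For the principal branch this is the standard identity
+        d/dx W(x) = W(x)/(x*(1 + W(x))); the expression constructed below
+        uses the same form for the other branches as well.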
+ """ + x = self.args[0] + + if len(self.args) == 1: + if argindex == 1: + return LambertW(x)/(x*(1 + LambertW(x))) + else: + k = self.args[1] + if argindex == 1: + return LambertW(x, k)/(x*(1 + LambertW(x, k))) + + raise ArgumentIndexError(self, argindex) + + def _eval_is_extended_real(self): + x = self.args[0] + if len(self.args) == 1: + k = S.Zero + else: + k = self.args[1] + if k.is_zero: + if (x + 1/S.Exp1).is_positive: + return True + elif (x + 1/S.Exp1).is_nonpositive: + return False + elif (k + 1).is_zero: + if x.is_negative and (x + 1/S.Exp1).is_positive: + return True + elif x.is_nonpositive or (x + 1/S.Exp1).is_nonnegative: + return False + elif fuzzy_not(k.is_zero) and fuzzy_not((k + 1).is_zero): + if x.is_extended_real: + return False + + def _eval_is_finite(self): + return self.args[0].is_finite + + def _eval_is_algebraic(self): + s = self.func(*self.args) + if s.func == self.func: + if fuzzy_not(self.args[0].is_zero) and self.args[0].is_algebraic: + return False + else: + return s.is_algebraic + + def _eval_as_leading_term(self, x, logx, cdir): + if len(self.args) == 1: + arg = self.args[0] + arg0 = arg.subs(x, 0).cancel() + if not arg0.is_zero: + return self.func(arg0) + return arg.as_leading_term(x) + + def _eval_nseries(self, x, n, logx, cdir=0): + if len(self.args) == 1: + from sympy.functions.elementary.integers import ceiling + from sympy.series.order import Order + arg = self.args[0].nseries(x, n=n, logx=logx) + lt = arg.as_leading_term(x, logx=logx) + lte = 1 + if lt.is_Pow: + lte = lt.exp + if ceiling(n/lte) >= 1: + s = Add(*[(-S.One)**(k - 1)*Integer(k)**(k - 2)/ + factorial(k - 1)*arg**k for k in range(1, ceiling(n/lte))]) + s = expand_multinomial(s) + else: + s = S.Zero + + return s + Order(x**n, x) + return super()._eval_nseries(x, n, logx) + + def _eval_is_zero(self): + x = self.args[0] + if len(self.args) == 1: + return x.is_zero + else: + return fuzzy_and([x.is_zero, self.args[1].is_zero]) + + +@cacheit +def _log_atan_table(): + return { + # first quadrant only + sqrt(3): pi / 3, + 1: pi / 4, + sqrt(5 - 2 * sqrt(5)): pi / 5, + sqrt(2) * sqrt(5 - sqrt(5)) / (1 + sqrt(5)): pi / 5, + sqrt(5 + 2 * sqrt(5)): pi * Rational(2, 5), + sqrt(2) * sqrt(sqrt(5) + 5) / (-1 + sqrt(5)): pi * Rational(2, 5), + sqrt(3) / 3: pi / 6, + sqrt(2) - 1: pi / 8, + sqrt(2 - sqrt(2)) / sqrt(sqrt(2) + 2): pi / 8, + sqrt(2) + 1: pi * Rational(3, 8), + sqrt(sqrt(2) + 2) / sqrt(2 - sqrt(2)): pi * Rational(3, 8), + sqrt(1 - 2 * sqrt(5) / 5): pi / 10, + (-sqrt(2) + sqrt(10)) / (2 * sqrt(sqrt(5) + 5)): pi / 10, + sqrt(1 + 2 * sqrt(5) / 5): pi * Rational(3, 10), + (sqrt(2) + sqrt(10)) / (2 * sqrt(5 - sqrt(5))): pi * Rational(3, 10), + 2 - sqrt(3): pi / 12, + (-1 + sqrt(3)) / (1 + sqrt(3)): pi / 12, + 2 + sqrt(3): pi * Rational(5, 12), + (1 + sqrt(3)) / (-1 + sqrt(3)): pi * Rational(5, 12) + } diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/elementary/hyperbolic.py b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/elementary/hyperbolic.py new file mode 100644 index 0000000000000000000000000000000000000000..1031d035373bb641d26e61a395e6048906285bfe --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/elementary/hyperbolic.py @@ -0,0 +1,2285 @@ +from sympy.core import S, sympify, cacheit +from sympy.core.add import Add +from sympy.core.function import DefinedFunction, ArgumentIndexError +from sympy.core.logic import fuzzy_or, fuzzy_and, fuzzy_not, FuzzyBool +from sympy.core.numbers import I, pi, Rational +from sympy.core.symbol 
import Dummy +from sympy.functions.combinatorial.factorials import (binomial, factorial, + RisingFactorial) +from sympy.functions.combinatorial.numbers import bernoulli, euler, nC +from sympy.functions.elementary.complexes import Abs, im, re +from sympy.functions.elementary.exponential import exp, log, match_real_imag +from sympy.functions.elementary.integers import floor +from sympy.functions.elementary.miscellaneous import sqrt +from sympy.functions.elementary.trigonometric import ( + acos, acot, asin, atan, cos, cot, csc, sec, sin, tan, + _imaginary_unit_as_coefficient) +from sympy.polys.specialpolys import symmetric_poly + + +def _rewrite_hyperbolics_as_exp(expr): + return expr.xreplace({h: h.rewrite(exp) + for h in expr.atoms(HyperbolicFunction)}) + + +@cacheit +def _acosh_table(): + return { + I: log(I*(1 + sqrt(2))), + -I: log(-I*(1 + sqrt(2))), + S.Half: pi/3, + Rational(-1, 2): pi*Rational(2, 3), + sqrt(2)/2: pi/4, + -sqrt(2)/2: pi*Rational(3, 4), + 1/sqrt(2): pi/4, + -1/sqrt(2): pi*Rational(3, 4), + sqrt(3)/2: pi/6, + -sqrt(3)/2: pi*Rational(5, 6), + (sqrt(3) - 1)/sqrt(2**3): pi*Rational(5, 12), + -(sqrt(3) - 1)/sqrt(2**3): pi*Rational(7, 12), + sqrt(2 + sqrt(2))/2: pi/8, + -sqrt(2 + sqrt(2))/2: pi*Rational(7, 8), + sqrt(2 - sqrt(2))/2: pi*Rational(3, 8), + -sqrt(2 - sqrt(2))/2: pi*Rational(5, 8), + (1 + sqrt(3))/(2*sqrt(2)): pi/12, + -(1 + sqrt(3))/(2*sqrt(2)): pi*Rational(11, 12), + (sqrt(5) + 1)/4: pi/5, + -(sqrt(5) + 1)/4: pi*Rational(4, 5) + } + + +@cacheit +def _acsch_table(): + return { + I: -pi / 2, + I*(sqrt(2) + sqrt(6)): -pi / 12, + I*(1 + sqrt(5)): -pi / 10, + I*2 / sqrt(2 - sqrt(2)): -pi / 8, + I*2: -pi / 6, + I*sqrt(2 + 2/sqrt(5)): -pi / 5, + I*sqrt(2): -pi / 4, + I*(sqrt(5)-1): -3*pi / 10, + I*2 / sqrt(3): -pi / 3, + I*2 / sqrt(2 + sqrt(2)): -3*pi / 8, + I*sqrt(2 - 2/sqrt(5)): -2*pi / 5, + I*(sqrt(6) - sqrt(2)): -5*pi / 12, + S(2): -I*log((1+sqrt(5))/2), + } + + +@cacheit +def _asech_table(): + return { + I: - (pi*I / 2) + log(1 + sqrt(2)), + -I: (pi*I / 2) + log(1 + sqrt(2)), + (sqrt(6) - sqrt(2)): pi / 12, + (sqrt(2) - sqrt(6)): 11*pi / 12, + sqrt(2 - 2/sqrt(5)): pi / 10, + -sqrt(2 - 2/sqrt(5)): 9*pi / 10, + 2 / sqrt(2 + sqrt(2)): pi / 8, + -2 / sqrt(2 + sqrt(2)): 7*pi / 8, + 2 / sqrt(3): pi / 6, + -2 / sqrt(3): 5*pi / 6, + (sqrt(5) - 1): pi / 5, + (1 - sqrt(5)): 4*pi / 5, + sqrt(2): pi / 4, + -sqrt(2): 3*pi / 4, + sqrt(2 + 2/sqrt(5)): 3*pi / 10, + -sqrt(2 + 2/sqrt(5)): 7*pi / 10, + S(2): pi / 3, + -S(2): 2*pi / 3, + sqrt(2*(2 + sqrt(2))): 3*pi / 8, + -sqrt(2*(2 + sqrt(2))): 5*pi / 8, + (1 + sqrt(5)): 2*pi / 5, + (-1 - sqrt(5)): 3*pi / 5, + (sqrt(6) + sqrt(2)): 5*pi / 12, + (-sqrt(6) - sqrt(2)): 7*pi / 12, + I*S.Infinity: -pi*I / 2, + I*S.NegativeInfinity: pi*I / 2, + } + +############################################################################### +########################### HYPERBOLIC FUNCTIONS ############################## +############################################################################### + + +class HyperbolicFunction(DefinedFunction): + """ + Base class for hyperbolic functions. + + See Also + ======== + + sinh, cosh, tanh, coth + """ + + unbranched = True + + +def _peeloff_ipi(arg): + r""" + Split ARG into two parts, a "rest" and a multiple of $I\pi$. + This assumes ARG to be an ``Add``. + The multiple of $I\pi$ returned in the second position is always a ``Rational``. 
+ + Examples + ======== + + >>> from sympy.functions.elementary.hyperbolic import _peeloff_ipi as peel + >>> from sympy import pi, I + >>> from sympy.abc import x, y + >>> peel(x + I*pi/2) + (x, 1/2) + >>> peel(x + I*2*pi/3 + I*pi*y) + (x + I*pi*y + I*pi/6, 1/2) + """ + ipi = pi*I + for a in Add.make_args(arg): + if a == ipi: + K = S.One + break + elif a.is_Mul: + K, p = a.as_two_terms() + if p == ipi and K.is_Rational: + break + else: + return arg, S.Zero + + m1 = (K % S.Half) + m2 = K - m1 + return arg - m2*ipi, m2 + + +class sinh(HyperbolicFunction): + r""" + ``sinh(x)`` is the hyperbolic sine of ``x``. + + The hyperbolic sine function is $\frac{e^x - e^{-x}}{2}$. + + Examples + ======== + + >>> from sympy import sinh + >>> from sympy.abc import x + >>> sinh(x) + sinh(x) + + See Also + ======== + + cosh, tanh, asinh + """ + + def fdiff(self, argindex=1): + """ + Returns the first derivative of this function. + """ + if argindex == 1: + return cosh(self.args[0]) + else: + raise ArgumentIndexError(self, argindex) + + def inverse(self, argindex=1): + """ + Returns the inverse of this function. + """ + return asinh + + @classmethod + def eval(cls, arg): + if arg.is_Number: + if arg is S.NaN: + return S.NaN + elif arg is S.Infinity: + return S.Infinity + elif arg is S.NegativeInfinity: + return S.NegativeInfinity + elif arg.is_zero: + return S.Zero + elif arg.is_negative: + return -cls(-arg) + else: + if arg is S.ComplexInfinity: + return S.NaN + + i_coeff = _imaginary_unit_as_coefficient(arg) + + if i_coeff is not None: + return I * sin(i_coeff) + else: + if arg.could_extract_minus_sign(): + return -cls(-arg) + + if arg.is_Add: + x, m = _peeloff_ipi(arg) + if m: + m = m*pi*I + return sinh(m)*cosh(x) + cosh(m)*sinh(x) + + if arg.is_zero: + return S.Zero + + if arg.func == asinh: + return arg.args[0] + + if arg.func == acosh: + x = arg.args[0] + return sqrt(x - 1) * sqrt(x + 1) + + if arg.func == atanh: + x = arg.args[0] + return x/sqrt(1 - x**2) + + if arg.func == acoth: + x = arg.args[0] + return 1/(sqrt(x - 1) * sqrt(x + 1)) + + @staticmethod + @cacheit + def taylor_term(n, x, *previous_terms): + """ + Returns the next term in the Taylor series expansion. + """ + if n < 0 or n % 2 == 0: + return S.Zero + else: + x = sympify(x) + + if len(previous_terms) > 2: + p = previous_terms[-2] + return p * x**2 / (n*(n - 1)) + else: + return x**(n) / factorial(n) + + def _eval_conjugate(self): + return self.func(self.args[0].conjugate()) + + def as_real_imag(self, deep=True, **hints): + """ + Returns this function as a complex coordinate. 
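+
+        For an argument a + I*b with real a and b this corresponds to the
+        identity sinh(a + I*b) = sinh(a)*cos(b) + I*cosh(a)*sin(b), which is
+        what the expression below returns as a 2-tuple.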
+ """ + if self.args[0].is_extended_real: + if deep: + hints['complex'] = False + return (self.expand(deep, **hints), S.Zero) + else: + return (self, S.Zero) + if deep: + re, im = self.args[0].expand(deep, **hints).as_real_imag() + else: + re, im = self.args[0].as_real_imag() + return (sinh(re)*cos(im), cosh(re)*sin(im)) + + def _eval_expand_complex(self, deep=True, **hints): + re_part, im_part = self.as_real_imag(deep=deep, **hints) + return re_part + im_part*I + + def _eval_expand_trig(self, deep=True, **hints): + if deep: + arg = self.args[0].expand(deep, **hints) + else: + arg = self.args[0] + x = None + if arg.is_Add: # TODO, implement more if deep stuff here + x, y = arg.as_two_terms() + else: + coeff, terms = arg.as_coeff_Mul(rational=True) + if coeff is not S.One and coeff.is_Integer and terms is not S.One: + x = terms + y = (coeff - 1)*x + if x is not None: + return (sinh(x)*cosh(y) + sinh(y)*cosh(x)).expand(trig=True) + return sinh(arg) + + def _eval_rewrite_as_tractable(self, arg, limitvar=None, **kwargs): + return (exp(arg) - exp(-arg)) / 2 + + def _eval_rewrite_as_exp(self, arg, **kwargs): + return (exp(arg) - exp(-arg)) / 2 + + def _eval_rewrite_as_sin(self, arg, **kwargs): + return -I * sin(I * arg) + + def _eval_rewrite_as_csc(self, arg, **kwargs): + return -I / csc(I * arg) + + def _eval_rewrite_as_cosh(self, arg, **kwargs): + return -I*cosh(arg + pi*I/2) + + def _eval_rewrite_as_tanh(self, arg, **kwargs): + tanh_half = tanh(S.Half*arg) + return 2*tanh_half/(1 - tanh_half**2) + + def _eval_rewrite_as_coth(self, arg, **kwargs): + coth_half = coth(S.Half*arg) + return 2*coth_half/(coth_half**2 - 1) + + def _eval_rewrite_as_csch(self, arg, **kwargs): + return 1 / csch(arg) + + def _eval_as_leading_term(self, x, logx, cdir): + arg = self.args[0].as_leading_term(x, logx=logx, cdir=cdir) + arg0 = arg.subs(x, 0) + + if arg0 is S.NaN: + arg0 = arg.limit(x, 0, dir='-' if cdir.is_negative else '+') + if arg0.is_zero: + return arg + elif arg0.is_finite: + return self.func(arg0) + else: + return self + + def _eval_is_real(self): + arg = self.args[0] + if arg.is_real: + return True + + # if `im` is of the form n*pi + # else, check if it is a number + re, im = arg.as_real_imag() + return (im%pi).is_zero + + def _eval_is_extended_real(self): + if self.args[0].is_extended_real: + return True + + def _eval_is_positive(self): + if self.args[0].is_extended_real: + return self.args[0].is_positive + + def _eval_is_negative(self): + if self.args[0].is_extended_real: + return self.args[0].is_negative + + def _eval_is_finite(self): + arg = self.args[0] + return arg.is_finite + + def _eval_is_zero(self): + rest, ipi_mult = _peeloff_ipi(self.args[0]) + if rest.is_zero: + return ipi_mult.is_integer + + +class cosh(HyperbolicFunction): + r""" + ``cosh(x)`` is the hyperbolic cosine of ``x``. + + The hyperbolic cosine function is $\frac{e^x + e^{-x}}{2}$. 
+ + Examples + ======== + + >>> from sympy import cosh + >>> from sympy.abc import x + >>> cosh(x) + cosh(x) + + See Also + ======== + + sinh, tanh, acosh + """ + + def fdiff(self, argindex=1): + if argindex == 1: + return sinh(self.args[0]) + else: + raise ArgumentIndexError(self, argindex) + + @classmethod + def eval(cls, arg): + from sympy.functions.elementary.trigonometric import cos + if arg.is_Number: + if arg is S.NaN: + return S.NaN + elif arg is S.Infinity: + return S.Infinity + elif arg is S.NegativeInfinity: + return S.Infinity + elif arg.is_zero: + return S.One + elif arg.is_negative: + return cls(-arg) + else: + if arg is S.ComplexInfinity: + return S.NaN + + i_coeff = _imaginary_unit_as_coefficient(arg) + + if i_coeff is not None: + return cos(i_coeff) + else: + if arg.could_extract_minus_sign(): + return cls(-arg) + + if arg.is_Add: + x, m = _peeloff_ipi(arg) + if m: + m = m*pi*I + return cosh(m)*cosh(x) + sinh(m)*sinh(x) + + if arg.is_zero: + return S.One + + if arg.func == asinh: + return sqrt(1 + arg.args[0]**2) + + if arg.func == acosh: + return arg.args[0] + + if arg.func == atanh: + return 1/sqrt(1 - arg.args[0]**2) + + if arg.func == acoth: + x = arg.args[0] + return x/(sqrt(x - 1) * sqrt(x + 1)) + + @staticmethod + @cacheit + def taylor_term(n, x, *previous_terms): + if n < 0 or n % 2 == 1: + return S.Zero + else: + x = sympify(x) + + if len(previous_terms) > 2: + p = previous_terms[-2] + return p * x**2 / (n*(n - 1)) + else: + return x**(n)/factorial(n) + + def _eval_conjugate(self): + return self.func(self.args[0].conjugate()) + + def as_real_imag(self, deep=True, **hints): + if self.args[0].is_extended_real: + if deep: + hints['complex'] = False + return (self.expand(deep, **hints), S.Zero) + else: + return (self, S.Zero) + if deep: + re, im = self.args[0].expand(deep, **hints).as_real_imag() + else: + re, im = self.args[0].as_real_imag() + + return (cosh(re)*cos(im), sinh(re)*sin(im)) + + def _eval_expand_complex(self, deep=True, **hints): + re_part, im_part = self.as_real_imag(deep=deep, **hints) + return re_part + im_part*I + + def _eval_expand_trig(self, deep=True, **hints): + if deep: + arg = self.args[0].expand(deep, **hints) + else: + arg = self.args[0] + x = None + if arg.is_Add: # TODO, implement more if deep stuff here + x, y = arg.as_two_terms() + else: + coeff, terms = arg.as_coeff_Mul(rational=True) + if coeff is not S.One and coeff.is_Integer and terms is not S.One: + x = terms + y = (coeff - 1)*x + if x is not None: + return (cosh(x)*cosh(y) + sinh(x)*sinh(y)).expand(trig=True) + return cosh(arg) + + def _eval_rewrite_as_tractable(self, arg, limitvar=None, **kwargs): + return (exp(arg) + exp(-arg)) / 2 + + def _eval_rewrite_as_exp(self, arg, **kwargs): + return (exp(arg) + exp(-arg)) / 2 + + def _eval_rewrite_as_cos(self, arg, **kwargs): + return cos(I * arg, evaluate=False) + + def _eval_rewrite_as_sec(self, arg, **kwargs): + return 1 / sec(I * arg, evaluate=False) + + def _eval_rewrite_as_sinh(self, arg, **kwargs): + return -I*sinh(arg + pi*I/2, evaluate=False) + + def _eval_rewrite_as_tanh(self, arg, **kwargs): + tanh_half = tanh(S.Half*arg)**2 + return (1 + tanh_half)/(1 - tanh_half) + + def _eval_rewrite_as_coth(self, arg, **kwargs): + coth_half = coth(S.Half*arg)**2 + return (coth_half + 1)/(coth_half - 1) + + def _eval_rewrite_as_sech(self, arg, **kwargs): + return 1 / sech(arg) + + def _eval_as_leading_term(self, x, logx, cdir): + arg = self.args[0].as_leading_term(x, logx=logx, cdir=cdir) + arg0 = arg.subs(x, 0) + + if arg0 is S.NaN: + arg0 
= arg.limit(x, 0, dir='-' if cdir.is_negative else '+') + if arg0.is_zero: + return S.One + elif arg0.is_finite: + return self.func(arg0) + else: + return self + + def _eval_is_real(self): + arg = self.args[0] + + # `cosh(x)` is real for real OR purely imaginary `x` + if arg.is_real or arg.is_imaginary: + return True + + # cosh(a+ib) = cos(b)*cosh(a) + i*sin(b)*sinh(a) + # the imaginary part can be an expression like n*pi + # if not, check if the imaginary part is a number + re, im = arg.as_real_imag() + return (im%pi).is_zero + + def _eval_is_positive(self): + # cosh(x+I*y) = cos(y)*cosh(x) + I*sin(y)*sinh(x) + # cosh(z) is positive iff it is real and the real part is positive. + # So we need sin(y)*sinh(x) = 0 which gives x=0 or y=n*pi + # Case 1 (y=n*pi): cosh(z) = (-1)**n * cosh(x) -> positive for n even + # Case 2 (x=0): cosh(z) = cos(y) -> positive when cos(y) is positive + z = self.args[0] + + x, y = z.as_real_imag() + ymod = y % (2*pi) + + yzero = ymod.is_zero + # shortcut if ymod is zero + if yzero: + return True + + xzero = x.is_zero + # shortcut x is not zero + if xzero is False: + return yzero + + return fuzzy_or([ + # Case 1: + yzero, + # Case 2: + fuzzy_and([ + xzero, + fuzzy_or([ymod < pi/2, ymod > 3*pi/2]) + ]) + ]) + + + def _eval_is_nonnegative(self): + z = self.args[0] + + x, y = z.as_real_imag() + ymod = y % (2*pi) + + yzero = ymod.is_zero + # shortcut if ymod is zero + if yzero: + return True + + xzero = x.is_zero + # shortcut x is not zero + if xzero is False: + return yzero + + return fuzzy_or([ + # Case 1: + yzero, + # Case 2: + fuzzy_and([ + xzero, + fuzzy_or([ymod <= pi/2, ymod >= 3*pi/2]) + ]) + ]) + + def _eval_is_finite(self): + arg = self.args[0] + return arg.is_finite + + def _eval_is_zero(self): + rest, ipi_mult = _peeloff_ipi(self.args[0]) + if ipi_mult and rest.is_zero: + return (ipi_mult - S.Half).is_integer + + +class tanh(HyperbolicFunction): + r""" + ``tanh(x)`` is the hyperbolic tangent of ``x``. + + The hyperbolic tangent function is $\frac{\sinh(x)}{\cosh(x)}$. + + Examples + ======== + + >>> from sympy import tanh + >>> from sympy.abc import x + >>> tanh(x) + tanh(x) + + See Also + ======== + + sinh, cosh, atanh + """ + + def fdiff(self, argindex=1): + if argindex == 1: + return S.One - tanh(self.args[0])**2 + else: + raise ArgumentIndexError(self, argindex) + + def inverse(self, argindex=1): + """ + Returns the inverse of this function. 
+ """ + return atanh + + @classmethod + def eval(cls, arg): + if arg.is_Number: + if arg is S.NaN: + return S.NaN + elif arg is S.Infinity: + return S.One + elif arg is S.NegativeInfinity: + return S.NegativeOne + elif arg.is_zero: + return S.Zero + elif arg.is_negative: + return -cls(-arg) + else: + if arg is S.ComplexInfinity: + return S.NaN + + i_coeff = _imaginary_unit_as_coefficient(arg) + + if i_coeff is not None: + if i_coeff.could_extract_minus_sign(): + return -I * tan(-i_coeff) + return I * tan(i_coeff) + else: + if arg.could_extract_minus_sign(): + return -cls(-arg) + + if arg.is_Add: + x, m = _peeloff_ipi(arg) + if m: + tanhm = tanh(m*pi*I) + if tanhm is S.ComplexInfinity: + return coth(x) + else: # tanhm == 0 + return tanh(x) + + if arg.is_zero: + return S.Zero + + if arg.func == asinh: + x = arg.args[0] + return x/sqrt(1 + x**2) + + if arg.func == acosh: + x = arg.args[0] + return sqrt(x - 1) * sqrt(x + 1) / x + + if arg.func == atanh: + return arg.args[0] + + if arg.func == acoth: + return 1/arg.args[0] + + @staticmethod + @cacheit + def taylor_term(n, x, *previous_terms): + if n < 0 or n % 2 == 0: + return S.Zero + else: + x = sympify(x) + + a = 2**(n + 1) + + B = bernoulli(n + 1) + F = factorial(n + 1) + + return a*(a - 1) * B/F * x**n + + def _eval_conjugate(self): + return self.func(self.args[0].conjugate()) + + def as_real_imag(self, deep=True, **hints): + if self.args[0].is_extended_real: + if deep: + hints['complex'] = False + return (self.expand(deep, **hints), S.Zero) + else: + return (self, S.Zero) + if deep: + re, im = self.args[0].expand(deep, **hints).as_real_imag() + else: + re, im = self.args[0].as_real_imag() + denom = sinh(re)**2 + cos(im)**2 + return (sinh(re)*cosh(re)/denom, sin(im)*cos(im)/denom) + + def _eval_expand_trig(self, **hints): + arg = self.args[0] + if arg.is_Add: + n = len(arg.args) + TX = [tanh(x, evaluate=False)._eval_expand_trig() + for x in arg.args] + p = [0, 0] # [den, num] + for i in range(n + 1): + p[i % 2] += symmetric_poly(i, TX) + return p[1]/p[0] + elif arg.is_Mul: + coeff, terms = arg.as_coeff_Mul() + if coeff.is_Integer and coeff > 1: + T = tanh(terms) + n = [nC(range(coeff), k)*T**k for k in range(1, coeff + 1, 2)] + d = [nC(range(coeff), k)*T**k for k in range(0, coeff + 1, 2)] + return Add(*n)/Add(*d) + return tanh(arg) + + def _eval_rewrite_as_tractable(self, arg, limitvar=None, **kwargs): + neg_exp, pos_exp = exp(-arg), exp(arg) + return (pos_exp - neg_exp)/(pos_exp + neg_exp) + + def _eval_rewrite_as_exp(self, arg, **kwargs): + neg_exp, pos_exp = exp(-arg), exp(arg) + return (pos_exp - neg_exp)/(pos_exp + neg_exp) + + def _eval_rewrite_as_tan(self, arg, **kwargs): + return -I * tan(I * arg, evaluate=False) + + def _eval_rewrite_as_cot(self, arg, **kwargs): + return -I / cot(I * arg, evaluate=False) + + def _eval_rewrite_as_sinh(self, arg, **kwargs): + return I*sinh(arg)/sinh(pi*I/2 - arg, evaluate=False) + + def _eval_rewrite_as_cosh(self, arg, **kwargs): + return I*cosh(pi*I/2 - arg, evaluate=False)/cosh(arg) + + def _eval_rewrite_as_coth(self, arg, **kwargs): + return 1/coth(arg) + + def _eval_as_leading_term(self, x, logx, cdir): + from sympy.series.order import Order + arg = self.args[0].as_leading_term(x) + + if x in arg.free_symbols and Order(1, x).contains(arg): + return arg + else: + return self.func(arg) + + def _eval_is_real(self): + arg = self.args[0] + if arg.is_real: + return True + + re, im = arg.as_real_imag() + + # if denom = 0, tanh(arg) = zoo + if re == 0 and im % pi == pi/2: + return None + + # check if 
im is of the form n*pi/2 to make sin(2*im) = 0 + # if not, im could be a number, return False in that case + return (im % (pi/2)).is_zero + + def _eval_is_extended_real(self): + if self.args[0].is_extended_real: + return True + + def _eval_is_positive(self): + if self.args[0].is_extended_real: + return self.args[0].is_positive + + def _eval_is_negative(self): + if self.args[0].is_extended_real: + return self.args[0].is_negative + + def _eval_is_finite(self): + arg = self.args[0] + + re, im = arg.as_real_imag() + denom = cos(im)**2 + sinh(re)**2 + if denom == 0: + return False + elif denom.is_number: + return True + if arg.is_extended_real: + return True + + def _eval_is_zero(self): + arg = self.args[0] + if arg.is_zero: + return True + + +class coth(HyperbolicFunction): + r""" + ``coth(x)`` is the hyperbolic cotangent of ``x``. + + The hyperbolic cotangent function is $\frac{\cosh(x)}{\sinh(x)}$. + + Examples + ======== + + >>> from sympy import coth + >>> from sympy.abc import x + >>> coth(x) + coth(x) + + See Also + ======== + + sinh, cosh, acoth + """ + + def fdiff(self, argindex=1): + if argindex == 1: + return -1/sinh(self.args[0])**2 + else: + raise ArgumentIndexError(self, argindex) + + def inverse(self, argindex=1): + """ + Returns the inverse of this function. + """ + return acoth + + @classmethod + def eval(cls, arg): + if arg.is_Number: + if arg is S.NaN: + return S.NaN + elif arg is S.Infinity: + return S.One + elif arg is S.NegativeInfinity: + return S.NegativeOne + elif arg.is_zero: + return S.ComplexInfinity + elif arg.is_negative: + return -cls(-arg) + else: + if arg is S.ComplexInfinity: + return S.NaN + + i_coeff = _imaginary_unit_as_coefficient(arg) + + if i_coeff is not None: + if i_coeff.could_extract_minus_sign(): + return I * cot(-i_coeff) + return -I * cot(i_coeff) + else: + if arg.could_extract_minus_sign(): + return -cls(-arg) + + if arg.is_Add: + x, m = _peeloff_ipi(arg) + if m: + cothm = coth(m*pi*I) + if cothm is S.ComplexInfinity: + return coth(x) + else: # cothm == 0 + return tanh(x) + + if arg.is_zero: + return S.ComplexInfinity + + if arg.func == asinh: + x = arg.args[0] + return sqrt(1 + x**2)/x + + if arg.func == acosh: + x = arg.args[0] + return x/(sqrt(x - 1) * sqrt(x + 1)) + + if arg.func == atanh: + return 1/arg.args[0] + + if arg.func == acoth: + return arg.args[0] + + @staticmethod + @cacheit + def taylor_term(n, x, *previous_terms): + if n == 0: + return 1 / sympify(x) + elif n < 0 or n % 2 == 0: + return S.Zero + else: + x = sympify(x) + + B = bernoulli(n + 1) + F = factorial(n + 1) + + return 2**(n + 1) * B/F * x**n + + def _eval_conjugate(self): + return self.func(self.args[0].conjugate()) + + def as_real_imag(self, deep=True, **hints): + from sympy.functions.elementary.trigonometric import (cos, sin) + if self.args[0].is_extended_real: + if deep: + hints['complex'] = False + return (self.expand(deep, **hints), S.Zero) + else: + return (self, S.Zero) + if deep: + re, im = self.args[0].expand(deep, **hints).as_real_imag() + else: + re, im = self.args[0].as_real_imag() + denom = sinh(re)**2 + sin(im)**2 + return (sinh(re)*cosh(re)/denom, -sin(im)*cos(im)/denom) + + def _eval_rewrite_as_tractable(self, arg, limitvar=None, **kwargs): + neg_exp, pos_exp = exp(-arg), exp(arg) + return (pos_exp + neg_exp)/(pos_exp - neg_exp) + + def _eval_rewrite_as_exp(self, arg, **kwargs): + neg_exp, pos_exp = exp(-arg), exp(arg) + return (pos_exp + neg_exp)/(pos_exp - neg_exp) + + def _eval_rewrite_as_sinh(self, arg, **kwargs): + return -I*sinh(pi*I/2 - arg, 
evaluate=False)/sinh(arg) + + def _eval_rewrite_as_cosh(self, arg, **kwargs): + return -I*cosh(arg)/cosh(pi*I/2 - arg, evaluate=False) + + def _eval_rewrite_as_tanh(self, arg, **kwargs): + return 1/tanh(arg) + + def _eval_is_positive(self): + if self.args[0].is_extended_real: + return self.args[0].is_positive + + def _eval_is_negative(self): + if self.args[0].is_extended_real: + return self.args[0].is_negative + + def _eval_as_leading_term(self, x, logx, cdir): + from sympy.series.order import Order + arg = self.args[0].as_leading_term(x) + + if x in arg.free_symbols and Order(1, x).contains(arg): + return 1/arg + else: + return self.func(arg) + + def _eval_expand_trig(self, **hints): + arg = self.args[0] + if arg.is_Add: + CX = [coth(x, evaluate=False)._eval_expand_trig() for x in arg.args] + p = [[], []] + n = len(arg.args) + for i in range(n, -1, -1): + p[(n - i) % 2].append(symmetric_poly(i, CX)) + return Add(*p[0])/Add(*p[1]) + elif arg.is_Mul: + coeff, x = arg.as_coeff_Mul(rational=True) + if coeff.is_Integer and coeff > 1: + c = coth(x, evaluate=False) + p = [[], []] + for i in range(coeff, -1, -1): + p[(coeff - i) % 2].append(binomial(coeff, i)*c**i) + return Add(*p[0])/Add(*p[1]) + return coth(arg) + + +class ReciprocalHyperbolicFunction(HyperbolicFunction): + """Base class for reciprocal functions of hyperbolic functions. """ + + #To be defined in class + _reciprocal_of = None + _is_even: FuzzyBool = None + _is_odd: FuzzyBool = None + + @classmethod + def eval(cls, arg): + if arg.could_extract_minus_sign(): + if cls._is_even: + return cls(-arg) + if cls._is_odd: + return -cls(-arg) + + t = cls._reciprocal_of.eval(arg) + if hasattr(arg, 'inverse') and arg.inverse() == cls: + return arg.args[0] + return 1/t if t is not None else t + + def _call_reciprocal(self, method_name, *args, **kwargs): + # Calls method_name on _reciprocal_of + o = self._reciprocal_of(self.args[0]) + return getattr(o, method_name)(*args, **kwargs) + + def _calculate_reciprocal(self, method_name, *args, **kwargs): + # If calling method_name on _reciprocal_of returns a value != None + # then return the reciprocal of that value + t = self._call_reciprocal(method_name, *args, **kwargs) + return 1/t if t is not None else t + + def _rewrite_reciprocal(self, method_name, arg): + # Special handling for rewrite functions. 
If reciprocal rewrite returns + # unmodified expression, then return None + t = self._call_reciprocal(method_name, arg) + if t is not None and t != self._reciprocal_of(arg): + return 1/t + + def _eval_rewrite_as_exp(self, arg, **kwargs): + return self._rewrite_reciprocal("_eval_rewrite_as_exp", arg) + + def _eval_rewrite_as_tractable(self, arg, limitvar=None, **kwargs): + return self._rewrite_reciprocal("_eval_rewrite_as_tractable", arg) + + def _eval_rewrite_as_tanh(self, arg, **kwargs): + return self._rewrite_reciprocal("_eval_rewrite_as_tanh", arg) + + def _eval_rewrite_as_coth(self, arg, **kwargs): + return self._rewrite_reciprocal("_eval_rewrite_as_coth", arg) + + def as_real_imag(self, deep = True, **hints): + return (1 / self._reciprocal_of(self.args[0])).as_real_imag(deep, **hints) + + def _eval_conjugate(self): + return self.func(self.args[0].conjugate()) + + def _eval_expand_complex(self, deep=True, **hints): + re_part, im_part = self.as_real_imag(deep=True, **hints) + return re_part + I*im_part + + def _eval_expand_trig(self, **hints): + return self._calculate_reciprocal("_eval_expand_trig", **hints) + + def _eval_as_leading_term(self, x, logx, cdir): + return (1/self._reciprocal_of(self.args[0]))._eval_as_leading_term(x, logx=logx, cdir=cdir) + + def _eval_is_extended_real(self): + return self._reciprocal_of(self.args[0]).is_extended_real + + def _eval_is_finite(self): + return (1/self._reciprocal_of(self.args[0])).is_finite + + +class csch(ReciprocalHyperbolicFunction): + r""" + ``csch(x)`` is the hyperbolic cosecant of ``x``. + + The hyperbolic cosecant function is $\frac{2}{e^x - e^{-x}}$ + + Examples + ======== + + >>> from sympy import csch + >>> from sympy.abc import x + >>> csch(x) + csch(x) + + See Also + ======== + + sinh, cosh, tanh, sech, asinh, acosh + """ + + _reciprocal_of = sinh + _is_odd = True + + def fdiff(self, argindex=1): + """ + Returns the first derivative of this function + """ + if argindex == 1: + return -coth(self.args[0]) * csch(self.args[0]) + else: + raise ArgumentIndexError(self, argindex) + + @staticmethod + @cacheit + def taylor_term(n, x, *previous_terms): + """ + Returns the next term in the Taylor series expansion + """ + if n == 0: + return 1/sympify(x) + elif n < 0 or n % 2 == 0: + return S.Zero + else: + x = sympify(x) + + B = bernoulli(n + 1) + F = factorial(n + 1) + + return 2 * (1 - 2**n) * B/F * x**n + + def _eval_rewrite_as_sin(self, arg, **kwargs): + return I / sin(I * arg, evaluate=False) + + def _eval_rewrite_as_csc(self, arg, **kwargs): + return I * csc(I * arg, evaluate=False) + + def _eval_rewrite_as_cosh(self, arg, **kwargs): + return I / cosh(arg + I * pi / 2, evaluate=False) + + def _eval_rewrite_as_sinh(self, arg, **kwargs): + return 1 / sinh(arg) + + def _eval_is_positive(self): + if self.args[0].is_extended_real: + return self.args[0].is_positive + + def _eval_is_negative(self): + if self.args[0].is_extended_real: + return self.args[0].is_negative + + +class sech(ReciprocalHyperbolicFunction): + r""" + ``sech(x)`` is the hyperbolic secant of ``x``. 
+ + The hyperbolic secant function is $\frac{2}{e^x + e^{-x}}$ + + Examples + ======== + + >>> from sympy import sech + >>> from sympy.abc import x + >>> sech(x) + sech(x) + + See Also + ======== + + sinh, cosh, tanh, coth, csch, asinh, acosh + """ + + _reciprocal_of = cosh + _is_even = True + + def fdiff(self, argindex=1): + if argindex == 1: + return - tanh(self.args[0])*sech(self.args[0]) + else: + raise ArgumentIndexError(self, argindex) + + @staticmethod + @cacheit + def taylor_term(n, x, *previous_terms): + if n < 0 or n % 2 == 1: + return S.Zero + else: + x = sympify(x) + return euler(n) / factorial(n) * x**(n) + + def _eval_rewrite_as_cos(self, arg, **kwargs): + return 1 / cos(I * arg, evaluate=False) + + def _eval_rewrite_as_sec(self, arg, **kwargs): + return sec(I * arg, evaluate=False) + + def _eval_rewrite_as_sinh(self, arg, **kwargs): + return I / sinh(arg + I * pi /2, evaluate=False) + + def _eval_rewrite_as_cosh(self, arg, **kwargs): + return 1 / cosh(arg) + + def _eval_is_positive(self): + if self.args[0].is_extended_real: + return True + + +############################################################################### +############################# HYPERBOLIC INVERSES ############################# +############################################################################### + +class InverseHyperbolicFunction(DefinedFunction): + """Base class for inverse hyperbolic functions.""" + + pass + + +class asinh(InverseHyperbolicFunction): + """ + ``asinh(x)`` is the inverse hyperbolic sine of ``x``. + + The inverse hyperbolic sine function. + + Examples + ======== + + >>> from sympy import asinh + >>> from sympy.abc import x + >>> asinh(x).diff(x) + 1/sqrt(x**2 + 1) + >>> asinh(1) + log(1 + sqrt(2)) + + See Also + ======== + + acosh, atanh, sinh + """ + + def fdiff(self, argindex=1): + if argindex == 1: + return 1/sqrt(self.args[0]**2 + 1) + else: + raise ArgumentIndexError(self, argindex) + + @classmethod + def eval(cls, arg): + if arg.is_Number: + if arg is S.NaN: + return S.NaN + elif arg is S.Infinity: + return S.Infinity + elif arg is S.NegativeInfinity: + return S.NegativeInfinity + elif arg.is_zero: + return S.Zero + elif arg is S.One: + return log(sqrt(2) + 1) + elif arg is S.NegativeOne: + return log(sqrt(2) - 1) + elif arg.is_negative: + return -cls(-arg) + else: + if arg is S.ComplexInfinity: + return S.ComplexInfinity + + if arg.is_zero: + return S.Zero + + i_coeff = _imaginary_unit_as_coefficient(arg) + + if i_coeff is not None: + return I * asin(i_coeff) + else: + if arg.could_extract_minus_sign(): + return -cls(-arg) + + if isinstance(arg, sinh) and arg.args[0].is_number: + z = arg.args[0] + if z.is_real: + return z + r, i = match_real_imag(z) + if r is not None and i is not None: + f = floor((i + pi/2)/pi) + m = z - I*pi*f + even = f.is_even + if even is True: + return m + elif even is False: + return -m + + @staticmethod + @cacheit + def taylor_term(n, x, *previous_terms): + if n < 0 or n % 2 == 0: + return S.Zero + else: + x = sympify(x) + if len(previous_terms) >= 2 and n > 2: + p = previous_terms[-2] + return -p * (n - 2)**2/(n*(n - 1)) * x**2 + else: + k = (n - 1) // 2 + R = RisingFactorial(S.Half, k) + F = factorial(k) + return S.NegativeOne**k * R / F * x**n / n + + def _eval_as_leading_term(self, x, logx, cdir): + arg = self.args[0] + x0 = arg.subs(x, 0).cancel() + if x0.is_zero: + return arg.as_leading_term(x) + + if x0 is S.NaN: + expr = self.func(arg.as_leading_term(x)) + if expr.is_finite: + return expr + else: + return self + + # Handling branch 
points + if x0 in (-I, I, S.ComplexInfinity): + return self.rewrite(log)._eval_as_leading_term(x, logx=logx, cdir=cdir) + # Handling points lying on branch cuts (-I*oo, -I) U (I, I*oo) + if (1 + x0**2).is_negative: + ndir = arg.dir(x, cdir if cdir else 1) + if re(ndir).is_positive: + if im(x0).is_negative: + return -self.func(x0) - I*pi + elif re(ndir).is_negative: + if im(x0).is_positive: + return -self.func(x0) + I*pi + else: + return self.rewrite(log)._eval_as_leading_term(x, logx=logx, cdir=cdir) + return self.func(x0) + + def _eval_nseries(self, x, n, logx, cdir=0): # asinh + arg = self.args[0] + arg0 = arg.subs(x, 0) + + # Handling branch points + if arg0 in (I, -I): + return self.rewrite(log)._eval_nseries(x, n, logx=logx, cdir=cdir) + + res = super()._eval_nseries(x, n=n, logx=logx) + if arg0 is S.ComplexInfinity: + return res + + # Handling points lying on branch cuts (-I*oo, -I) U (I, I*oo) + if (1 + arg0**2).is_negative: + ndir = arg.dir(x, cdir if cdir else 1) + if re(ndir).is_positive: + if im(arg0).is_negative: + return -res - I*pi + elif re(ndir).is_negative: + if im(arg0).is_positive: + return -res + I*pi + else: + return self.rewrite(log)._eval_nseries(x, n, logx=logx, cdir=cdir) + return res + + def _eval_rewrite_as_log(self, x, **kwargs): + return log(x + sqrt(x**2 + 1)) + + _eval_rewrite_as_tractable = _eval_rewrite_as_log + + def _eval_rewrite_as_atanh(self, x, **kwargs): + return atanh(x/sqrt(1 + x**2)) + + def _eval_rewrite_as_acosh(self, x, **kwargs): + ix = I*x + return I*(sqrt(1 - ix)/sqrt(ix - 1) * acosh(ix) - pi/2) + + def _eval_rewrite_as_asin(self, x, **kwargs): + return -I * asin(I * x, evaluate=False) + + def _eval_rewrite_as_acos(self, x, **kwargs): + return I * acos(I * x, evaluate=False) - I*pi/2 + + def inverse(self, argindex=1): + """ + Returns the inverse of this function. + """ + return sinh + + def _eval_is_zero(self): + return self.args[0].is_zero + + def _eval_is_extended_real(self): + return self.args[0].is_extended_real + + def _eval_is_finite(self): + return self.args[0].is_finite + + +class acosh(InverseHyperbolicFunction): + """ + ``acosh(x)`` is the inverse hyperbolic cosine of ``x``. + + The inverse hyperbolic cosine function. 
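+ + In logarithmic form, ``acosh(x)`` equals ``log(x + sqrt(x + 1)*sqrt(x - 1))``; see the ``rewrite(log)`` form defined below.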
+ + Examples + ======== + + >>> from sympy import acosh + >>> from sympy.abc import x + >>> acosh(x).diff(x) + 1/(sqrt(x - 1)*sqrt(x + 1)) + >>> acosh(1) + 0 + + See Also + ======== + + asinh, atanh, cosh + """ + + def fdiff(self, argindex=1): + if argindex == 1: + arg = self.args[0] + return 1/(sqrt(arg - 1)*sqrt(arg + 1)) + else: + raise ArgumentIndexError(self, argindex) + + @classmethod + def eval(cls, arg): + if arg.is_Number: + if arg is S.NaN: + return S.NaN + elif arg is S.Infinity: + return S.Infinity + elif arg is S.NegativeInfinity: + return S.Infinity + elif arg.is_zero: + return pi*I / 2 + elif arg is S.One: + return S.Zero + elif arg is S.NegativeOne: + return pi*I + + if arg.is_number: + cst_table = _acosh_table() + + if arg in cst_table: + if arg.is_extended_real: + return cst_table[arg]*I + return cst_table[arg] + + if arg is S.ComplexInfinity: + return S.ComplexInfinity + if arg == I*S.Infinity: + return S.Infinity + I*pi/2 + if arg == -I*S.Infinity: + return S.Infinity - I*pi/2 + + if arg.is_zero: + return pi*I*S.Half + + if isinstance(arg, cosh) and arg.args[0].is_number: + z = arg.args[0] + if z.is_real: + return Abs(z) + r, i = match_real_imag(z) + if r is not None and i is not None: + f = floor(i/pi) + m = z - I*pi*f + even = f.is_even + if even is True: + if r.is_nonnegative: + return m + elif r.is_negative: + return -m + elif even is False: + m -= I*pi + if r.is_nonpositive: + return -m + elif r.is_positive: + return m + + @staticmethod + @cacheit + def taylor_term(n, x, *previous_terms): + if n == 0: + return I*pi/2 + elif n < 0 or n % 2 == 0: + return S.Zero + else: + x = sympify(x) + if len(previous_terms) >= 2 and n > 2: + p = previous_terms[-2] + return p * (n - 2)**2/(n*(n - 1)) * x**2 + else: + k = (n - 1) // 2 + R = RisingFactorial(S.Half, k) + F = factorial(k) + return -R / F * I * x**n / n + + def _eval_as_leading_term(self, x, logx, cdir): + arg = self.args[0] + x0 = arg.subs(x, 0).cancel() + # Handling branch points + if x0 in (-S.One, S.Zero, S.One, S.ComplexInfinity): + return self.rewrite(log)._eval_as_leading_term(x, logx=logx, cdir=cdir) + + if x0 is S.NaN: + expr = self.func(arg.as_leading_term(x)) + if expr.is_finite: + return expr + else: + return self + + # Handling points lying on branch cuts (-oo, 1) + if (x0 - 1).is_negative: + ndir = arg.dir(x, cdir if cdir else 1) + if im(ndir).is_negative: + if (x0 + 1).is_negative: + return self.func(x0) - 2*I*pi + return -self.func(x0) + elif not im(ndir).is_positive: + return self.rewrite(log)._eval_as_leading_term(x, logx=logx, cdir=cdir) + return self.func(x0) + + def _eval_nseries(self, x, n, logx, cdir=0): # acosh + arg = self.args[0] + arg0 = arg.subs(x, 0) + + # Handling branch points + if arg0 in (S.One, S.NegativeOne): + return self.rewrite(log)._eval_nseries(x, n, logx=logx, cdir=cdir) + + res = super()._eval_nseries(x, n=n, logx=logx) + if arg0 is S.ComplexInfinity: + return res + + # Handling points lying on branch cuts (-oo, 1) + if (arg0 - 1).is_negative: + ndir = arg.dir(x, cdir if cdir else 1) + if im(ndir).is_negative: + if (arg0 + 1).is_negative: + return res - 2*I*pi + return -res + elif not im(ndir).is_positive: + return self.rewrite(log)._eval_nseries(x, n, logx=logx, cdir=cdir) + return res + + def _eval_rewrite_as_log(self, x, **kwargs): + return log(x + sqrt(x + 1) * sqrt(x - 1)) + + _eval_rewrite_as_tractable = _eval_rewrite_as_log + + def _eval_rewrite_as_acos(self, x, **kwargs): + return sqrt(x - 1)/sqrt(1 - x) * acos(x) + + def _eval_rewrite_as_asin(self, x, **kwargs): + 
return sqrt(x - 1)/sqrt(1 - x) * (pi/2 - asin(x)) + + def _eval_rewrite_as_asinh(self, x, **kwargs): + return sqrt(x - 1)/sqrt(1 - x) * (pi/2 + I*asinh(I*x, evaluate=False)) + + def _eval_rewrite_as_atanh(self, x, **kwargs): + sxm1 = sqrt(x - 1) + s1mx = sqrt(1 - x) + sx2m1 = sqrt(x**2 - 1) + return (pi/2*sxm1/s1mx*(1 - x * sqrt(1/x**2)) + + sxm1*sqrt(x + 1)/sx2m1 * atanh(sx2m1/x)) + + def inverse(self, argindex=1): + """ + Returns the inverse of this function. + """ + return cosh + + def _eval_is_zero(self): + if (self.args[0] - 1).is_zero: + return True + + def _eval_is_extended_real(self): + return fuzzy_and([self.args[0].is_extended_real, (self.args[0] - 1).is_extended_nonnegative]) + + def _eval_is_finite(self): + return self.args[0].is_finite + + +class atanh(InverseHyperbolicFunction): + """ + ``atanh(x)`` is the inverse hyperbolic tangent of ``x``. + + The inverse hyperbolic tangent function. + + Examples + ======== + + >>> from sympy import atanh + >>> from sympy.abc import x + >>> atanh(x).diff(x) + 1/(1 - x**2) + + See Also + ======== + + asinh, acosh, tanh + """ + + def fdiff(self, argindex=1): + if argindex == 1: + return 1/(1 - self.args[0]**2) + else: + raise ArgumentIndexError(self, argindex) + + @classmethod + def eval(cls, arg): + if arg.is_Number: + if arg is S.NaN: + return S.NaN + elif arg.is_zero: + return S.Zero + elif arg is S.One: + return S.Infinity + elif arg is S.NegativeOne: + return S.NegativeInfinity + elif arg is S.Infinity: + return -I * atan(arg) + elif arg is S.NegativeInfinity: + return I * atan(-arg) + elif arg.is_negative: + return -cls(-arg) + else: + if arg is S.ComplexInfinity: + from sympy.calculus.accumulationbounds import AccumBounds + return I*AccumBounds(-pi/2, pi/2) + + i_coeff = _imaginary_unit_as_coefficient(arg) + + if i_coeff is not None: + return I * atan(i_coeff) + else: + if arg.could_extract_minus_sign(): + return -cls(-arg) + + if arg.is_zero: + return S.Zero + + if isinstance(arg, tanh) and arg.args[0].is_number: + z = arg.args[0] + if z.is_real: + return z + r, i = match_real_imag(z) + if r is not None and i is not None: + f = floor(2*i/pi) + even = f.is_even + m = z - I*f*pi/2 + if even is True: + return m + elif even is False: + return m - I*pi/2 + + @staticmethod + @cacheit + def taylor_term(n, x, *previous_terms): + if n < 0 or n % 2 == 0: + return S.Zero + else: + x = sympify(x) + return x**n / n + + def _eval_as_leading_term(self, x, logx, cdir): + arg = self.args[0] + x0 = arg.subs(x, 0).cancel() + if x0.is_zero: + return arg.as_leading_term(x) + if x0 is S.NaN: + expr = self.func(arg.as_leading_term(x)) + if expr.is_finite: + return expr + else: + return self + + # Handling branch points + if x0 in (-S.One, S.One, S.ComplexInfinity): + return self.rewrite(log)._eval_as_leading_term(x, logx=logx, cdir=cdir) + # Handling points lying on branch cuts (-oo, -1] U [1, oo) + if (1 - x0**2).is_negative: + ndir = arg.dir(x, cdir if cdir else 1) + if im(ndir).is_negative: + if x0.is_negative: + return self.func(x0) - I*pi + elif im(ndir).is_positive: + if x0.is_positive: + return self.func(x0) + I*pi + else: + return self.rewrite(log)._eval_as_leading_term(x, logx=logx, cdir=cdir) + return self.func(x0) + + def _eval_nseries(self, x, n, logx, cdir=0): # atanh + arg = self.args[0] + arg0 = arg.subs(x, 0) + + # Handling branch points + if arg0 in (S.One, S.NegativeOne): + return self.rewrite(log)._eval_nseries(x, n, logx=logx, cdir=cdir) + + res = super()._eval_nseries(x, n=n, logx=logx) + if arg0 is S.ComplexInfinity: + return res + + 
# Handling points lying on branch cuts (-oo, -1] U [1, oo) + if (1 - arg0**2).is_negative: + ndir = arg.dir(x, cdir if cdir else 1) + if im(ndir).is_negative: + if arg0.is_negative: + return res - I*pi + elif im(ndir).is_positive: + if arg0.is_positive: + return res + I*pi + else: + return self.rewrite(log)._eval_nseries(x, n, logx=logx, cdir=cdir) + return res + + def _eval_rewrite_as_log(self, x, **kwargs): + return (log(1 + x) - log(1 - x)) / 2 + + _eval_rewrite_as_tractable = _eval_rewrite_as_log + + def _eval_rewrite_as_asinh(self, x, **kwargs): + f = sqrt(1/(x**2 - 1)) + return (pi*x/(2*sqrt(-x**2)) - + sqrt(-x)*sqrt(1 - x**2)/sqrt(x)*f*asinh(f)) + + def _eval_is_zero(self): + if self.args[0].is_zero: + return True + + def _eval_is_extended_real(self): + return fuzzy_and([self.args[0].is_extended_real, (1 - self.args[0]).is_nonnegative, (self.args[0] + 1).is_nonnegative]) + + def _eval_is_finite(self): + return fuzzy_not(fuzzy_or([(self.args[0] - 1).is_zero, (self.args[0] + 1).is_zero])) + + def _eval_is_imaginary(self): + return self.args[0].is_imaginary + + def inverse(self, argindex=1): + """ + Returns the inverse of this function. + """ + return tanh + + +class acoth(InverseHyperbolicFunction): + """ + ``acoth(x)`` is the inverse hyperbolic cotangent of ``x``. + + The inverse hyperbolic cotangent function. + + Examples + ======== + + >>> from sympy import acoth + >>> from sympy.abc import x + >>> acoth(x).diff(x) + 1/(1 - x**2) + + See Also + ======== + + asinh, acosh, coth + """ + + def fdiff(self, argindex=1): + if argindex == 1: + return 1/(1 - self.args[0]**2) + else: + raise ArgumentIndexError(self, argindex) + + @classmethod + def eval(cls, arg): + if arg.is_Number: + if arg is S.NaN: + return S.NaN + elif arg is S.Infinity: + return S.Zero + elif arg is S.NegativeInfinity: + return S.Zero + elif arg.is_zero: + return pi*I / 2 + elif arg is S.One: + return S.Infinity + elif arg is S.NegativeOne: + return S.NegativeInfinity + elif arg.is_negative: + return -cls(-arg) + else: + if arg is S.ComplexInfinity: + return S.Zero + + i_coeff = _imaginary_unit_as_coefficient(arg) + + if i_coeff is not None: + return -I * acot(i_coeff) + else: + if arg.could_extract_minus_sign(): + return -cls(-arg) + + if arg.is_zero: + return pi*I*S.Half + + @staticmethod + @cacheit + def taylor_term(n, x, *previous_terms): + if n == 0: + return -I*pi/2 + elif n < 0 or n % 2 == 0: + return S.Zero + else: + x = sympify(x) + return x**n / n + + def _eval_as_leading_term(self, x, logx, cdir): + arg = self.args[0] + x0 = arg.subs(x, 0).cancel() + if x0 is S.ComplexInfinity: + return (1/arg).as_leading_term(x) + if x0 is S.NaN: + expr = self.func(arg.as_leading_term(x)) + if expr.is_finite: + return expr + else: + return self + + # Handling branch points + if x0 in (-S.One, S.One, S.Zero): + return self.rewrite(log)._eval_as_leading_term(x, logx=logx, cdir=cdir) + # Handling points lying on branch cuts [-1, 1] + if x0.is_real and (1 - x0**2).is_positive: + ndir = arg.dir(x, cdir if cdir else 1) + if im(ndir).is_negative: + if x0.is_positive: + return self.func(x0) + I*pi + elif im(ndir).is_positive: + if x0.is_negative: + return self.func(x0) - I*pi + else: + return self.rewrite(log)._eval_as_leading_term(x, logx=logx, cdir=cdir) + return self.func(x0) + + def _eval_nseries(self, x, n, logx, cdir=0): # acoth + arg = self.args[0] + arg0 = arg.subs(x, 0) + + # Handling branch points + if arg0 in (S.One, S.NegativeOne): + return self.rewrite(log)._eval_nseries(x, n, logx=logx, cdir=cdir) + + res = 
super()._eval_nseries(x, n=n, logx=logx) + if arg0 is S.ComplexInfinity: + return res + + # Handling points lying on branch cuts [-1, 1] + if arg0.is_real and (1 - arg0**2).is_positive: + ndir = arg.dir(x, cdir if cdir else 1) + if im(ndir).is_negative: + if arg0.is_positive: + return res + I*pi + elif im(ndir).is_positive: + if arg0.is_negative: + return res - I*pi + else: + return self.rewrite(log)._eval_nseries(x, n, logx=logx, cdir=cdir) + return res + + def _eval_rewrite_as_log(self, x, **kwargs): + return (log(1 + 1/x) - log(1 - 1/x)) / 2 + + _eval_rewrite_as_tractable = _eval_rewrite_as_log + + def _eval_rewrite_as_atanh(self, x, **kwargs): + return atanh(1/x) + + def _eval_rewrite_as_asinh(self, x, **kwargs): + return (pi*I/2*(sqrt((x - 1)/x)*sqrt(x/(x - 1)) - sqrt(1 + 1/x)*sqrt(x/(x + 1))) + + x*sqrt(1/x**2)*asinh(sqrt(1/(x**2 - 1)))) + + def inverse(self, argindex=1): + """ + Returns the inverse of this function. + """ + return coth + + def _eval_is_extended_real(self): + return fuzzy_and([self.args[0].is_extended_real, fuzzy_or([(self.args[0] - 1).is_extended_nonnegative, (self.args[0] + 1).is_extended_nonpositive])]) + + def _eval_is_finite(self): + return fuzzy_not(fuzzy_or([(self.args[0] - 1).is_zero, (self.args[0] + 1).is_zero])) + + +class asech(InverseHyperbolicFunction): + """ + ``asech(x)`` is the inverse hyperbolic secant of ``x``. + + The inverse hyperbolic secant function. + + Examples + ======== + + >>> from sympy import asech, sqrt, S + >>> from sympy.abc import x + >>> asech(x).diff(x) + -1/(x*sqrt(1 - x**2)) + >>> asech(1).diff(x) + 0 + >>> asech(1) + 0 + >>> asech(S(2)) + I*pi/3 + >>> asech(-sqrt(2)) + 3*I*pi/4 + >>> asech((sqrt(6) - sqrt(2))) + I*pi/12 + + See Also + ======== + + asinh, atanh, cosh, acoth + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Hyperbolic_function + .. [2] https://dlmf.nist.gov/4.37 + .. 
[3] https://functions.wolfram.com/ElementaryFunctions/ArcSech/ + + """ + + def fdiff(self, argindex=1): + if argindex == 1: + z = self.args[0] + return -1/(z*sqrt(1 - z**2)) + else: + raise ArgumentIndexError(self, argindex) + + @classmethod + def eval(cls, arg): + if arg.is_Number: + if arg is S.NaN: + return S.NaN + elif arg is S.Infinity: + return pi*I / 2 + elif arg is S.NegativeInfinity: + return pi*I / 2 + elif arg.is_zero: + return S.Infinity + elif arg is S.One: + return S.Zero + elif arg is S.NegativeOne: + return pi*I + + if arg.is_number: + cst_table = _asech_table() + + if arg in cst_table: + if arg.is_extended_real: + return cst_table[arg]*I + return cst_table[arg] + + if arg is S.ComplexInfinity: + from sympy.calculus.accumulationbounds import AccumBounds + return I*AccumBounds(-pi/2, pi/2) + + if arg.is_zero: + return S.Infinity + + @staticmethod + @cacheit + def taylor_term(n, x, *previous_terms): + if n == 0: + return log(2 / x) + elif n < 0 or n % 2 == 1: + return S.Zero + else: + x = sympify(x) + if len(previous_terms) > 2 and n > 2: + p = previous_terms[-2] + return p * ((n - 1)*(n-2)) * x**2/(4 * (n//2)**2) + else: + k = n // 2 + R = RisingFactorial(S.Half, k) * n + F = factorial(k) * n // 2 * n // 2 + return -1 * R / F * x**n / 4 + + def _eval_as_leading_term(self, x, logx, cdir): + arg = self.args[0] + x0 = arg.subs(x, 0).cancel() + # Handling branch points + if x0 in (-S.One, S.Zero, S.One, S.ComplexInfinity): + return self.rewrite(log)._eval_as_leading_term(x, logx=logx, cdir=cdir) + + if x0 is S.NaN: + expr = self.func(arg.as_leading_term(x)) + if expr.is_finite: + return expr + else: + return self + + # Handling points lying on branch cuts (-oo, 0] U (1, oo) + if x0.is_negative or (1 - x0).is_negative: + ndir = arg.dir(x, cdir if cdir else 1) + if im(ndir).is_positive: + if x0.is_positive or (x0 + 1).is_negative: + return -self.func(x0) + return self.func(x0) - 2*I*pi + elif not im(ndir).is_negative: + return self.rewrite(log)._eval_as_leading_term(x, logx=logx, cdir=cdir) + return self.func(x0) + + def _eval_nseries(self, x, n, logx, cdir=0): # asech + from sympy.series.order import O + arg = self.args[0] + arg0 = arg.subs(x, 0) + + # Handling branch points + if arg0 is S.One: + t = Dummy('t', positive=True) + ser = asech(S.One - t**2).rewrite(log).nseries(t, 0, 2*n) + arg1 = S.One - self.args[0] + f = arg1.as_leading_term(x) + g = (arg1 - f)/ f + if not g.is_meromorphic(x, 0): # cannot be expanded + return O(1) if n == 0 else O(sqrt(x)) + res1 = sqrt(S.One + g)._eval_nseries(x, n=n, logx=logx) + res = (res1.removeO()*sqrt(f)).expand() + return ser.removeO().subs(t, res).expand().powsimp() + O(x**n, x) + + if arg0 is S.NegativeOne: + t = Dummy('t', positive=True) + ser = asech(S.NegativeOne + t**2).rewrite(log).nseries(t, 0, 2*n) + arg1 = S.One + self.args[0] + f = arg1.as_leading_term(x) + g = (arg1 - f)/ f + if not g.is_meromorphic(x, 0): # cannot be expanded + return O(1) if n == 0 else I*pi + O(sqrt(x)) + res1 = sqrt(S.One + g)._eval_nseries(x, n=n, logx=logx) + res = (res1.removeO()*sqrt(f)).expand() + return ser.removeO().subs(t, res).expand().powsimp() + O(x**n, x) + + res = super()._eval_nseries(x, n=n, logx=logx) + if arg0 is S.ComplexInfinity: + return res + + # Handling points lying on branch cuts (-oo, 0] U (1, oo) + if arg0.is_negative or (1 - arg0).is_negative: + ndir = arg.dir(x, cdir if cdir else 1) + if im(ndir).is_positive: + if arg0.is_positive or (arg0 + 1).is_negative: + return -res + return res - 2*I*pi + elif not im(ndir).is_negative: + 
return self.rewrite(log)._eval_nseries(x, n, logx=logx, cdir=cdir) + return res + + def inverse(self, argindex=1): + """ + Returns the inverse of this function. + """ + return sech + + def _eval_rewrite_as_log(self, arg, **kwargs): + return log(1/arg + sqrt(1/arg - 1) * sqrt(1/arg + 1)) + + _eval_rewrite_as_tractable = _eval_rewrite_as_log + + def _eval_rewrite_as_acosh(self, arg, **kwargs): + return acosh(1/arg) + + def _eval_rewrite_as_asinh(self, arg, **kwargs): + return sqrt(1/arg - 1)/sqrt(1 - 1/arg)*(I*asinh(I/arg, evaluate=False) + + pi*S.Half) + + def _eval_rewrite_as_atanh(self, x, **kwargs): + return (I*pi*(1 - sqrt(x)*sqrt(1/x) - I/2*sqrt(-x)/sqrt(x) - I/2*sqrt(x**2)/sqrt(-x**2)) + + sqrt(1/(x + 1))*sqrt(x + 1)*atanh(sqrt(1 - x**2))) + + def _eval_rewrite_as_acsch(self, x, **kwargs): + return sqrt(1/x - 1)/sqrt(1 - 1/x)*(pi/2 - I*acsch(I*x, evaluate=False)) + + def _eval_is_extended_real(self): + return fuzzy_and([self.args[0].is_extended_real, self.args[0].is_nonnegative, (1 - self.args[0]).is_nonnegative]) + + def _eval_is_finite(self): + return fuzzy_not(self.args[0].is_zero) + + +class acsch(InverseHyperbolicFunction): + """ + ``acsch(x)`` is the inverse hyperbolic cosecant of ``x``. + + The inverse hyperbolic cosecant function. + + Examples + ======== + + >>> from sympy import acsch, sqrt, I + >>> from sympy.abc import x + >>> acsch(x).diff(x) + -1/(x**2*sqrt(1 + x**(-2))) + >>> acsch(1).diff(x) + 0 + >>> acsch(1) + log(1 + sqrt(2)) + >>> acsch(I) + -I*pi/2 + >>> acsch(-2*I) + I*pi/6 + >>> acsch(I*(sqrt(6) - sqrt(2))) + -5*I*pi/12 + + See Also + ======== + + asinh + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Hyperbolic_function + .. [2] https://dlmf.nist.gov/4.37 + .. [3] https://functions.wolfram.com/ElementaryFunctions/ArcCsch/ + + """ + + def fdiff(self, argindex=1): + if argindex == 1: + z = self.args[0] + return -1/(z**2*sqrt(1 + 1/z**2)) + else: + raise ArgumentIndexError(self, argindex) + + @classmethod + def eval(cls, arg): + if arg.is_Number: + if arg is S.NaN: + return S.NaN + elif arg is S.Infinity: + return S.Zero + elif arg is S.NegativeInfinity: + return S.Zero + elif arg.is_zero: + return S.ComplexInfinity + elif arg is S.One: + return log(1 + sqrt(2)) + elif arg is S.NegativeOne: + return - log(1 + sqrt(2)) + + if arg.is_number: + cst_table = _acsch_table() + + if arg in cst_table: + return cst_table[arg]*I + + if arg is S.ComplexInfinity: + return S.Zero + + if arg.is_infinite: + return S.Zero + + if arg.is_zero: + return S.ComplexInfinity + + if arg.could_extract_minus_sign(): + return -cls(-arg) + + @staticmethod + @cacheit + def taylor_term(n, x, *previous_terms): + if n == 0: + return log(2 / x) + elif n < 0 or n % 2 == 1: + return S.Zero + else: + x = sympify(x) + if len(previous_terms) > 2 and n > 2: + p = previous_terms[-2] + return -p * ((n - 1)*(n-2)) * x**2/(4 * (n//2)**2) + else: + k = n // 2 + R = RisingFactorial(S.Half, k) * n + F = factorial(k) * n // 2 * n // 2 + return S.NegativeOne**(k +1) * R / F * x**n / 4 + + def _eval_as_leading_term(self, x, logx, cdir): + arg = self.args[0] + x0 = arg.subs(x, 0).cancel() + # Handling branch points + if x0 in (-I, I, S.Zero): + return self.rewrite(log)._eval_as_leading_term(x, logx=logx, cdir=cdir) + + if x0 is S.NaN: + expr = self.func(arg.as_leading_term(x)) + if expr.is_finite: + return expr + else: + return self + + if x0 is S.ComplexInfinity: + return (1/arg).as_leading_term(x) + # Handling points lying on branch cuts (-I, I) + if x0.is_imaginary and (1 + 
x0**2).is_positive: + ndir = arg.dir(x, cdir if cdir else 1) + if re(ndir).is_positive: + if im(x0).is_positive: + return -self.func(x0) - I*pi + elif re(ndir).is_negative: + if im(x0).is_negative: + return -self.func(x0) + I*pi + else: + return self.rewrite(log)._eval_as_leading_term(x, logx=logx, cdir=cdir) + return self.func(x0) + + def _eval_nseries(self, x, n, logx, cdir=0): # acsch + from sympy.series.order import O + arg = self.args[0] + arg0 = arg.subs(x, 0) + + # Handling branch points + if arg0 is I: + t = Dummy('t', positive=True) + ser = acsch(I + t**2).rewrite(log).nseries(t, 0, 2*n) + arg1 = -I + self.args[0] + f = arg1.as_leading_term(x) + g = (arg1 - f)/ f + if not g.is_meromorphic(x, 0): # cannot be expanded + return O(1) if n == 0 else -I*pi/2 + O(sqrt(x)) + res1 = sqrt(S.One + g)._eval_nseries(x, n=n, logx=logx) + res = (res1.removeO()*sqrt(f)).expand() + res = ser.removeO().subs(t, res).expand().powsimp() + O(x**n, x) + return res + + if arg0 == S.NegativeOne*I: + t = Dummy('t', positive=True) + ser = acsch(-I + t**2).rewrite(log).nseries(t, 0, 2*n) + arg1 = I + self.args[0] + f = arg1.as_leading_term(x) + g = (arg1 - f)/ f + if not g.is_meromorphic(x, 0): # cannot be expanded + return O(1) if n == 0 else I*pi/2 + O(sqrt(x)) + res1 = sqrt(S.One + g)._eval_nseries(x, n=n, logx=logx) + res = (res1.removeO()*sqrt(f)).expand() + return ser.removeO().subs(t, res).expand().powsimp() + O(x**n, x) + + res = super()._eval_nseries(x, n=n, logx=logx) + if arg0 is S.ComplexInfinity: + return res + + # Handling points lying on branch cuts (-I, I) + if arg0.is_imaginary and (1 + arg0**2).is_positive: + ndir = self.args[0].dir(x, cdir if cdir else 1) + if re(ndir).is_positive: + if im(arg0).is_positive: + return -res - I*pi + elif re(ndir).is_negative: + if im(arg0).is_negative: + return -res + I*pi + else: + return self.rewrite(log)._eval_nseries(x, n, logx=logx, cdir=cdir) + return res + + def inverse(self, argindex=1): + """ + Returns the inverse of this function. 
+ """ + return csch + + def _eval_rewrite_as_log(self, arg, **kwargs): + return log(1/arg + sqrt(1/arg**2 + 1)) + + _eval_rewrite_as_tractable = _eval_rewrite_as_log + + def _eval_rewrite_as_asinh(self, arg, **kwargs): + return asinh(1/arg) + + def _eval_rewrite_as_acosh(self, arg, **kwargs): + return I*(sqrt(1 - I/arg)/sqrt(I/arg - 1)* + acosh(I/arg, evaluate=False) - pi*S.Half) + + def _eval_rewrite_as_atanh(self, arg, **kwargs): + arg2 = arg**2 + arg2p1 = arg2 + 1 + return sqrt(-arg2)/arg*(pi*S.Half - + sqrt(-arg2p1**2)/arg2p1*atanh(sqrt(arg2p1))) + + def _eval_is_zero(self): + return self.args[0].is_infinite + + def _eval_is_extended_real(self): + return self.args[0].is_extended_real + + def _eval_is_finite(self): + return fuzzy_not(self.args[0].is_zero) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/elementary/integers.py b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/elementary/integers.py new file mode 100644 index 0000000000000000000000000000000000000000..d0b58d32399144c39133855475d70c01b70b1a3f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/elementary/integers.py @@ -0,0 +1,710 @@ +from __future__ import annotations + +from sympy.core.basic import Basic +from sympy.core.expr import Expr + +from sympy.core import Add, S +from sympy.core.evalf import get_integer_part, PrecisionExhausted +from sympy.core.function import DefinedFunction +from sympy.core.logic import fuzzy_or, fuzzy_and +from sympy.core.numbers import Integer, int_valued +from sympy.core.relational import Gt, Lt, Ge, Le, Relational, is_eq, is_le, is_lt +from sympy.core.sympify import _sympify +from sympy.functions.elementary.complexes import im, re +from sympy.multipledispatch import dispatch + +############################################################################### +######################### FLOOR and CEILING FUNCTIONS ######################### +############################################################################### + + +class RoundFunction(DefinedFunction): + """Abstract base class for rounding functions.""" + + args: tuple[Expr] + + @classmethod + def eval(cls, arg): + if (v := cls._eval_number(arg)) is not None: + return v + if (v := cls._eval_const_number(arg)) is not None: + return v + + if arg.is_integer or arg.is_finite is False: + return arg + if arg.is_imaginary or (S.ImaginaryUnit*arg).is_real: + i = im(arg) + if not i.has(S.ImaginaryUnit): + return cls(i)*S.ImaginaryUnit + return cls(arg, evaluate=False) + + # Integral, numerical, symbolic part + ipart = npart = spart = S.Zero + + # Extract integral (or complex integral) terms + intof = lambda x: int(x) if int_valued(x) else ( + x if x.is_integer else None) + for t in Add.make_args(arg): + if t.is_imaginary and (i := intof(im(t))) is not None: + ipart += i*S.ImaginaryUnit + elif (i := intof(t)) is not None: + ipart += i + elif t.is_number: + npart += t + else: + spart += t + + if not (npart or spart): + return ipart + + # Evaluate npart numerically if independent of spart + if npart and ( + not spart or + npart.is_real and (spart.is_imaginary or (S.ImaginaryUnit*spart).is_real) or + npart.is_imaginary and spart.is_real): + try: + r, i = get_integer_part( + npart, cls._dir, {}, return_ints=True) + ipart += Integer(r) + Integer(i)*S.ImaginaryUnit + npart = S.Zero + except (PrecisionExhausted, NotImplementedError): + pass + + spart += npart + if not spart: + return ipart + elif spart.is_imaginary or (S.ImaginaryUnit*spart).is_real: + return ipart + cls(im(spart), 
evaluate=False)*S.ImaginaryUnit + elif isinstance(spart, (floor, ceiling)): + return ipart + spart + else: + return ipart + cls(spart, evaluate=False) + + @classmethod + def _eval_number(cls, arg): + raise NotImplementedError() + + def _eval_is_finite(self): + return self.args[0].is_finite + + def _eval_is_real(self): + return self.args[0].is_real + + def _eval_is_integer(self): + return self.args[0].is_real + + +class floor(RoundFunction): + """ + Floor is a univariate function which returns the largest integer + value not greater than its argument. This implementation + generalizes floor to complex numbers by taking the floor of the + real and imaginary parts separately. + + Examples + ======== + + >>> from sympy import floor, E, I, S, Float, Rational + >>> floor(17) + 17 + >>> floor(Rational(23, 10)) + 2 + >>> floor(2*E) + 5 + >>> floor(-Float(0.567)) + -1 + >>> floor(-I/2) + -I + >>> floor(S(5)/2 + 5*I/2) + 2 + 2*I + + See Also + ======== + + sympy.functions.elementary.integers.ceiling + + References + ========== + + .. [1] "Concrete mathematics" by Graham, pp. 87 + .. [2] https://mathworld.wolfram.com/FloorFunction.html + + """ + _dir = -1 + + @classmethod + def _eval_number(cls, arg): + if arg.is_Number: + return arg.floor() + if any(isinstance(i, j) + for i in (arg, -arg) for j in (floor, ceiling)): + return arg + if arg.is_NumberSymbol: + return arg.approximation_interval(Integer)[0] + + @classmethod + def _eval_const_number(cls, arg): + if arg.is_real: + if arg.is_zero: + return S.Zero + if arg.is_positive: + num, den = arg.as_numer_denom() + s = den.is_negative + if s is None: + return None + if s: + num, den = -num, -den + # 0 <= num/den < 1 -> 0 + if is_lt(num, den): + return S.Zero + # 1 <= num/den < 2 -> 1 + if fuzzy_and([is_le(den, num), is_lt(num, 2*den)]): + return S.One + if arg.is_negative: + num, den = arg.as_numer_denom() + s = den.is_negative + if s is None: + return None + if s: + num, den = -num, -den + # -1 <= num/den < 0 -> -1 + if is_le(-den, num): + return S.NegativeOne + # -2 <= num/den < -1 -> -2 + if fuzzy_and([is_le(-2*den, num), is_lt(num, -den)]): + return Integer(-2) + + def _eval_as_leading_term(self, x, logx, cdir): + from sympy.calculus.accumulationbounds import AccumBounds + arg = self.args[0] + arg0 = arg.subs(x, 0) + r = self.subs(x, 0) + if arg0 is S.NaN or isinstance(arg0, AccumBounds): + arg0 = arg.limit(x, 0, dir='-' if re(cdir).is_negative else '+') + r = floor(arg0) + if arg0.is_finite: + if arg0 == r: + ndir = arg.dir(x, cdir=cdir if cdir != 0 else 1) + if ndir.is_negative: + return r - 1 + elif ndir.is_positive: + return r + else: + raise NotImplementedError("Not sure of sign of %s" % ndir) + else: + return r + return arg.as_leading_term(x, logx=logx, cdir=cdir) + + def _eval_nseries(self, x, n, logx, cdir=0): + arg = self.args[0] + arg0 = arg.subs(x, 0) + r = self.subs(x, 0) + if arg0 is S.NaN: + arg0 = arg.limit(x, 0, dir='-' if re(cdir).is_negative else '+') + r = floor(arg0) + if arg0.is_infinite: + from sympy.calculus.accumulationbounds import AccumBounds + from sympy.series.order import Order + s = arg._eval_nseries(x, n, logx, cdir) + o = Order(1, (x, 0)) if n <= 0 else AccumBounds(-1, 0) + return s + o + if arg0 == r: + ndir = arg.dir(x, cdir=cdir if cdir != 0 else 1) + if ndir.is_negative: + return r - 1 + elif ndir.is_positive: + return r + else: + raise NotImplementedError("Not sure of sign of %s" % ndir) + else: + return r + + def _eval_is_negative(self): + return self.args[0].is_negative + + def _eval_is_nonnegative(self): + 
return self.args[0].is_nonnegative + + def _eval_rewrite_as_ceiling(self, arg, **kwargs): + return -ceiling(-arg) + + def _eval_rewrite_as_frac(self, arg, **kwargs): + return arg - frac(arg) + + def __le__(self, other): + other = S(other) + if self.args[0].is_real: + if other.is_integer: + return self.args[0] < other + 1 + if other.is_number and other.is_real: + return self.args[0] < ceiling(other) + if self.args[0] == other and other.is_real: + return S.true + if other is S.Infinity and self.is_finite: + return S.true + + return Le(self, other, evaluate=False) + + def __ge__(self, other): + other = S(other) + if self.args[0].is_real: + if other.is_integer: + return self.args[0] >= other + if other.is_number and other.is_real: + return self.args[0] >= ceiling(other) + if self.args[0] == other and other.is_real and other.is_noninteger: + return S.false + if other is S.NegativeInfinity and self.is_finite: + return S.true + + return Ge(self, other, evaluate=False) + + def __gt__(self, other): + other = S(other) + if self.args[0].is_real: + if other.is_integer: + return self.args[0] >= other + 1 + if other.is_number and other.is_real: + return self.args[0] >= ceiling(other) + if self.args[0] == other and other.is_real: + return S.false + if other is S.NegativeInfinity and self.is_finite: + return S.true + + return Gt(self, other, evaluate=False) + + def __lt__(self, other): + other = S(other) + if self.args[0].is_real: + if other.is_integer: + return self.args[0] < other + if other.is_number and other.is_real: + return self.args[0] < ceiling(other) + if self.args[0] == other and other.is_real and other.is_noninteger: + return S.true + if other is S.Infinity and self.is_finite: + return S.true + + return Lt(self, other, evaluate=False) + + +@dispatch(floor, Expr) +def _eval_is_eq(lhs, rhs): # noqa:F811 + return is_eq(lhs.rewrite(ceiling), rhs) or \ + is_eq(lhs.rewrite(frac),rhs) + + +class ceiling(RoundFunction): + """ + Ceiling is a univariate function which returns the smallest integer + value not less than its argument. This implementation + generalizes ceiling to complex numbers by taking the ceiling of the + real and imaginary parts separately. + + Examples + ======== + + >>> from sympy import ceiling, E, I, S, Float, Rational + >>> ceiling(17) + 17 + >>> ceiling(Rational(23, 10)) + 3 + >>> ceiling(2*E) + 6 + >>> ceiling(-Float(0.567)) + 0 + >>> ceiling(I/2) + I + >>> ceiling(S(5)/2 + 5*I/2) + 3 + 3*I + + See Also + ======== + + sympy.functions.elementary.integers.floor + + References + ========== + + .. [1] "Concrete mathematics" by Graham, pp. 87 + .. 
[2] https://mathworld.wolfram.com/CeilingFunction.html + + """ + _dir = 1 + + @classmethod + def _eval_number(cls, arg): + if arg.is_Number: + return arg.ceiling() + if any(isinstance(i, j) + for i in (arg, -arg) for j in (floor, ceiling)): + return arg + if arg.is_NumberSymbol: + return arg.approximation_interval(Integer)[1] + + @classmethod + def _eval_const_number(cls, arg): + if arg.is_real: + if arg.is_zero: + return S.Zero + if arg.is_positive: + num, den = arg.as_numer_denom() + s = den.is_negative + if s is None: + return None + if s: + num, den = -num, -den + # 0 < num/den <= 1 -> 1 + if is_le(num, den): + return S.One + # 1 < num/den <= 2 -> 2 + if fuzzy_and([is_lt(den, num), is_le(num, 2*den)]): + return Integer(2) + if arg.is_negative: + num, den = arg.as_numer_denom() + s = den.is_negative + if s is None: + return None + if s: + num, den = -num, -den + # -1 < num/den <= 0 -> 0 + if is_lt(-den, num): + return S.Zero + # -2 < num/den <= -1 -> -1 + if fuzzy_and([is_lt(-2*den, num), is_le(num, -den)]): + return S.NegativeOne + + def _eval_as_leading_term(self, x, logx, cdir): + from sympy.calculus.accumulationbounds import AccumBounds + arg = self.args[0] + arg0 = arg.subs(x, 0) + r = self.subs(x, 0) + if arg0 is S.NaN or isinstance(arg0, AccumBounds): + arg0 = arg.limit(x, 0, dir='-' if re(cdir).is_negative else '+') + r = ceiling(arg0) + if arg0.is_finite: + if arg0 == r: + ndir = arg.dir(x, cdir=cdir if cdir != 0 else 1) + if ndir.is_negative: + return r + elif ndir.is_positive: + return r + 1 + else: + raise NotImplementedError("Not sure of sign of %s" % ndir) + else: + return r + return arg.as_leading_term(x, logx=logx, cdir=cdir) + + def _eval_nseries(self, x, n, logx, cdir=0): + arg = self.args[0] + arg0 = arg.subs(x, 0) + r = self.subs(x, 0) + if arg0 is S.NaN: + arg0 = arg.limit(x, 0, dir='-' if re(cdir).is_negative else '+') + r = ceiling(arg0) + if arg0.is_infinite: + from sympy.calculus.accumulationbounds import AccumBounds + from sympy.series.order import Order + s = arg._eval_nseries(x, n, logx, cdir) + o = Order(1, (x, 0)) if n <= 0 else AccumBounds(0, 1) + return s + o + if arg0 == r: + ndir = arg.dir(x, cdir=cdir if cdir != 0 else 1) + if ndir.is_negative: + return r + elif ndir.is_positive: + return r + 1 + else: + raise NotImplementedError("Not sure of sign of %s" % ndir) + else: + return r + + def _eval_rewrite_as_floor(self, arg, **kwargs): + return -floor(-arg) + + def _eval_rewrite_as_frac(self, arg, **kwargs): + return arg + frac(-arg) + + def _eval_is_positive(self): + return self.args[0].is_positive + + def _eval_is_nonpositive(self): + return self.args[0].is_nonpositive + + def __lt__(self, other): + other = S(other) + if self.args[0].is_real: + if other.is_integer: + return self.args[0] <= other - 1 + if other.is_number and other.is_real: + return self.args[0] <= floor(other) + if self.args[0] == other and other.is_real: + return S.false + if other is S.Infinity and self.is_finite: + return S.true + + return Lt(self, other, evaluate=False) + + def __gt__(self, other): + other = S(other) + if self.args[0].is_real: + if other.is_integer: + return self.args[0] > other + if other.is_number and other.is_real: + return self.args[0] > floor(other) + if self.args[0] == other and other.is_real and other.is_noninteger: + return S.true + if other is S.NegativeInfinity and self.is_finite: + return S.true + + return Gt(self, other, evaluate=False) + + def __ge__(self, other): + other = S(other) + if self.args[0].is_real: + if other.is_integer: + return self.args[0] 
> other - 1 + if other.is_number and other.is_real: + return self.args[0] > floor(other) + if self.args[0] == other and other.is_real: + return S.true + if other is S.NegativeInfinity and self.is_finite: + return S.true + + return Ge(self, other, evaluate=False) + + def __le__(self, other): + other = S(other) + if self.args[0].is_real: + if other.is_integer: + return self.args[0] <= other + if other.is_number and other.is_real: + return self.args[0] <= floor(other) + if self.args[0] == other and other.is_real and other.is_noninteger: + return S.false + if other is S.Infinity and self.is_finite: + return S.true + + return Le(self, other, evaluate=False) + + +@dispatch(ceiling, Basic) # type:ignore +def _eval_is_eq(lhs, rhs): # noqa:F811 + return is_eq(lhs.rewrite(floor), rhs) or is_eq(lhs.rewrite(frac),rhs) + + +class frac(DefinedFunction): + r"""Represents the fractional part of x + + For real numbers it is defined [1]_ as + + .. math:: + x - \left\lfloor{x}\right\rfloor + + Examples + ======== + + >>> from sympy import Symbol, frac, Rational, floor, I + >>> frac(Rational(4, 3)) + 1/3 + >>> frac(-Rational(4, 3)) + 2/3 + + returns zero for integer arguments + + >>> n = Symbol('n', integer=True) + >>> frac(n) + 0 + + rewrite as floor + + >>> x = Symbol('x') + >>> frac(x).rewrite(floor) + x - floor(x) + + for complex arguments + + >>> r = Symbol('r', real=True) + >>> t = Symbol('t', real=True) + >>> frac(t + I*r) + I*frac(r) + frac(t) + + See Also + ======== + + sympy.functions.elementary.integers.floor + sympy.functions.elementary.integers.ceiling + + References + =========== + + .. [1] https://en.wikipedia.org/wiki/Fractional_part + .. [2] https://mathworld.wolfram.com/FractionalPart.html + + """ + @classmethod + def eval(cls, arg): + from sympy.calculus.accumulationbounds import AccumBounds + + def _eval(arg): + if arg in (S.Infinity, S.NegativeInfinity): + return AccumBounds(0, 1) + if arg.is_integer: + return S.Zero + if arg.is_number: + if arg is S.NaN: + return S.NaN + elif arg is S.ComplexInfinity: + return S.NaN + else: + return arg - floor(arg) + return cls(arg, evaluate=False) + + real, imag = S.Zero, S.Zero + for t in Add.make_args(arg): + # Two checks are needed for complex arguments + # see issue-7649 for details + if t.is_imaginary or (S.ImaginaryUnit*t).is_real: + i = im(t) + if not i.has(S.ImaginaryUnit): + imag += i + else: + real += t + else: + real += t + + real = _eval(real) + imag = _eval(imag) + return real + S.ImaginaryUnit*imag + + def _eval_rewrite_as_floor(self, arg, **kwargs): + return arg - floor(arg) + + def _eval_rewrite_as_ceiling(self, arg, **kwargs): + return arg + ceiling(-arg) + + def _eval_is_finite(self): + return True + + def _eval_is_real(self): + return self.args[0].is_extended_real + + def _eval_is_imaginary(self): + return self.args[0].is_imaginary + + def _eval_is_integer(self): + return self.args[0].is_integer + + def _eval_is_zero(self): + return fuzzy_or([self.args[0].is_zero, self.args[0].is_integer]) + + def _eval_is_negative(self): + return False + + def __ge__(self, other): + if self.is_extended_real: + other = _sympify(other) + # Check if other <= 0 + if other.is_extended_nonpositive: + return S.true + # Check if other >= 1 + res = self._value_one_or_more(other) + if res is not None: + return not(res) + return Ge(self, other, evaluate=False) + + def __gt__(self, other): + if self.is_extended_real: + other = _sympify(other) + # Check if other < 0 + res = self._value_one_or_more(other) + if res is not None: + return not(res) + # Check if other 
>= 1 + if other.is_extended_negative: + return S.true + return Gt(self, other, evaluate=False) + + def __le__(self, other): + if self.is_extended_real: + other = _sympify(other) + # Check if other < 0 + if other.is_extended_negative: + return S.false + # Check if other >= 1 + res = self._value_one_or_more(other) + if res is not None: + return res + return Le(self, other, evaluate=False) + + def __lt__(self, other): + if self.is_extended_real: + other = _sympify(other) + # Check if other <= 0 + if other.is_extended_nonpositive: + return S.false + # Check if other >= 1 + res = self._value_one_or_more(other) + if res is not None: + return res + return Lt(self, other, evaluate=False) + + def _value_one_or_more(self, other): + if other.is_extended_real: + if other.is_number: + res = other >= 1 + if res and not isinstance(res, Relational): + return S.true + if other.is_integer and other.is_positive: + return S.true + + def _eval_as_leading_term(self, x, logx, cdir): + from sympy.calculus.accumulationbounds import AccumBounds + arg = self.args[0] + arg0 = arg.subs(x, 0) + r = self.subs(x, 0) + + if arg0.is_finite: + if r.is_zero: + ndir = arg.dir(x, cdir=cdir) + if ndir.is_negative: + return S.One + return (arg - arg0).as_leading_term(x, logx=logx, cdir=cdir) + else: + return r + elif arg0 in (S.ComplexInfinity, S.Infinity, S.NegativeInfinity): + return AccumBounds(0, 1) + return arg.as_leading_term(x, logx=logx, cdir=cdir) + + def _eval_nseries(self, x, n, logx, cdir=0): + from sympy.series.order import Order + arg = self.args[0] + arg0 = arg.subs(x, 0) + r = self.subs(x, 0) + + if arg0.is_infinite: + from sympy.calculus.accumulationbounds import AccumBounds + o = Order(1, (x, 0)) if n <= 0 else AccumBounds(0, 1) + Order(x**n, (x, 0)) + return o + else: + res = (arg - arg0)._eval_nseries(x, n, logx=logx, cdir=cdir) + if r.is_zero: + ndir = arg.dir(x, cdir=cdir) + res += S.One if ndir.is_negative else S.Zero + else: + res += r + return res + + +@dispatch(frac, Basic) # type:ignore +def _eval_is_eq(lhs, rhs): # noqa:F811 + if (lhs.rewrite(floor) == rhs) or \ + (lhs.rewrite(ceiling) == rhs): + return True + # Check if other < 0 + if rhs.is_extended_negative: + return False + # Check if other >= 1 + res = lhs._value_one_or_more(rhs) + if res is not None: + return False diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/elementary/miscellaneous.py b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/elementary/miscellaneous.py new file mode 100644 index 0000000000000000000000000000000000000000..c7f3016bc7ea0d5c4ad778cf9922c941acb7fc44 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/elementary/miscellaneous.py @@ -0,0 +1,915 @@ +from sympy.core import S, sympify, NumberKind +from sympy.utilities.iterables import sift +from sympy.core.add import Add +from sympy.core.containers import Tuple +from sympy.core.operations import LatticeOp, ShortCircuit +from sympy.core.function import (Application, Lambda, + ArgumentIndexError, DefinedFunction) +from sympy.core.expr import Expr +from sympy.core.exprtools import factor_terms +from sympy.core.mod import Mod +from sympy.core.mul import Mul +from sympy.core.numbers import Rational +from sympy.core.power import Pow +from sympy.core.relational import Eq, Relational +from sympy.core.singleton import Singleton +from sympy.core.sorting import ordered +from sympy.core.symbol import Dummy +from sympy.core.rules import Transform +from sympy.core.logic import fuzzy_and, fuzzy_or, _torf +from 
sympy.core.traversal import walk +from sympy.core.numbers import Integer +from sympy.logic.boolalg import And, Or + + +def _minmax_as_Piecewise(op, *args): + # helper for Min/Max rewrite as Piecewise + from sympy.functions.elementary.piecewise import Piecewise + ec = [] + for i, a in enumerate(args): + c = [Relational(a, args[j], op) for j in range(i + 1, len(args))] + ec.append((a, And(*c))) + return Piecewise(*ec) + + +class IdentityFunction(Lambda, metaclass=Singleton): + """ + The identity function + + Examples + ======== + + >>> from sympy import Id, Symbol + >>> x = Symbol('x') + >>> Id(x) + x + + """ + + _symbol = Dummy('x') + + @property + def signature(self): + return Tuple(self._symbol) + + @property + def expr(self): + return self._symbol + + +Id = S.IdentityFunction + +############################################################################### +############################# ROOT and SQUARE ROOT FUNCTION ################### +############################################################################### + + +def sqrt(arg, evaluate=None): + """Returns the principal square root. + + Parameters + ========== + + evaluate : bool, optional + The parameter determines if the expression should be evaluated. + If ``None``, its value is taken from + ``global_parameters.evaluate``. + + Examples + ======== + + >>> from sympy import sqrt, Symbol, S + >>> x = Symbol('x') + + >>> sqrt(x) + sqrt(x) + + >>> sqrt(x)**2 + x + + Note that sqrt(x**2) does not simplify to x. + + >>> sqrt(x**2) + sqrt(x**2) + + This is because the two are not equal to each other in general. + For example, consider x == -1: + + >>> from sympy import Eq + >>> Eq(sqrt(x**2), x).subs(x, -1) + False + + This is because sqrt computes the principal square root, so the square may + put the argument in a different branch. This identity does hold if x is + positive: + + >>> y = Symbol('y', positive=True) + >>> sqrt(y**2) + y + + You can force this simplification by using the powdenest() function with + the force option set to True: + + >>> from sympy import powdenest + >>> sqrt(x**2) + sqrt(x**2) + >>> powdenest(sqrt(x**2), force=True) + x + + To get both branches of the square root you can use the rootof function: + + >>> from sympy import rootof + + >>> [rootof(x**2-3,i) for i in (0,1)] + [-sqrt(3), sqrt(3)] + + Although ``sqrt`` is printed, there is no ``sqrt`` function so looking for + ``sqrt`` in an expression will fail: + + >>> from sympy.utilities.misc import func_name + >>> func_name(sqrt(x)) + 'Pow' + >>> sqrt(x).has(sqrt) + False + + To find ``sqrt`` look for ``Pow`` with an exponent of ``1/2``: + + >>> (x + 1/sqrt(x)).find(lambda i: i.is_Pow and abs(i.exp) is S.Half) + {1/sqrt(x)} + + See Also + ======== + + sympy.polys.rootoftools.rootof, root, real_root + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Square_root + .. [2] https://en.wikipedia.org/wiki/Principal_value + """ + # arg = sympify(arg) is handled by Pow + return Pow(arg, S.Half, evaluate=evaluate) + + +def cbrt(arg, evaluate=None): + """Returns the principal cube root. + + Parameters + ========== + + evaluate : bool, optional + The parameter determines if the expression should be evaluated. + If ``None``, its value is taken from + ``global_parameters.evaluate``. + + Examples + ======== + + >>> from sympy import cbrt, Symbol + >>> x = Symbol('x') + + >>> cbrt(x) + x**(1/3) + + >>> cbrt(x)**3 + x + + Note that cbrt(x**3) does not simplify to x. 
+ + >>> cbrt(x**3) + (x**3)**(1/3) + + This is because the two are not equal to each other in general. + For example, consider `x == -1`: + + >>> from sympy import Eq + >>> Eq(cbrt(x**3), x).subs(x, -1) + False + + This is because cbrt computes the principal cube root, this + identity does hold if `x` is positive: + + >>> y = Symbol('y', positive=True) + >>> cbrt(y**3) + y + + See Also + ======== + + sympy.polys.rootoftools.rootof, root, real_root + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Cube_root + .. [2] https://en.wikipedia.org/wiki/Principal_value + + """ + return Pow(arg, Rational(1, 3), evaluate=evaluate) + + +def root(arg, n, k=0, evaluate=None): + r"""Returns the *k*-th *n*-th root of ``arg``. + + Parameters + ========== + + k : int, optional + Should be an integer in $\{0, 1, ..., n-1\}$. + Defaults to the principal root if $0$. + + evaluate : bool, optional + The parameter determines if the expression should be evaluated. + If ``None``, its value is taken from + ``global_parameters.evaluate``. + + Examples + ======== + + >>> from sympy import root, Rational + >>> from sympy.abc import x, n + + >>> root(x, 2) + sqrt(x) + + >>> root(x, 3) + x**(1/3) + + >>> root(x, n) + x**(1/n) + + >>> root(x, -Rational(2, 3)) + x**(-3/2) + + To get the k-th n-th root, specify k: + + >>> root(-2, 3, 2) + -(-1)**(2/3)*2**(1/3) + + To get all n n-th roots you can use the rootof function. + The following examples show the roots of unity for n + equal 2, 3 and 4: + + >>> from sympy import rootof + + >>> [rootof(x**2 - 1, i) for i in range(2)] + [-1, 1] + + >>> [rootof(x**3 - 1,i) for i in range(3)] + [1, -1/2 - sqrt(3)*I/2, -1/2 + sqrt(3)*I/2] + + >>> [rootof(x**4 - 1,i) for i in range(4)] + [-1, 1, -I, I] + + SymPy, like other symbolic algebra systems, returns the + complex root of negative numbers. This is the principal + root and differs from the text-book result that one might + be expecting. For example, the cube root of -8 does not + come back as -2: + + >>> root(-8, 3) + 2*(-1)**(1/3) + + The real_root function can be used to either make the principal + result real (or simply to return the real root directly): + + >>> from sympy import real_root + >>> real_root(_) + -2 + >>> real_root(-32, 5) + -2 + + Alternatively, the n//2-th n-th root of a negative number can be + computed with root: + + >>> root(-32, 5, 5//2) + -2 + + See Also + ======== + + sympy.polys.rootoftools.rootof + sympy.core.intfunc.integer_nthroot + sqrt, real_root + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Square_root + .. [2] https://en.wikipedia.org/wiki/Real_root + .. [3] https://en.wikipedia.org/wiki/Root_of_unity + .. [4] https://en.wikipedia.org/wiki/Principal_value + .. [5] https://mathworld.wolfram.com/CubeRoot.html + + """ + n = sympify(n) + if k: + return Mul(Pow(arg, S.One/n, evaluate=evaluate), S.NegativeOne**(2*k/n), evaluate=evaluate) + return Pow(arg, 1/n, evaluate=evaluate) + + +def real_root(arg, n=None, evaluate=None): + r"""Return the real *n*'th-root of *arg* if possible. + + Parameters + ========== + + n : int or None, optional + If *n* is ``None``, then all instances of + $(-n)^{1/\text{odd}}$ will be changed to $-n^{1/\text{odd}}$. + This will only create a real root of a principal root. + The presence of other factors may cause the result to not be + real. + + evaluate : bool, optional + The parameter determines if the expression should be evaluated. + If ``None``, its value is taken from + ``global_parameters.evaluate``. 
+ + Examples + ======== + + >>> from sympy import root, real_root + + >>> real_root(-8, 3) + -2 + >>> root(-8, 3) + 2*(-1)**(1/3) + >>> real_root(_) + -2 + + If one creates a non-principal root and applies real_root, the + result will not be real (so use with caution): + + >>> root(-8, 3, 2) + -2*(-1)**(2/3) + >>> real_root(_) + -2*(-1)**(2/3) + + See Also + ======== + + sympy.polys.rootoftools.rootof + sympy.core.intfunc.integer_nthroot + root, sqrt + """ + from sympy.functions.elementary.complexes import Abs, im, sign + from sympy.functions.elementary.piecewise import Piecewise + if n is not None: + return Piecewise( + (root(arg, n, evaluate=evaluate), Or(Eq(n, S.One), Eq(n, S.NegativeOne))), + (Mul(sign(arg), root(Abs(arg), n, evaluate=evaluate), evaluate=evaluate), + And(Eq(im(arg), S.Zero), Eq(Mod(n, 2), S.One))), + (root(arg, n, evaluate=evaluate), True)) + rv = sympify(arg) + n1pow = Transform(lambda x: -(-x.base)**x.exp, + lambda x: + x.is_Pow and + x.base.is_negative and + x.exp.is_Rational and + x.exp.p == 1 and x.exp.q % 2) + return rv.xreplace(n1pow) + +############################################################################### +############################# MINIMUM and MAXIMUM ############################# +############################################################################### + + +class MinMaxBase(Expr, LatticeOp): + def __new__(cls, *args, **assumptions): + from sympy.core.parameters import global_parameters + evaluate = assumptions.pop('evaluate', global_parameters.evaluate) + args = (sympify(arg) for arg in args) + + # first standard filter, for cls.zero and cls.identity + # also reshape Max(a, Max(b, c)) to Max(a, b, c) + + if evaluate: + try: + args = frozenset(cls._new_args_filter(args)) + except ShortCircuit: + return cls.zero + # remove redundant args that are easily identified + args = cls._collapse_arguments(args, **assumptions) + # find local zeros + args = cls._find_localzeros(args, **assumptions) + args = frozenset(args) + + if not args: + return cls.identity + + if len(args) == 1: + return list(args).pop() + + # base creation + obj = Expr.__new__(cls, *ordered(args), **assumptions) + obj._argset = args + return obj + + @classmethod + def _collapse_arguments(cls, args, **assumptions): + """Remove redundant args. 
+ + Examples + ======== + + >>> from sympy import Min, Max + >>> from sympy.abc import a, b, c, d, e + + Any arg in parent that appears in any + parent-like function in any of the flat args + of parent can be removed from that sub-arg: + + >>> Min(a, Max(b, Min(a, c, d))) + Min(a, Max(b, Min(c, d))) + + If the arg of parent appears in an opposite-than parent + function in any of the flat args of parent that function + can be replaced with the arg: + + >>> Min(a, Max(b, Min(c, d, Max(a, e)))) + Min(a, Max(b, Min(a, c, d))) + """ + if not args: + return args + args = list(ordered(args)) + if cls == Min: + other = Max + else: + other = Min + + # find global comparable max of Max and min of Min if a new + # value is being introduced in these args at position 0 of + # the ordered args + if args[0].is_number: + sifted = mins, maxs = [], [] + for i in args: + for v in walk(i, Min, Max): + if v.args[0].is_comparable: + sifted[isinstance(v, Max)].append(v) + small = Min.identity + for i in mins: + v = i.args[0] + if v.is_number and (v < small) == True: + small = v + big = Max.identity + for i in maxs: + v = i.args[0] + if v.is_number and (v > big) == True: + big = v + # at the point when this function is called from __new__, + # there may be more than one numeric arg present since + # local zeros have not been handled yet, so look through + # more than the first arg + if cls == Min: + for arg in args: + if not arg.is_number: + break + if (arg < small) == True: + small = arg + elif cls == Max: + for arg in args: + if not arg.is_number: + break + if (arg > big) == True: + big = arg + T = None + if cls == Min: + if small != Min.identity: + other = Max + T = small + elif big != Max.identity: + other = Min + T = big + if T is not None: + # remove numerical redundancy + for i in range(len(args)): + a = args[i] + if isinstance(a, other): + a0 = a.args[0] + if ((a0 > T) if other == Max else (a0 < T)) == True: + args[i] = cls.identity + + # remove redundant symbolic args + def do(ai, a): + if not isinstance(ai, (Min, Max)): + return ai + cond = a in ai.args + if not cond: + return ai.func(*[do(i, a) for i in ai.args], + evaluate=False) + if isinstance(ai, cls): + return ai.func(*[do(i, a) for i in ai.args if i != a], + evaluate=False) + return a + for i, a in enumerate(args): + args[i + 1:] = [do(ai, a) for ai in args[i + 1:]] + + # factor out common elements as for + # Min(Max(x, y), Max(x, z)) -> Max(x, Min(y, z)) + # and vice versa when swapping Min/Max -- do this only for the + # easy case where all functions contain something in common; + # trying to find some optimal subset of args to modify takes + # too long + + def factor_minmax(args): + is_other = lambda arg: isinstance(arg, other) + other_args, remaining_args = sift(args, is_other, binary=True) + if not other_args: + return args + + # Min(Max(x, y, z), Max(x, y, u, v)) -> {x,y}, ({z}, {u,v}) + arg_sets = [set(arg.args) for arg in other_args] + common = set.intersection(*arg_sets) + if not common: + return args + + new_other_args = list(common) + arg_sets_diff = [arg_set - common for arg_set in arg_sets] + + # If any set is empty after removing common then all can be + # discarded e.g. 
Min(Max(a, b, c), Max(a, b)) -> Max(a, b) + if all(arg_sets_diff): + other_args_diff = [other(*s, evaluate=False) for s in arg_sets_diff] + new_other_args.append(cls(*other_args_diff, evaluate=False)) + + other_args_factored = other(*new_other_args, evaluate=False) + return remaining_args + [other_args_factored] + + if len(args) > 1: + args = factor_minmax(args) + + return args + + @classmethod + def _new_args_filter(cls, arg_sequence): + """ + Generator filtering args. + + first standard filter, for cls.zero and cls.identity. + Also reshape ``Max(a, Max(b, c))`` to ``Max(a, b, c)``, + and check arguments for comparability + """ + for arg in arg_sequence: + # pre-filter, checking comparability of arguments + if not isinstance(arg, Expr) or arg.is_extended_real is False or ( + arg.is_number and + not arg.is_comparable): + raise ValueError("The argument '%s' is not comparable." % arg) + + if arg == cls.zero: + raise ShortCircuit(arg) + elif arg == cls.identity: + continue + elif arg.func == cls: + yield from arg.args + else: + yield arg + + @classmethod + def _find_localzeros(cls, values, **options): + """ + Sequentially allocate values to localzeros. + + When a value is identified as being more extreme than another member it + replaces that member; if this is never true, then the value is simply + appended to the localzeros. + """ + localzeros = set() + for v in values: + is_newzero = True + localzeros_ = list(localzeros) + for z in localzeros_: + if id(v) == id(z): + is_newzero = False + else: + con = cls._is_connected(v, z) + if con: + is_newzero = False + if con is True or con == cls: + localzeros.remove(z) + localzeros.update([v]) + if is_newzero: + localzeros.update([v]) + return localzeros + + @classmethod + def _is_connected(cls, x, y): + """ + Check if x and y are connected somehow. 
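+ + Returns ``True`` when ``x == y``, ``Max`` when ``x >= y`` can be decided, ``Min`` when ``x <= y`` can be decided, and ``False`` when no relation can be determined (for example, for non-real arguments).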
+ """ + for i in range(2): + if x == y: + return True + t, f = Max, Min + for op in "><": + for j in range(2): + try: + if op == ">": + v = x >= y + else: + v = x <= y + except TypeError: + return False # non-real arg + if not v.is_Relational: + return t if v else f + t, f = f, t + x, y = y, x + x, y = y, x # run next pass with reversed order relative to start + # simplification can be expensive, so be conservative + # in what is attempted + x = factor_terms(x - y) + y = S.Zero + + return False + + def _eval_derivative(self, s): + # f(x).diff(s) -> x.diff(s) * f.fdiff(1)(s) + i = 0 + l = [] + for a in self.args: + i += 1 + da = a.diff(s) + if da.is_zero: + continue + try: + df = self.fdiff(i) + except ArgumentIndexError: + df = super().fdiff(i) + l.append(df * da) + return Add(*l) + + def _eval_rewrite_as_Abs(self, *args, **kwargs): + from sympy.functions.elementary.complexes import Abs + s = (args[0] + self.func(*args[1:]))/2 + d = abs(args[0] - self.func(*args[1:]))/2 + return (s + d if isinstance(self, Max) else s - d).rewrite(Abs) + + def evalf(self, n=15, **options): + return self.func(*[a.evalf(n, **options) for a in self.args]) + + def n(self, *args, **kwargs): + return self.evalf(*args, **kwargs) + + _eval_is_algebraic = lambda s: _torf(i.is_algebraic for i in s.args) + _eval_is_antihermitian = lambda s: _torf(i.is_antihermitian for i in s.args) + _eval_is_commutative = lambda s: _torf(i.is_commutative for i in s.args) + _eval_is_complex = lambda s: _torf(i.is_complex for i in s.args) + _eval_is_composite = lambda s: _torf(i.is_composite for i in s.args) + _eval_is_even = lambda s: _torf(i.is_even for i in s.args) + _eval_is_finite = lambda s: _torf(i.is_finite for i in s.args) + _eval_is_hermitian = lambda s: _torf(i.is_hermitian for i in s.args) + _eval_is_imaginary = lambda s: _torf(i.is_imaginary for i in s.args) + _eval_is_infinite = lambda s: _torf(i.is_infinite for i in s.args) + _eval_is_integer = lambda s: _torf(i.is_integer for i in s.args) + _eval_is_irrational = lambda s: _torf(i.is_irrational for i in s.args) + _eval_is_negative = lambda s: _torf(i.is_negative for i in s.args) + _eval_is_noninteger = lambda s: _torf(i.is_noninteger for i in s.args) + _eval_is_nonnegative = lambda s: _torf(i.is_nonnegative for i in s.args) + _eval_is_nonpositive = lambda s: _torf(i.is_nonpositive for i in s.args) + _eval_is_nonzero = lambda s: _torf(i.is_nonzero for i in s.args) + _eval_is_odd = lambda s: _torf(i.is_odd for i in s.args) + _eval_is_polar = lambda s: _torf(i.is_polar for i in s.args) + _eval_is_positive = lambda s: _torf(i.is_positive for i in s.args) + _eval_is_prime = lambda s: _torf(i.is_prime for i in s.args) + _eval_is_rational = lambda s: _torf(i.is_rational for i in s.args) + _eval_is_real = lambda s: _torf(i.is_real for i in s.args) + _eval_is_extended_real = lambda s: _torf(i.is_extended_real for i in s.args) + _eval_is_transcendental = lambda s: _torf(i.is_transcendental for i in s.args) + _eval_is_zero = lambda s: _torf(i.is_zero for i in s.args) + + +class Max(MinMaxBase, Application): + r""" + Return, if possible, the maximum value of the list. + + When number of arguments is equal one, then + return this argument. + + When number of arguments is equal two, then + return, if possible, the value from (a, b) that is $\ge$ the other. + + In common case, when the length of list greater than 2, the task + is more complicated. Return only the arguments, which are greater + than others, if it is possible to determine directional relation. 
+ + If is not possible to determine such a relation, return a partially + evaluated result. + + Assumptions are used to make the decision too. + + Also, only comparable arguments are permitted. + + It is named ``Max`` and not ``max`` to avoid conflicts + with the built-in function ``max``. + + + Examples + ======== + + >>> from sympy import Max, Symbol, oo + >>> from sympy.abc import x, y, z + >>> p = Symbol('p', positive=True) + >>> n = Symbol('n', negative=True) + + >>> Max(x, -2) + Max(-2, x) + >>> Max(x, -2).subs(x, 3) + 3 + >>> Max(p, -2) + p + >>> Max(x, y) + Max(x, y) + >>> Max(x, y) == Max(y, x) + True + >>> Max(x, Max(y, z)) + Max(x, y, z) + >>> Max(n, 8, p, 7, -oo) + Max(8, p) + >>> Max (1, x, oo) + oo + + * Algorithm + + The task can be considered as searching of supremums in the + directed complete partial orders [1]_. + + The source values are sequentially allocated by the isolated subsets + in which supremums are searched and result as Max arguments. + + If the resulted supremum is single, then it is returned. + + The isolated subsets are the sets of values which are only the comparable + with each other in the current set. E.g. natural numbers are comparable with + each other, but not comparable with the `x` symbol. Another example: the + symbol `x` with negative assumption is comparable with a natural number. + + Also there are "least" elements, which are comparable with all others, + and have a zero property (maximum or minimum for all elements). + For example, in case of $\infty$, the allocation operation is terminated + and only this value is returned. + + Assumption: + - if $A > B > C$ then $A > C$ + - if $A = B$ then $B$ can be removed + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Directed_complete_partial_order + .. [2] https://en.wikipedia.org/wiki/Lattice_%28order%29 + + See Also + ======== + + Min : find minimum values + """ + zero = S.Infinity + identity = S.NegativeInfinity + + def fdiff( self, argindex ): + from sympy.functions.special.delta_functions import Heaviside + n = len(self.args) + if 0 < argindex and argindex <= n: + argindex -= 1 + if n == 2: + return Heaviside(self.args[argindex] - self.args[1 - argindex]) + newargs = tuple([self.args[i] for i in range(n) if i != argindex]) + return Heaviside(self.args[argindex] - Max(*newargs)) + else: + raise ArgumentIndexError(self, argindex) + + def _eval_rewrite_as_Heaviside(self, *args, **kwargs): + from sympy.functions.special.delta_functions import Heaviside + return Add(*[j*Mul(*[Heaviside(j - i) for i in args if i!=j]) \ + for j in args]) + + def _eval_rewrite_as_Piecewise(self, *args, **kwargs): + return _minmax_as_Piecewise('>=', *args) + + def _eval_is_positive(self): + return fuzzy_or(a.is_positive for a in self.args) + + def _eval_is_nonnegative(self): + return fuzzy_or(a.is_nonnegative for a in self.args) + + def _eval_is_negative(self): + return fuzzy_and(a.is_negative for a in self.args) + + +class Min(MinMaxBase, Application): + """ + Return, if possible, the minimum value of the list. + It is named ``Min`` and not ``min`` to avoid conflicts + with the built-in function ``min``. 
+ + Examples + ======== + + >>> from sympy import Min, Symbol, oo + >>> from sympy.abc import x, y + >>> p = Symbol('p', positive=True) + >>> n = Symbol('n', negative=True) + + >>> Min(x, -2) + Min(-2, x) + >>> Min(x, -2).subs(x, 3) + -2 + >>> Min(p, -3) + -3 + >>> Min(x, y) + Min(x, y) + >>> Min(n, 8, p, -7, p, oo) + Min(-7, n) + + See Also + ======== + + Max : find maximum values + """ + zero = S.NegativeInfinity + identity = S.Infinity + + def fdiff( self, argindex ): + from sympy.functions.special.delta_functions import Heaviside + n = len(self.args) + if 0 < argindex and argindex <= n: + argindex -= 1 + if n == 2: + return Heaviside( self.args[1-argindex] - self.args[argindex] ) + newargs = tuple([ self.args[i] for i in range(n) if i != argindex]) + return Heaviside( Min(*newargs) - self.args[argindex] ) + else: + raise ArgumentIndexError(self, argindex) + + def _eval_rewrite_as_Heaviside(self, *args, **kwargs): + from sympy.functions.special.delta_functions import Heaviside + return Add(*[j*Mul(*[Heaviside(i-j) for i in args if i!=j]) \ + for j in args]) + + def _eval_rewrite_as_Piecewise(self, *args, **kwargs): + return _minmax_as_Piecewise('<=', *args) + + def _eval_is_positive(self): + return fuzzy_and(a.is_positive for a in self.args) + + def _eval_is_nonnegative(self): + return fuzzy_and(a.is_nonnegative for a in self.args) + + def _eval_is_negative(self): + return fuzzy_or(a.is_negative for a in self.args) + + +class Rem(DefinedFunction): + """Returns the remainder when ``p`` is divided by ``q`` where ``p`` is finite + and ``q`` is not equal to zero. The result, ``p - int(p/q)*q``, has the same sign + as the divisor. + + Parameters + ========== + + p : Expr + Dividend. + + q : Expr + Divisor. + + Notes + ===== + + ``Rem`` corresponds to the ``%`` operator in C. + + Examples + ======== + + >>> from sympy.abc import x, y + >>> from sympy import Rem + >>> Rem(x**3, y) + Rem(x**3, y) + >>> Rem(x**3, y).subs({x: -5, y: 3}) + -2 + + See Also + ======== + + Mod + """ + kind = NumberKind + + @classmethod + def eval(cls, p, q): + """Return the function remainder if both p, q are numbers and q is not + zero. 
+ """ + + if q.is_zero: + raise ZeroDivisionError("Division by zero") + if p is S.NaN or q is S.NaN or p.is_finite is False or q.is_finite is False: + return S.NaN + if p is S.Zero or p in (q, -q) or (p.is_integer and q == 1): + return S.Zero + + if q.is_Number: + if p.is_Number: + return p - Integer(p/q)*q diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/elementary/piecewise.py b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/elementary/piecewise.py new file mode 100644 index 0000000000000000000000000000000000000000..fe4a4d4f57e2c3af170dac994e11782b9ed54b8f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/elementary/piecewise.py @@ -0,0 +1,1517 @@ +from sympy.core import S, diff, Tuple, Dummy, Mul +from sympy.core.basic import Basic, as_Basic +from sympy.core.function import DefinedFunction +from sympy.core.numbers import Rational, NumberSymbol, _illegal +from sympy.core.parameters import global_parameters +from sympy.core.relational import (Lt, Gt, Eq, Ne, Relational, + _canonical, _canonical_coeff) +from sympy.core.sorting import ordered +from sympy.functions.elementary.miscellaneous import Max, Min +from sympy.logic.boolalg import (And, Boolean, distribute_and_over_or, Not, + true, false, Or, ITE, simplify_logic, to_cnf, distribute_or_over_and) +from sympy.utilities.iterables import uniq, sift, common_prefix +from sympy.utilities.misc import filldedent, func_name + +from itertools import product + +Undefined = S.NaN # Piecewise() + +class ExprCondPair(Tuple): + """Represents an expression, condition pair.""" + + def __new__(cls, expr, cond): + expr = as_Basic(expr) + if cond == True: + return Tuple.__new__(cls, expr, true) + elif cond == False: + return Tuple.__new__(cls, expr, false) + elif isinstance(cond, Basic) and cond.has(Piecewise): + cond = piecewise_fold(cond) + if isinstance(cond, Piecewise): + cond = cond.rewrite(ITE) + + if not isinstance(cond, Boolean): + raise TypeError(filldedent(''' + Second argument must be a Boolean, + not `%s`''' % func_name(cond))) + return Tuple.__new__(cls, expr, cond) + + @property + def expr(self): + """ + Returns the expression of this pair. + """ + return self.args[0] + + @property + def cond(self): + """ + Returns the condition of this pair. + """ + return self.args[1] + + @property + def is_commutative(self): + return self.expr.is_commutative + + def __iter__(self): + yield self.expr + yield self.cond + + def _eval_simplify(self, **kwargs): + return self.func(*[a.simplify(**kwargs) for a in self.args]) + + +class Piecewise(DefinedFunction): + """ + Represents a piecewise function. + + Usage: + + Piecewise( (expr,cond), (expr,cond), ... ) + - Each argument is a 2-tuple defining an expression and condition + - The conds are evaluated in turn returning the first that is True. + If any of the evaluated conds are not explicitly False, + e.g. ``x < 1``, the function is returned in symbolic form. + - If the function is evaluated at a place where all conditions are False, + nan will be returned. + - Pairs where the cond is explicitly False, will be removed and no pair + appearing after a True condition will ever be retained. If a single + pair with a True condition remains, it will be returned, even when + evaluation is False. 
+ + Examples + ======== + + >>> from sympy import Piecewise, log, piecewise_fold + >>> from sympy.abc import x, y + >>> f = x**2 + >>> g = log(x) + >>> p = Piecewise((0, x < -1), (f, x <= 1), (g, True)) + >>> p.subs(x,1) + 1 + >>> p.subs(x,5) + log(5) + + Booleans can contain Piecewise elements: + + >>> cond = (x < y).subs(x, Piecewise((2, x < 0), (3, True))); cond + Piecewise((2, x < 0), (3, True)) < y + + The folded version of this results in a Piecewise whose + expressions are Booleans: + + >>> folded_cond = piecewise_fold(cond); folded_cond + Piecewise((2 < y, x < 0), (3 < y, True)) + + When a Boolean containing Piecewise (like cond) or a Piecewise + with Boolean expressions (like folded_cond) is used as a condition, + it is converted to an equivalent :class:`~.ITE` object: + + >>> Piecewise((1, folded_cond)) + Piecewise((1, ITE(x < 0, y > 2, y > 3))) + + When a condition is an ``ITE``, it will be converted to a simplified + Boolean expression: + + >>> piecewise_fold(_) + Piecewise((1, ((x >= 0) | (y > 2)) & ((y > 3) | (x < 0)))) + + See Also + ======== + + piecewise_fold + piecewise_exclusive + ITE + """ + + nargs = None + is_Piecewise = True + + def __new__(cls, *args, **options): + if len(args) == 0: + raise TypeError("At least one (expr, cond) pair expected.") + # (Try to) sympify args first + newargs = [] + for ec in args: + # ec could be a ExprCondPair or a tuple + pair = ExprCondPair(*getattr(ec, 'args', ec)) + cond = pair.cond + if cond is false: + continue + newargs.append(pair) + if cond is true: + break + + eval = options.pop('evaluate', global_parameters.evaluate) + if eval: + r = cls.eval(*newargs) + if r is not None: + return r + elif len(newargs) == 1 and newargs[0].cond == True: + return newargs[0].expr + + return Basic.__new__(cls, *newargs, **options) + + @classmethod + def eval(cls, *_args): + """Either return a modified version of the args or, if no + modifications were made, return None. + + Modifications that are made here: + + 1. relationals are made canonical + 2. any False conditions are dropped + 3. any repeat of a previous condition is ignored + 4. any args past one with a true condition are dropped + + If there are no args left, nan will be returned. + If there is a single arg with a True condition, its + corresponding expression will be returned. + + EXAMPLES + ======== + + >>> from sympy import Piecewise + >>> from sympy.abc import x + >>> cond = -x < -1 + >>> args = [(1, cond), (4, cond), (3, False), (2, True), (5, x < 1)] + >>> Piecewise(*args, evaluate=False) + Piecewise((1, -x < -1), (4, -x < -1), (2, True)) + >>> Piecewise(*args) + Piecewise((1, x > 1), (2, True)) + """ + if not _args: + return Undefined + + if len(_args) == 1 and _args[0][-1] == True: + return _args[0][0] + + newargs = _piecewise_collapse_arguments(_args) + + # some conditions may have been redundant + missing = len(newargs) != len(_args) + # some conditions may have changed + same = all(a == b for a, b in zip(newargs, _args)) + # if either change happened we return the expr with the + # updated args + if not newargs: + raise ValueError(filldedent(''' + There are no conditions (or none that + are not trivially false) to define an + expression.''')) + if missing or not same: + return cls(*newargs) + + def doit(self, **hints): + """ + Evaluate this piecewise function. 
+ """ + newargs = [] + for e, c in self.args: + if hints.get('deep', True): + if isinstance(e, Basic): + newe = e.doit(**hints) + if newe != self: + e = newe + if isinstance(c, Basic): + c = c.doit(**hints) + newargs.append((e, c)) + return self.func(*newargs) + + def _eval_simplify(self, **kwargs): + return piecewise_simplify(self, **kwargs) + + def _eval_as_leading_term(self, x, logx, cdir): + for e, c in self.args: + if c == True or c.subs(x, 0) == True: + return e.as_leading_term(x) + + def _eval_adjoint(self): + return self.func(*[(e.adjoint(), c) for e, c in self.args]) + + def _eval_conjugate(self): + return self.func(*[(e.conjugate(), c) for e, c in self.args]) + + def _eval_derivative(self, x): + return self.func(*[(diff(e, x), c) for e, c in self.args]) + + def _eval_evalf(self, prec): + return self.func(*[(e._evalf(prec), c) for e, c in self.args]) + + def _eval_is_meromorphic(self, x, a): + # Conditions often implicitly assume that the argument is real. + # Hence, there needs to be some check for as_set. + if not a.is_real: + return None + + # Then, scan ExprCondPairs in the given order to find a piece that would contain a, + # possibly as a boundary point. + for e, c in self.args: + cond = c.subs(x, a) + + if cond.is_Relational: + return None + if a in c.as_set().boundary: + return None + # Apply expression if a is an interior point of the domain of e. + if cond: + return e._eval_is_meromorphic(x, a) + + def piecewise_integrate(self, x, **kwargs): + """Return the Piecewise with each expression being + replaced with its antiderivative. To obtain a continuous + antiderivative, use the :func:`~.integrate` function or method. + + Examples + ======== + + >>> from sympy import Piecewise + >>> from sympy.abc import x + >>> p = Piecewise((0, x < 0), (1, x < 1), (2, True)) + >>> p.piecewise_integrate(x) + Piecewise((0, x < 0), (x, x < 1), (2*x, True)) + + Note that this does not give a continuous function, e.g. + at x = 1 the 3rd condition applies and the antiderivative + there is 2*x so the value of the antiderivative is 2: + + >>> anti = _ + >>> anti.subs(x, 1) + 2 + + The continuous derivative accounts for the integral *up to* + the point of interest, however: + + >>> p.integrate(x) + Piecewise((0, x < 0), (x, x < 1), (2*x - 1, True)) + >>> _.subs(x, 1) + 1 + + See Also + ======== + Piecewise._eval_integral + """ + from sympy.integrals import integrate + return self.func(*[(integrate(e, x, **kwargs), c) for e, c in self.args]) + + def _handle_irel(self, x, handler): + """Return either None (if the conditions of self depend only on x) else + a Piecewise expression whose expressions (handled by the handler that + was passed) are paired with the governing x-independent relationals, + e.g. 
Piecewise((A, a(x) & b(y)), (B, c(x) | c(y)) -> + Piecewise( + (handler(Piecewise((A, a(x) & True), (B, c(x) | True)), b(y) & c(y)), + (handler(Piecewise((A, a(x) & True), (B, c(x) | False)), b(y)), + (handler(Piecewise((A, a(x) & False), (B, c(x) | True)), c(y)), + (handler(Piecewise((A, a(x) & False), (B, c(x) | False)), True)) + """ + # identify governing relationals + rel = self.atoms(Relational) + irel = list(ordered([r for r in rel if x not in r.free_symbols + and r not in (S.true, S.false)])) + if irel: + args = {} + exprinorder = [] + for truth in product((1, 0), repeat=len(irel)): + reps = dict(zip(irel, truth)) + # only store the true conditions since the false are implied + # when they appear lower in the Piecewise args + if 1 not in truth: + cond = None # flag this one so it doesn't get combined + else: + andargs = Tuple(*[i for i in reps if reps[i]]) + free = list(andargs.free_symbols) + if len(free) == 1: + from sympy.solvers.inequalities import ( + reduce_inequalities, _solve_inequality) + try: + t = reduce_inequalities(andargs, free[0]) + # ValueError when there are potentially + # nonvanishing imaginary parts + except (ValueError, NotImplementedError): + # at least isolate free symbol on left + t = And(*[_solve_inequality( + a, free[0], linear=True) + for a in andargs]) + else: + t = And(*andargs) + if t is S.false: + continue # an impossible combination + cond = t + expr = handler(self.xreplace(reps)) + if isinstance(expr, self.func) and len(expr.args) == 1: + expr, econd = expr.args[0] + cond = And(econd, True if cond is None else cond) + # the ec pairs are being collected since all possibilities + # are being enumerated, but don't put the last one in since + # its expr might match a previous expression and it + # must appear last in the args + if cond is not None: + args.setdefault(expr, []).append(cond) + # but since we only store the true conditions we must maintain + # the order so that the expression with the most true values + # comes first + exprinorder.append(expr) + # convert collected conditions as args of Or + for k in args: + args[k] = Or(*args[k]) + # take them in the order obtained + args = [(e, args[e]) for e in uniq(exprinorder)] + # add in the last arg + args.append((expr, True)) + return Piecewise(*args) + + def _eval_integral(self, x, _first=True, **kwargs): + """Return the indefinite integral of the + Piecewise such that subsequent substitution of x with a + value will give the value of the integral (not including + the constant of integration) up to that point. To only + integrate the individual parts of Piecewise, use the + ``piecewise_integrate`` method. 
+ + Examples + ======== + + >>> from sympy import Piecewise + >>> from sympy.abc import x + >>> p = Piecewise((0, x < 0), (1, x < 1), (2, True)) + >>> p.integrate(x) + Piecewise((0, x < 0), (x, x < 1), (2*x - 1, True)) + >>> p.piecewise_integrate(x) + Piecewise((0, x < 0), (x, x < 1), (2*x, True)) + + See Also + ======== + Piecewise.piecewise_integrate + """ + from sympy.integrals.integrals import integrate + + if _first: + def handler(ipw): + if isinstance(ipw, self.func): + return ipw._eval_integral(x, _first=False, **kwargs) + else: + return ipw.integrate(x, **kwargs) + irv = self._handle_irel(x, handler) + if irv is not None: + return irv + + # handle a Piecewise from -oo to oo with and no x-independent relationals + # ----------------------------------------------------------------------- + ok, abei = self._intervals(x) + if not ok: + from sympy.integrals.integrals import Integral + return Integral(self, x) # unevaluated + + pieces = [(a, b) for a, b, _, _ in abei] + oo = S.Infinity + done = [(-oo, oo, -1)] + for k, p in enumerate(pieces): + if p == (-oo, oo): + # all undone intervals will get this key + for j, (a, b, i) in enumerate(done): + if i == -1: + done[j] = a, b, k + break # nothing else to consider + N = len(done) - 1 + for j, (a, b, i) in enumerate(reversed(done)): + if i == -1: + j = N - j + done[j: j + 1] = _clip(p, (a, b), k) + done = [(a, b, i) for a, b, i in done if a != b] + + # append an arg if there is a hole so a reference to + # argument -1 will give Undefined + if any(i == -1 for (a, b, i) in done): + abei.append((-oo, oo, Undefined, -1)) + + # return the sum of the intervals + args = [] + sum = None + for a, b, i in done: + anti = integrate(abei[i][-2], x, **kwargs) + if sum is None: + sum = anti + else: + sum = sum.subs(x, a) + e = anti._eval_interval(x, a, x) + if sum.has(*_illegal) or e.has(*_illegal): + sum = anti + else: + sum += e + # see if we know whether b is contained in original + # condition + if b is S.Infinity: + cond = True + elif self.args[abei[i][-1]].cond.subs(x, b) == False: + cond = (x < b) + else: + cond = (x <= b) + args.append((sum, cond)) + return Piecewise(*args) + + def _eval_interval(self, sym, a, b, _first=True): + """Evaluates the function along the sym in a given interval [a, b]""" + # FIXME: Currently complex intervals are not supported. 
A possible + # replacement algorithm, discussed in issue 5227, can be found in the + # following papers; + # http://portal.acm.org/citation.cfm?id=281649 + # http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.70.4127&rep=rep1&type=pdf + + if a is None or b is None: + # In this case, it is just simple substitution + return super()._eval_interval(sym, a, b) + else: + x, lo, hi = map(as_Basic, (sym, a, b)) + + if _first: # get only x-dependent relationals + def handler(ipw): + if isinstance(ipw, self.func): + return ipw._eval_interval(x, lo, hi, _first=None) + else: + return ipw._eval_interval(x, lo, hi) + irv = self._handle_irel(x, handler) + if irv is not None: + return irv + + if (lo < hi) is S.false or ( + lo is S.Infinity or hi is S.NegativeInfinity): + rv = self._eval_interval(x, hi, lo, _first=False) + if isinstance(rv, Piecewise): + rv = Piecewise(*[(-e, c) for e, c in rv.args]) + else: + rv = -rv + return rv + + if (lo < hi) is S.true or ( + hi is S.Infinity or lo is S.NegativeInfinity): + pass + else: + _a = Dummy('lo') + _b = Dummy('hi') + a = lo if lo.is_comparable else _a + b = hi if hi.is_comparable else _b + pos = self._eval_interval(x, a, b, _first=False) + if a == _a and b == _b: + # it's purely symbolic so just swap lo and hi and + # change the sign to get the value for when lo > hi + neg, pos = (-pos.xreplace({_a: hi, _b: lo}), + pos.xreplace({_a: lo, _b: hi})) + else: + # at least one of the bounds was comparable, so allow + # _eval_interval to use that information when computing + # the interval with lo and hi reversed + neg, pos = (-self._eval_interval(x, hi, lo, _first=False), + pos.xreplace({_a: lo, _b: hi})) + + # allow simplification based on ordering of lo and hi + p = Dummy('', positive=True) + if lo.is_Symbol: + pos = pos.xreplace({lo: hi - p}).xreplace({p: hi - lo}) + neg = neg.xreplace({lo: hi + p}).xreplace({p: lo - hi}) + elif hi.is_Symbol: + pos = pos.xreplace({hi: lo + p}).xreplace({p: hi - lo}) + neg = neg.xreplace({hi: lo - p}).xreplace({p: lo - hi}) + # evaluate limits that may have unevaluate Min/Max + touch = lambda _: _.replace( + lambda x: isinstance(x, (Min, Max)), + lambda x: x.func(*x.args)) + neg = touch(neg) + pos = touch(pos) + # assemble return expression; make the first condition be Lt + # b/c then the first expression will look the same whether + # the lo or hi limit is symbolic + if a == _a: # the lower limit was symbolic + rv = Piecewise( + (pos, + lo < hi), + (neg, + True)) + else: + rv = Piecewise( + (neg, + hi < lo), + (pos, + True)) + + if rv == Undefined: + raise ValueError("Can't integrate across undefined region.") + if any(isinstance(i, Piecewise) for i in (pos, neg)): + rv = piecewise_fold(rv) + return rv + + # handle a Piecewise with lo <= hi and no x-independent relationals + # ----------------------------------------------------------------- + ok, abei = self._intervals(x) + if not ok: + from sympy.integrals.integrals import Integral + # not being able to do the interval of f(x) can + # be stated as not being able to do the integral + # of f'(x) over the same range + return Integral(self.diff(x), (x, lo, hi)) # unevaluated + + pieces = [(a, b) for a, b, _, _ in abei] + done = [(lo, hi, -1)] + oo = S.Infinity + for k, p in enumerate(pieces): + if p[:2] == (-oo, oo): + # all undone intervals will get this key + for j, (a, b, i) in enumerate(done): + if i == -1: + done[j] = a, b, k + break # nothing else to consider + N = len(done) - 1 + for j, (a, b, i) in enumerate(reversed(done)): + if i == -1: + j = N - j + done[j: j + 1] 
= _clip(p, (a, b), k) + done = [(a, b, i) for a, b, i in done if a != b] + + # return the sum of the intervals + sum = S.Zero + upto = None + for a, b, i in done: + if i == -1: + if upto is None: + return Undefined + # TODO simplify hi <= upto + return Piecewise((sum, hi <= upto), (Undefined, True)) + sum += abei[i][-2]._eval_interval(x, a, b) + upto = b + return sum + + def _intervals(self, sym, err_on_Eq=False): + r"""Return a bool and a message (when bool is False), else a + list of unique tuples, (a, b, e, i), where a and b + are the lower and upper bounds in which the expression e of + argument i in self is defined and $a < b$ (when involving + numbers) or $a \le b$ when involving symbols. + + If there are any relationals not involving sym, or any + relational cannot be solved for sym, the bool will be False + a message be given as the second return value. The calling + routine should have removed such relationals before calling + this routine. + + The evaluated conditions will be returned as ranges. + Discontinuous ranges will be returned separately with + identical expressions. The first condition that evaluates to + True will be returned as the last tuple with a, b = -oo, oo. + """ + from sympy.solvers.inequalities import _solve_inequality + + assert isinstance(self, Piecewise) + + def nonsymfail(cond): + return False, filldedent(''' + A condition not involving + %s appeared: %s''' % (sym, cond)) + + def _solve_relational(r): + if sym not in r.free_symbols: + return nonsymfail(r) + try: + rv = _solve_inequality(r, sym) + except NotImplementedError: + return False, 'Unable to solve relational %s for %s.' % (r, sym) + if isinstance(rv, Relational): + free = rv.args[1].free_symbols + if rv.args[0] != sym or sym in free: + return False, 'Unable to solve relational %s for %s.' % (r, sym) + if rv.rel_op == '==': + # this equality has been affirmed to have the form + # Eq(sym, rhs) where rhs is sym-free; it represents + # a zero-width interval which will be ignored + # whether it is an isolated condition or contained + # within an And or an Or + rv = S.false + elif rv.rel_op == '!=': + try: + rv = Or(sym < rv.rhs, sym > rv.rhs) + except TypeError: + # e.g. 
x != I ==> all real x satisfy + rv = S.true + elif rv == (S.NegativeInfinity < sym) & (sym < S.Infinity): + rv = S.true + return True, rv + + args = list(self.args) + # make self canonical wrt Relationals + keys = self.atoms(Relational) + reps = {} + for r in keys: + ok, s = _solve_relational(r) + if ok != True: + return False, ok + reps[r] = s + # process args individually so if any evaluate, their position + # in the original Piecewise will be known + args = [i.xreplace(reps) for i in self.args] + + # precondition args + expr_cond = [] + default = idefault = None + for i, (expr, cond) in enumerate(args): + if cond is S.false: + continue + if cond is S.true: + default = expr + idefault = i + break + if isinstance(cond, Eq): + # unanticipated condition, but it is here in case a + # replacement caused an Eq to appear + if err_on_Eq: + return False, 'encountered Eq condition: %s' % cond + continue # zero width interval + + cond = to_cnf(cond) + if isinstance(cond, And): + cond = distribute_or_over_and(cond) + + if isinstance(cond, Or): + expr_cond.extend( + [(i, expr, o) for o in cond.args + if not isinstance(o, Eq)]) + elif cond is not S.false: + expr_cond.append((i, expr, cond)) + elif cond is S.true: + default = expr + idefault = i + break + + # determine intervals represented by conditions + int_expr = [] + for iarg, expr, cond in expr_cond: + if isinstance(cond, And): + lower = S.NegativeInfinity + upper = S.Infinity + exclude = [] + for cond2 in cond.args: + if not isinstance(cond2, Relational): + return False, 'expecting only Relationals' + if isinstance(cond2, Eq): + lower = upper # ignore + if err_on_Eq: + return False, 'encountered secondary Eq condition' + break + elif isinstance(cond2, Ne): + l, r = cond2.args + if l == sym: + exclude.append(r) + elif r == sym: + exclude.append(l) + else: + return nonsymfail(cond2) + continue + elif cond2.lts == sym: + upper = Min(cond2.gts, upper) + elif cond2.gts == sym: + lower = Max(cond2.lts, lower) + else: + return nonsymfail(cond2) # should never get here + if exclude: + exclude = list(ordered(exclude)) + newcond = [] + for i, e in enumerate(exclude): + if e < lower == True or e > upper == True: + continue + if not newcond: + newcond.append((None, lower)) # add a primer + newcond.append((newcond[-1][1], e)) + newcond.append((newcond[-1][1], upper)) + newcond.pop(0) # remove the primer + expr_cond.extend([(iarg, expr, And(i[0] < sym, sym < i[1])) for i in newcond]) + continue + elif isinstance(cond, Relational) and cond.rel_op != '!=': + lower, upper = cond.lts, cond.gts # part 1: initialize with givens + if cond.lts == sym: # part 1a: expand the side ... + lower = S.NegativeInfinity # e.g. x <= 0 ---> -oo <= 0 + elif cond.gts == sym: # part 1a: ... that can be expanded + upper = S.Infinity # e.g. 
x >= 0 ---> oo >= 0 + else: + return nonsymfail(cond) + else: + return False, 'unrecognized condition: %s' % cond + + upper = Max(lower, upper) + if err_on_Eq and lower == upper: + return False, 'encountered Eq condition' + if (lower >= upper) is not S.true: + int_expr.append((lower, upper, expr, iarg)) + + if default is not None: + int_expr.append( + (S.NegativeInfinity, S.Infinity, default, idefault)) + + return True, list(uniq(int_expr)) + + def _eval_nseries(self, x, n, logx, cdir=0): + args = [(ec.expr._eval_nseries(x, n, logx), ec.cond) for ec in self.args] + return self.func(*args) + + def _eval_power(self, s): + return self.func(*[(e**s, c) for e, c in self.args]) + + def _eval_subs(self, old, new): + # this is strictly not necessary, but we can keep track + # of whether True or False conditions arise and be + # somewhat more efficient by avoiding other substitutions + # and avoiding invalid conditions that appear after a + # True condition + args = list(self.args) + args_exist = False + for i, (e, c) in enumerate(args): + c = c._subs(old, new) + if c != False: + args_exist = True + e = e._subs(old, new) + args[i] = (e, c) + if c == True: + break + if not args_exist: + args = ((Undefined, True),) + return self.func(*args) + + def _eval_transpose(self): + return self.func(*[(e.transpose(), c) for e, c in self.args]) + + def _eval_template_is_attr(self, is_attr): + b = None + for expr, _ in self.args: + a = getattr(expr, is_attr) + if a is None: + return + if b is None: + b = a + elif b is not a: + return + return b + + _eval_is_finite = lambda self: self._eval_template_is_attr( + 'is_finite') + _eval_is_complex = lambda self: self._eval_template_is_attr('is_complex') + _eval_is_even = lambda self: self._eval_template_is_attr('is_even') + _eval_is_imaginary = lambda self: self._eval_template_is_attr( + 'is_imaginary') + _eval_is_integer = lambda self: self._eval_template_is_attr('is_integer') + _eval_is_irrational = lambda self: self._eval_template_is_attr( + 'is_irrational') + _eval_is_negative = lambda self: self._eval_template_is_attr('is_negative') + _eval_is_nonnegative = lambda self: self._eval_template_is_attr( + 'is_nonnegative') + _eval_is_nonpositive = lambda self: self._eval_template_is_attr( + 'is_nonpositive') + _eval_is_nonzero = lambda self: self._eval_template_is_attr( + 'is_nonzero') + _eval_is_odd = lambda self: self._eval_template_is_attr('is_odd') + _eval_is_polar = lambda self: self._eval_template_is_attr('is_polar') + _eval_is_positive = lambda self: self._eval_template_is_attr('is_positive') + _eval_is_extended_real = lambda self: self._eval_template_is_attr( + 'is_extended_real') + _eval_is_extended_positive = lambda self: self._eval_template_is_attr( + 'is_extended_positive') + _eval_is_extended_negative = lambda self: self._eval_template_is_attr( + 'is_extended_negative') + _eval_is_extended_nonzero = lambda self: self._eval_template_is_attr( + 'is_extended_nonzero') + _eval_is_extended_nonpositive = lambda self: self._eval_template_is_attr( + 'is_extended_nonpositive') + _eval_is_extended_nonnegative = lambda self: self._eval_template_is_attr( + 'is_extended_nonnegative') + _eval_is_real = lambda self: self._eval_template_is_attr('is_real') + _eval_is_zero = lambda self: self._eval_template_is_attr( + 'is_zero') + + @classmethod + def __eval_cond(cls, cond): + """Return the truth value of the condition.""" + if cond == True: + return True + if isinstance(cond, Eq): + try: + diff = cond.lhs - cond.rhs + if diff.is_commutative: + return diff.is_zero + except 
TypeError: + pass + + def as_expr_set_pairs(self, domain=None): + """Return tuples for each argument of self that give + the expression and the interval in which it is valid + which is contained within the given domain. + If a condition cannot be converted to a set, an error + will be raised. The variable of the conditions is + assumed to be real; sets of real values are returned. + + Examples + ======== + + >>> from sympy import Piecewise, Interval + >>> from sympy.abc import x + >>> p = Piecewise( + ... (1, x < 2), + ... (2,(x > 0) & (x < 4)), + ... (3, True)) + >>> p.as_expr_set_pairs() + [(1, Interval.open(-oo, 2)), + (2, Interval.Ropen(2, 4)), + (3, Interval(4, oo))] + >>> p.as_expr_set_pairs(Interval(0, 3)) + [(1, Interval.Ropen(0, 2)), + (2, Interval(2, 3))] + """ + if domain is None: + domain = S.Reals + exp_sets = [] + U = domain + complex = not domain.is_subset(S.Reals) + cond_free = set() + for expr, cond in self.args: + cond_free |= cond.free_symbols + if len(cond_free) > 1: + raise NotImplementedError(filldedent(''' + multivariate conditions are not handled.''')) + if complex: + for i in cond.atoms(Relational): + if not isinstance(i, (Eq, Ne)): + raise ValueError(filldedent(''' + Inequalities in the complex domain are + not supported. Try the real domain by + setting domain=S.Reals''')) + cond_int = U.intersect(cond.as_set()) + U = U - cond_int + if cond_int != S.EmptySet: + exp_sets.append((expr, cond_int)) + return exp_sets + + def _eval_rewrite_as_ITE(self, *args, **kwargs): + byfree = {} + args = list(args) + default = any(c == True for b, c in args) + for i, (b, c) in enumerate(args): + if not isinstance(b, Boolean) and b != True: + raise TypeError(filldedent(''' + Expecting Boolean or bool but got `%s` + ''' % func_name(b))) + if c == True: + break + # loop over independent conditions for this b + for c in c.args if isinstance(c, Or) else [c]: + free = c.free_symbols + x = free.pop() + try: + byfree[x] = byfree.setdefault( + x, S.EmptySet).union(c.as_set()) + except NotImplementedError: + if not default: + raise NotImplementedError(filldedent(''' + A method to determine whether a multivariate + conditional is consistent with a complete coverage + of all variables has not been implemented so the + rewrite is being stopped after encountering `%s`. + This error would not occur if a default expression + like `(foo, True)` were given. + ''' % c)) + if byfree[x] in (S.UniversalSet, S.Reals): + # collapse the ith condition to True and break + args[i] = list(args[i]) + c = args[i][1] = True + break + if c == True: + break + if c != True: + raise ValueError(filldedent(''' + Conditions must cover all reals or a final default + condition `(foo, True)` must be given. 
+ ''')) + last, _ = args[i] # ignore all past ith arg + for a, c in reversed(args[:i]): + last = ITE(c, a, last) + return _canonical(last) + + def _eval_rewrite_as_KroneckerDelta(self, *args, **kwargs): + from sympy.functions.special.tensor_functions import KroneckerDelta + + rules = { + And: [False, False], + Or: [True, True], + Not: [True, False], + Eq: [None, None], + Ne: [None, None] + } + + class UnrecognizedCondition(Exception): + pass + + def rewrite(cond): + if isinstance(cond, Eq): + return KroneckerDelta(*cond.args) + if isinstance(cond, Ne): + return 1 - KroneckerDelta(*cond.args) + + cls, args = type(cond), cond.args + if cls not in rules: + raise UnrecognizedCondition(cls) + + b1, b2 = rules[cls] + k = Mul(*[1 - rewrite(c) for c in args]) if b1 else Mul(*[rewrite(c) for c in args]) + + if b2: + return 1 - k + return k + + conditions = [] + true_value = None + for value, cond in args: + if type(cond) in rules: + conditions.append((value, cond)) + elif cond is S.true: + if true_value is None: + true_value = value + else: + return + + if true_value is not None: + result = true_value + + for value, cond in conditions[::-1]: + try: + k = rewrite(cond) + result = k * value + (1 - k) * result + except UnrecognizedCondition: + return + + return result + + +def piecewise_fold(expr, evaluate=True): + """ + Takes an expression containing a piecewise function and returns the + expression in piecewise form. In addition, any ITE conditions are + rewritten in negation normal form and simplified. + + The final Piecewise is evaluated (default) but if the raw form + is desired, send ``evaluate=False``; if trivial evaluation is + desired, send ``evaluate=None`` and duplicate conditions and + processing of True and False will be handled. + + Examples + ======== + + >>> from sympy import Piecewise, piecewise_fold, S + >>> from sympy.abc import x + >>> p = Piecewise((x, x < 1), (1, S(1) <= x)) + >>> piecewise_fold(x*p) + Piecewise((x**2, x < 1), (x, True)) + + See Also + ======== + + Piecewise + piecewise_exclusive + """ + if not isinstance(expr, Basic) or not expr.has(Piecewise): + return expr + + new_args = [] + if isinstance(expr, (ExprCondPair, Piecewise)): + for e, c in expr.args: + if not isinstance(e, Piecewise): + e = piecewise_fold(e) + # we don't keep Piecewise in condition because + # it has to be checked to see that it's complete + # and we convert it to ITE at that time + assert not c.has(Piecewise) # pragma: no cover + if isinstance(c, ITE): + c = c.to_nnf() + c = simplify_logic(c, form='cnf') + if isinstance(e, Piecewise): + new_args.extend([(piecewise_fold(ei), And(ci, c)) + for ei, ci in e.args]) + else: + new_args.append((e, c)) + else: + # Given + # P1 = Piecewise((e11, c1), (e12, c2), A) + # P2 = Piecewise((e21, c1), (e22, c2), B) + # ... + # the folding of f(P1, P2) is trivially + # Piecewise( + # (f(e11, e21), c1), + # (f(e12, e22), c2), + # (f(Piecewise(A), Piecewise(B)), True)) + # Certain objects end up rewriting themselves as thus, so + # we do that grouping before the more generic folding. + # The following applies this idea when f = Add or f = Mul + # (and the expression is commutative). 
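+        # Editor's illustrative note (a sketch, not part of the upstream
+        # sympy source): for two Piecewise args sharing the same conditions,
+        #     p1 = Piecewise((1, x < 0), (2, True))
+        #     p2 = Piecewise((10, x < 0), (20, True))
+        # the grouping below folds p1 + p2 directly into
+        #     Piecewise((11, x < 0), (22, True))
+        # rather than enumerating the full product of conditions.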
+ if expr.is_Add or expr.is_Mul and expr.is_commutative: + p, args = sift(expr.args, lambda x: x.is_Piecewise, binary=True) + pc = sift(p, lambda x: tuple([c for e,c in x.args])) + for c in list(ordered(pc)): + if len(pc[c]) > 1: + pargs = [list(i.args) for i in pc[c]] + # the first one is the same; there may be more + com = common_prefix(*[ + [i.cond for i in j] for j in pargs]) + n = len(com) + collected = [] + for i in range(n): + collected.append(( + expr.func(*[ai[i].expr for ai in pargs]), + com[i])) + remains = [] + for a in pargs: + if n == len(a): # no more args + continue + if a[n].cond == True: # no longer Piecewise + remains.append(a[n].expr) + else: # restore the remaining Piecewise + remains.append( + Piecewise(*a[n:], evaluate=False)) + if remains: + collected.append((expr.func(*remains), True)) + args.append(Piecewise(*collected, evaluate=False)) + continue + args.extend(pc[c]) + else: + args = expr.args + # fold + folded = list(map(piecewise_fold, args)) + for ec in product(*[ + (i.args if isinstance(i, Piecewise) else + [(i, true)]) for i in folded]): + e, c = zip(*ec) + new_args.append((expr.func(*e), And(*c))) + + if evaluate is None: + # don't return duplicate conditions, otherwise don't evaluate + new_args = list(reversed([(e, c) for c, e in { + c: e for e, c in reversed(new_args)}.items()])) + rv = Piecewise(*new_args, evaluate=evaluate) + if evaluate is None and len(rv.args) == 1 and rv.args[0].cond == True: + return rv.args[0].expr + if any(s.expr.has(Piecewise) for p in rv.atoms(Piecewise) for s in p.args): + return piecewise_fold(rv) + return rv + + +def _clip(A, B, k): + """Return interval B as intervals that are covered by A (keyed + to k) and all other intervals of B not covered by A keyed to -1. + + The reference point of each interval is the rhs; if the lhs is + greater than the rhs then an interval of zero width interval will + result, e.g. (4, 1) is treated like (1, 1). + + Examples + ======== + + >>> from sympy.functions.elementary.piecewise import _clip + >>> from sympy import Tuple + >>> A = Tuple(1, 3) + >>> B = Tuple(2, 4) + >>> _clip(A, B, 0) + [(2, 3, 0), (3, 4, -1)] + + Interpretation: interval portion (2, 3) of interval (2, 4) is + covered by interval (1, 3) and is keyed to 0 as requested; + interval (3, 4) was not covered by (1, 3) and is keyed to -1. 
+ """ + a, b = B + c, d = A + c, d = Min(Max(c, a), b), Min(Max(d, a), b) + a = Min(a, b) + p = [] + if a != c: + p.append((a, c, -1)) + else: + pass + if c != d: + p.append((c, d, k)) + else: + pass + if b != d: + if d == c and p and p[-1][-1] == -1: + p[-1] = p[-1][0], b, -1 + else: + p.append((d, b, -1)) + else: + pass + + return p + + +def piecewise_simplify_arguments(expr, **kwargs): + from sympy.simplify.simplify import simplify + + # simplify conditions + f1 = expr.args[0].cond.free_symbols + args = None + if len(f1) == 1 and not expr.atoms(Eq): + x = f1.pop() + # this won't return intervals involving Eq + # and it won't handle symbols treated as + # booleans + ok, abe_ = expr._intervals(x, err_on_Eq=True) + def include(c, x, a): + "return True if c.subs(x, a) is True, else False" + try: + return c.subs(x, a) == True + except TypeError: + return False + if ok: + args = [] + covered = S.EmptySet + from sympy.sets.sets import Interval + for a, b, e, i in abe_: + c = expr.args[i].cond + incl_a = include(c, x, a) + incl_b = include(c, x, b) + iv = Interval(a, b, not incl_a, not incl_b) + cset = iv - covered + if not cset: + continue + try: + a = cset.inf + except NotImplementedError: + pass # continue with the given `a` + else: + incl_a = include(c, x, a) + if incl_a and incl_b: + if a.is_infinite and b.is_infinite: + c = S.true + elif b.is_infinite: + c = (x > a) if a in covered else (x >= a) + elif a.is_infinite: + c = (x <= b) + elif a in covered: + c = And(a < x, x <= b) + else: + c = And(a <= x, x <= b) + elif incl_a: + if a.is_infinite: + c = (x < b) + elif a in covered: + c = And(a < x, x < b) + else: + c = And(a <= x, x < b) + elif incl_b: + if b.is_infinite: + c = (x > a) + else: + c = And(a < x, x <= b) + else: + if a in covered: + c = (x < b) + else: + c = And(a < x, x < b) + covered |= iv + if a is S.NegativeInfinity and incl_a: + covered |= {S.NegativeInfinity} + if b is S.Infinity and incl_b: + covered |= {S.Infinity} + args.append((e, c)) + if not S.Reals.is_subset(covered): + args.append((Undefined, True)) + if args is None: + args = list(expr.args) + for i in range(len(args)): + e, c = args[i] + if isinstance(c, Basic): + c = simplify(c, **kwargs) + args[i] = (e, c) + + # simplify expressions + doit = kwargs.pop('doit', None) + for i in range(len(args)): + e, c = args[i] + if isinstance(e, Basic): + # Skip doit to avoid growth at every call for some integrals + # and sums, see sympy/sympy#17165 + newe = simplify(e, doit=False, **kwargs) + if newe != e: + e = newe + args[i] = (e, c) + + # restore kwargs flag + if doit is not None: + kwargs['doit'] = doit + + return Piecewise(*args) + + +def _piecewise_collapse_arguments(_args): + newargs = [] # the unevaluated conditions + current_cond = set() # the conditions up to a given e, c pair + for expr, cond in _args: + cond = cond.replace( + lambda _: _.is_Relational, _canonical_coeff) + # Check here if expr is a Piecewise and collapse if one of + # the conds in expr matches cond. This allows the collapsing + # of Piecewise((Piecewise((x,x<0)),x<0)) to Piecewise((x,x<0)). + # This is important when using piecewise_fold to simplify + # multiple Piecewise instances having the same conds. + # Eventually, this code should be able to collapse Piecewise's + # having different intervals, but this will probably require + # using the new assumptions. 
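+        # Editor's illustrative note (a sketch, not from the upstream
+        # source): a repeated condition is pruned by the checks in this
+        # loop, e.g. with
+        #     _args = ((1, x < 1), (2, x < 1), (3, True))
+        # the helper returns the pairs of Piecewise((1, x < 1), (3, True)),
+        # since the second x < 1 branch can never be reached.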
+ if isinstance(expr, Piecewise): + unmatching = [] + for i, (e, c) in enumerate(expr.args): + if c in current_cond: + # this would already have triggered + continue + if c == cond: + if c != True: + # nothing past this condition will ever + # trigger and only those args before this + # that didn't match a previous condition + # could possibly trigger + if unmatching: + expr = Piecewise(*( + unmatching + [(e, c)])) + else: + expr = e + break + else: + unmatching.append((e, c)) + + # check for condition repeats + got = False + # -- if an And contains a condition that was + # already encountered, then the And will be + # False: if the previous condition was False + # then the And will be False and if the previous + # condition is True then then we wouldn't get to + # this point. In either case, we can skip this condition. + for i in ([cond] + + (list(cond.args) if isinstance(cond, And) else + [])): + if i in current_cond: + got = True + break + if got: + continue + + # -- if not(c) is already in current_cond then c is + # a redundant condition in an And. This does not + # apply to Or, however: (e1, c), (e2, Or(~c, d)) + # is not (e1, c), (e2, d) because if c and d are + # both False this would give no results when the + # true answer should be (e2, True) + if isinstance(cond, And): + nonredundant = [] + for c in cond.args: + if isinstance(c, Relational): + if c.negated.canonical in current_cond: + continue + # if a strict inequality appears after + # a non-strict one, then the condition is + # redundant + if isinstance(c, (Lt, Gt)) and ( + c.weak in current_cond): + cond = False + break + nonredundant.append(c) + else: + cond = cond.func(*nonredundant) + elif isinstance(cond, Relational): + if cond.negated.canonical in current_cond: + cond = S.true + + current_cond.add(cond) + + # collect successive e,c pairs when exprs or cond match + if newargs: + if newargs[-1].expr == expr: + orcond = Or(cond, newargs[-1].cond) + if isinstance(orcond, (And, Or)): + orcond = distribute_and_over_or(orcond) + newargs[-1] = ExprCondPair(expr, orcond) + continue + elif newargs[-1].cond == cond: + continue + newargs.append(ExprCondPair(expr, cond)) + return newargs + + +_blessed = lambda e: getattr(e.lhs, '_diff_wrt', False) and ( + getattr(e.rhs, '_diff_wrt', None) or + isinstance(e.rhs, (Rational, NumberSymbol))) + + +def piecewise_simplify(expr, **kwargs): + expr = piecewise_simplify_arguments(expr, **kwargs) + if not isinstance(expr, Piecewise): + return expr + args = list(expr.args) + + args = _piecewise_simplify_eq_and(args) + args = _piecewise_simplify_equal_to_next_segment(args) + return Piecewise(*args) + + +def _piecewise_simplify_equal_to_next_segment(args): + """ + See if expressions valid for an Equal expression happens to evaluate + to the same function as in the next piecewise segment, see: + https://github.com/sympy/sympy/issues/8458 + """ + prevexpr = None + for i, (expr, cond) in reversed(list(enumerate(args))): + if prevexpr is not None: + if isinstance(cond, And): + eqs, other = sift(cond.args, + lambda i: isinstance(i, Eq), binary=True) + elif isinstance(cond, Eq): + eqs, other = [cond], [] + else: + eqs = other = [] + _prevexpr = prevexpr + _expr = expr + if eqs and not other: + eqs = list(ordered(eqs)) + for e in eqs: + # allow 2 args to collapse into 1 for any e + # otherwise limit simplification to only simple-arg + # Eq instances + if len(args) == 2 or _blessed(e): + _prevexpr = _prevexpr.subs(*e.args) + _expr = _expr.subs(*e.args) + # Did it evaluate to the same? 
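+                # Editor's illustrative note (not from the upstream
+                # source): e.g. Piecewise((x, Eq(x, 1)), (x**2, True))
+                # simplifies to x**2 because both branches agree at x = 1.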
+ if _prevexpr == _expr: + # Set the expression for the Not equal section to the same + # as the next. These will be merged when creating the new + # Piecewise + args[i] = args[i].func(args[i + 1][0], cond) + else: + # Update the expression that we compare against + prevexpr = expr + else: + prevexpr = expr + return args + + +def _piecewise_simplify_eq_and(args): + """ + Try to simplify conditions and the expression for + equalities that are part of the condition, e.g. + Piecewise((n, And(Eq(n,0), Eq(n + m, 0))), (1, True)) + -> Piecewise((0, And(Eq(n, 0), Eq(m, 0))), (1, True)) + """ + for i, (expr, cond) in enumerate(args): + if isinstance(cond, And): + eqs, other = sift(cond.args, + lambda i: isinstance(i, Eq), binary=True) + elif isinstance(cond, Eq): + eqs, other = [cond], [] + else: + eqs = other = [] + if eqs: + eqs = list(ordered(eqs)) + for j, e in enumerate(eqs): + # these blessed lhs objects behave like Symbols + # and the rhs are simple replacements for the "symbols" + if _blessed(e): + expr = expr.subs(*e.args) + eqs[j + 1:] = [ei.subs(*e.args) for ei in eqs[j + 1:]] + other = [ei.subs(*e.args) for ei in other] + cond = And(*(eqs + other)) + args[i] = args[i].func(expr, cond) + return args + + +def piecewise_exclusive(expr, *, skip_nan=False, deep=True): + """ + Rewrite :class:`Piecewise` with mutually exclusive conditions. + + Explanation + =========== + + SymPy represents the conditions of a :class:`Piecewise` in an + "if-elif"-fashion, allowing more than one condition to be simultaneously + True. The interpretation is that the first condition that is True is the + case that holds. While this is a useful representation computationally it + is not how a piecewise formula is typically shown in a mathematical text. + The :func:`piecewise_exclusive` function can be used to rewrite any + :class:`Piecewise` with more typical mutually exclusive conditions. + + Note that further manipulation of the resulting :class:`Piecewise`, e.g. + simplifying it, will most likely make it non-exclusive. Hence, this is + primarily a function to be used in conjunction with printing the Piecewise + or if one would like to reorder the expression-condition pairs. + + If it is not possible to determine that all possibilities are covered by + the different cases of the :class:`Piecewise` then a final + :class:`~sympy.core.numbers.NaN` case will be included explicitly. This + can be prevented by passing ``skip_nan=True``. + + Examples + ======== + + >>> from sympy import piecewise_exclusive, Symbol, Piecewise, S + >>> x = Symbol('x', real=True) + >>> p = Piecewise((0, x < 0), (S.Half, x <= 0), (1, True)) + >>> piecewise_exclusive(p) + Piecewise((0, x < 0), (1/2, Eq(x, 0)), (1, x > 0)) + >>> piecewise_exclusive(Piecewise((2, x > 1))) + Piecewise((2, x > 1), (nan, x <= 1)) + >>> piecewise_exclusive(Piecewise((2, x > 1)), skip_nan=True) + Piecewise((2, x > 1)) + + Parameters + ========== + + expr: a SymPy expression. + Any :class:`Piecewise` in the expression will be rewritten. + skip_nan: ``bool`` (default ``False``) + If ``skip_nan`` is set to ``True`` then a final + :class:`~sympy.core.numbers.NaN` case will not be included. + deep: ``bool`` (default ``True``) + If ``deep`` is ``True`` then :func:`piecewise_exclusive` will rewrite + any :class:`Piecewise` subexpressions in ``expr`` rather than just + rewriting ``expr`` itself. + + Returns + ======= + + An expression equivalent to ``expr`` but where all :class:`Piecewise` have + been rewritten with mutually exclusive conditions. 
+ + See Also + ======== + + Piecewise + piecewise_fold + """ + + def make_exclusive(*pwargs): + + cumcond = false + newargs = [] + + # Handle the first n-1 cases + for expr_i, cond_i in pwargs[:-1]: + cancond = And(cond_i, Not(cumcond)).simplify() + cumcond = Or(cond_i, cumcond).simplify() + newargs.append((expr_i, cancond)) + + # For the nth case defer simplification of cumcond + expr_n, cond_n = pwargs[-1] + cancond_n = And(cond_n, Not(cumcond)).simplify() + newargs.append((expr_n, cancond_n)) + + if not skip_nan: + cumcond = Or(cond_n, cumcond).simplify() + if cumcond is not true: + newargs.append((Undefined, Not(cumcond).simplify())) + + return Piecewise(*newargs, evaluate=False) + + if deep: + return expr.replace(Piecewise, make_exclusive) + elif isinstance(expr, Piecewise): + return make_exclusive(*expr.args) + else: + return expr diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/elementary/tests/test_complexes.py b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/elementary/tests/test_complexes.py new file mode 100644 index 0000000000000000000000000000000000000000..699c0fef966c99147b713aaa80710b7b8cf21c73 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/elementary/tests/test_complexes.py @@ -0,0 +1,1030 @@ +from sympy.core.function import (Derivative, Function, Lambda, expand, PoleError) +from sympy.core.numbers import (E, I, Rational, comp, nan, oo, pi, zoo) +from sympy.core.relational import Eq +from sympy.core.singleton import S +from sympy.core.symbol import (Symbol, symbols) +from sympy.functions.elementary.complexes import (Abs, adjoint, arg, conjugate, im, re, sign, transpose) +from sympy.functions.elementary.exponential import (exp, exp_polar, log) +from sympy.functions.elementary.miscellaneous import sqrt +from sympy.functions.elementary.piecewise import Piecewise +from sympy.functions.elementary.trigonometric import (acos, atan, atan2, cos, sin) +from sympy.functions.elementary.hyperbolic import sinh +from sympy.functions.special.delta_functions import (DiracDelta, Heaviside) +from sympy.integrals.integrals import Integral +from sympy.matrices.dense import Matrix +from sympy.matrices.expressions.funcmatrix import FunctionMatrix +from sympy.matrices.expressions.matexpr import MatrixSymbol +from sympy.matrices.immutable import (ImmutableMatrix, ImmutableSparseMatrix) +from sympy.matrices import SparseMatrix +from sympy.sets.sets import Interval +from sympy.core.expr import unchanged +from sympy.core.function import ArgumentIndexError +from sympy.series.order import Order +from sympy.testing.pytest import XFAIL, raises, _both_exp_pow + + +def N_equals(a, b): + """Check whether two complex numbers are numerically close""" + return comp(a.n(), b.n(), 1.e-6) + + +def test_re(): + x, y = symbols('x,y') + a, b = symbols('a,b', real=True) + + r = Symbol('r', real=True) + i = Symbol('i', imaginary=True) + + assert re(nan) is nan + + assert re(oo) is oo + assert re(-oo) is -oo + + assert re(0) == 0 + + assert re(1) == 1 + assert re(-1) == -1 + + assert re(E) == E + assert re(-E) == -E + + assert unchanged(re, x) + assert re(x*I) == -im(x) + assert re(r*I) == 0 + assert re(r) == r + assert re(i*I) == I * i + assert re(i) == 0 + + assert re(x + y) == re(x) + re(y) + assert re(x + r) == re(x) + r + + assert re(re(x)) == re(x) + + assert re(2 + I) == 2 + assert re(x + I) == re(x) + + assert re(x + y*I) == re(x) - im(y) + assert re(x + r*I) == re(x) + + assert re(log(2*I)) == log(2) + + assert re((2 + 
I)**2).expand(complex=True) == 3 + + assert re(conjugate(x)) == re(x) + assert conjugate(re(x)) == re(x) + + assert re(x).as_real_imag() == (re(x), 0) + + assert re(i*r*x).diff(r) == re(i*x) + assert re(i*r*x).diff(i) == I*r*im(x) + + assert re( + sqrt(a + b*I)) == (a**2 + b**2)**Rational(1, 4)*cos(atan2(b, a)/2) + assert re(a * (2 + b*I)) == 2*a + + assert re((1 + sqrt(a + b*I))/2) == \ + (a**2 + b**2)**Rational(1, 4)*cos(atan2(b, a)/2)/2 + S.Half + + assert re(x).rewrite(im) == x - S.ImaginaryUnit*im(x) + assert (x + re(y)).rewrite(re, im) == x + y - S.ImaginaryUnit*im(y) + + a = Symbol('a', algebraic=True) + t = Symbol('t', transcendental=True) + x = Symbol('x') + assert re(a).is_algebraic + assert re(x).is_algebraic is None + assert re(t).is_algebraic is False + + assert re(S.ComplexInfinity) is S.NaN + + n, m, l = symbols('n m l') + A = MatrixSymbol('A',n,m) + assert re(A) == (S.Half) * (A + conjugate(A)) + + A = Matrix([[1 + 4*I,2],[0, -3*I]]) + assert re(A) == Matrix([[1, 2],[0, 0]]) + + A = ImmutableMatrix([[1 + 3*I, 3-2*I],[0, 2*I]]) + assert re(A) == ImmutableMatrix([[1, 3],[0, 0]]) + + X = SparseMatrix([[2*j + i*I for i in range(5)] for j in range(5)]) + assert re(X) - Matrix([[0, 0, 0, 0, 0], + [2, 2, 2, 2, 2], + [4, 4, 4, 4, 4], + [6, 6, 6, 6, 6], + [8, 8, 8, 8, 8]]) == Matrix.zeros(5) + + assert im(X) - Matrix([[0, 1, 2, 3, 4], + [0, 1, 2, 3, 4], + [0, 1, 2, 3, 4], + [0, 1, 2, 3, 4], + [0, 1, 2, 3, 4]]) == Matrix.zeros(5) + + X = FunctionMatrix(3, 3, Lambda((n, m), n + m*I)) + assert re(X) == Matrix([[0, 0, 0], [1, 1, 1], [2, 2, 2]]) + + +def test_im(): + x, y = symbols('x,y') + a, b = symbols('a,b', real=True) + + r = Symbol('r', real=True) + i = Symbol('i', imaginary=True) + + assert im(nan) is nan + + assert im(oo*I) is oo + assert im(-oo*I) is -oo + + assert im(0) == 0 + + assert im(1) == 0 + assert im(-1) == 0 + + assert im(E*I) == E + assert im(-E*I) == -E + + assert unchanged(im, x) + assert im(x*I) == re(x) + assert im(r*I) == r + assert im(r) == 0 + assert im(i*I) == 0 + assert im(i) == -I * i + + assert im(x + y) == im(x) + im(y) + assert im(x + r) == im(x) + assert im(x + r*I) == im(x) + r + + assert im(im(x)*I) == im(x) + + assert im(2 + I) == 1 + assert im(x + I) == im(x) + 1 + + assert im(x + y*I) == im(x) + re(y) + assert im(x + r*I) == im(x) + r + + assert im(log(2*I)) == pi/2 + + assert im((2 + I)**2).expand(complex=True) == 4 + + assert im(conjugate(x)) == -im(x) + assert conjugate(im(x)) == im(x) + + assert im(x).as_real_imag() == (im(x), 0) + + assert im(i*r*x).diff(r) == im(i*x) + assert im(i*r*x).diff(i) == -I * re(r*x) + + assert im( + sqrt(a + b*I)) == (a**2 + b**2)**Rational(1, 4)*sin(atan2(b, a)/2) + assert im(a * (2 + b*I)) == a*b + + assert im((1 + sqrt(a + b*I))/2) == \ + (a**2 + b**2)**Rational(1, 4)*sin(atan2(b, a)/2)/2 + + assert im(x).rewrite(re) == -S.ImaginaryUnit * (x - re(x)) + assert (x + im(y)).rewrite(im, re) == x - S.ImaginaryUnit * (y - re(y)) + + a = Symbol('a', algebraic=True) + t = Symbol('t', transcendental=True) + x = Symbol('x') + assert re(a).is_algebraic + assert re(x).is_algebraic is None + assert re(t).is_algebraic is False + + assert im(S.ComplexInfinity) is S.NaN + + n, m, l = symbols('n m l') + A = MatrixSymbol('A',n,m) + + assert im(A) == (S.One/(2*I)) * (A - conjugate(A)) + + A = Matrix([[1 + 4*I, 2],[0, -3*I]]) + assert im(A) == Matrix([[4, 0],[0, -3]]) + + A = ImmutableMatrix([[1 + 3*I, 3-2*I],[0, 2*I]]) + assert im(A) == ImmutableMatrix([[3, -2],[0, 2]]) + + X = ImmutableSparseMatrix( + [[i*I + i for i in range(5)] 
for i in range(5)]) + Y = SparseMatrix([list(range(5)) for i in range(5)]) + assert im(X).as_immutable() == Y + + X = FunctionMatrix(3, 3, Lambda((n, m), n + m*I)) + assert im(X) == Matrix([[0, 1, 2], [0, 1, 2], [0, 1, 2]]) + +def test_sign(): + assert sign(1.2) == 1 + assert sign(-1.2) == -1 + assert sign(3*I) == I + assert sign(-3*I) == -I + assert sign(0) == 0 + assert sign(0, evaluate=False).doit() == 0 + assert sign(oo, evaluate=False).doit() == 1 + assert sign(nan) is nan + assert sign(2 + 2*I).doit() == sqrt(2)*(2 + 2*I)/4 + assert sign(2 + 3*I).simplify() == sign(2 + 3*I) + assert sign(2 + 2*I).simplify() == sign(1 + I) + assert sign(im(sqrt(1 - sqrt(3)))) == 1 + assert sign(sqrt(1 - sqrt(3))) == I + + x = Symbol('x') + assert sign(x).is_finite is True + assert sign(x).is_complex is True + assert sign(x).is_imaginary is None + assert sign(x).is_integer is None + assert sign(x).is_real is None + assert sign(x).is_zero is None + assert sign(x).doit() == sign(x) + assert sign(1.2*x) == sign(x) + assert sign(2*x) == sign(x) + assert sign(I*x) == I*sign(x) + assert sign(-2*I*x) == -I*sign(x) + assert sign(conjugate(x)) == conjugate(sign(x)) + + p = Symbol('p', positive=True) + n = Symbol('n', negative=True) + m = Symbol('m', negative=True) + assert sign(2*p*x) == sign(x) + assert sign(n*x) == -sign(x) + assert sign(n*m*x) == sign(x) + + x = Symbol('x', imaginary=True) + assert sign(x).is_imaginary is True + assert sign(x).is_integer is False + assert sign(x).is_real is False + assert sign(x).is_zero is False + assert sign(x).diff(x) == 2*DiracDelta(-I*x) + assert sign(x).doit() == x / Abs(x) + assert conjugate(sign(x)) == -sign(x) + + x = Symbol('x', real=True) + assert sign(x).is_imaginary is False + assert sign(x).is_integer is True + assert sign(x).is_real is True + assert sign(x).is_zero is None + assert sign(x).diff(x) == 2*DiracDelta(x) + assert sign(x).doit() == sign(x) + assert conjugate(sign(x)) == sign(x) + + x = Symbol('x', nonzero=True) + assert sign(x).is_imaginary is False + assert sign(x).is_integer is True + assert sign(x).is_real is True + assert sign(x).is_zero is False + assert sign(x).doit() == x / Abs(x) + assert sign(Abs(x)) == 1 + assert Abs(sign(x)) == 1 + + x = Symbol('x', positive=True) + assert sign(x).is_imaginary is False + assert sign(x).is_integer is True + assert sign(x).is_real is True + assert sign(x).is_zero is False + assert sign(x).doit() == x / Abs(x) + assert sign(Abs(x)) == 1 + assert Abs(sign(x)) == 1 + + x = 0 + assert sign(x).is_imaginary is False + assert sign(x).is_integer is True + assert sign(x).is_real is True + assert sign(x).is_zero is True + assert sign(x).doit() == 0 + assert sign(Abs(x)) == 0 + assert Abs(sign(x)) == 0 + + nz = Symbol('nz', nonzero=True, integer=True) + assert sign(nz).is_imaginary is False + assert sign(nz).is_integer is True + assert sign(nz).is_real is True + assert sign(nz).is_zero is False + assert sign(nz)**2 == 1 + assert (sign(nz)**3).args == (sign(nz), 3) + + assert sign(Symbol('x', nonnegative=True)).is_nonnegative + assert sign(Symbol('x', nonnegative=True)).is_nonpositive is None + assert sign(Symbol('x', nonpositive=True)).is_nonnegative is None + assert sign(Symbol('x', nonpositive=True)).is_nonpositive + assert sign(Symbol('x', real=True)).is_nonnegative is None + assert sign(Symbol('x', real=True)).is_nonpositive is None + assert sign(Symbol('x', real=True, zero=False)).is_nonpositive is None + + x, y = Symbol('x', real=True), Symbol('y') + f = Function('f') + assert sign(x).rewrite(Piecewise) == \ + 
Piecewise((1, x > 0), (-1, x < 0), (0, True)) + assert sign(y).rewrite(Piecewise) == sign(y) + assert sign(x).rewrite(Heaviside) == 2*Heaviside(x, H0=S(1)/2) - 1 + assert sign(y).rewrite(Heaviside) == sign(y) + assert sign(y).rewrite(Abs) == Piecewise((0, Eq(y, 0)), (y/Abs(y), True)) + assert sign(f(y)).rewrite(Abs) == Piecewise((0, Eq(f(y), 0)), (f(y)/Abs(f(y)), True)) + + # evaluate what can be evaluated + assert sign(exp_polar(I*pi)*pi) is S.NegativeOne + + eq = -sqrt(10 + 6*sqrt(3)) + sqrt(1 + sqrt(3)) + sqrt(3 + 3*sqrt(3)) + # if there is a fast way to know when and when you cannot prove an + # expression like this is zero then the equality to zero is ok + assert sign(eq).func is sign or sign(eq) == 0 + # but sometimes it's hard to do this so it's better not to load + # abs down with tests that will be very slow + q = 1 + sqrt(2) - 2*sqrt(3) + 1331*sqrt(6) + p = expand(q**3)**Rational(1, 3) + d = p - q + assert sign(d).func is sign or sign(d) == 0 + + +def test_as_real_imag(): + n = pi**1000 + # the special code for working out the real + # and complex parts of a power with Integer exponent + # should not run if there is no imaginary part, hence + # this should not hang + assert n.as_real_imag() == (n, 0) + + # issue 6261 + x = Symbol('x') + assert sqrt(x).as_real_imag() == \ + ((re(x)**2 + im(x)**2)**Rational(1, 4)*cos(atan2(im(x), re(x))/2), + (re(x)**2 + im(x)**2)**Rational(1, 4)*sin(atan2(im(x), re(x))/2)) + + # issue 3853 + a, b = symbols('a,b', real=True) + assert ((1 + sqrt(a + b*I))/2).as_real_imag() == \ + ( + (a**2 + b**2)**Rational( + 1, 4)*cos(atan2(b, a)/2)/2 + S.Half, + (a**2 + b**2)**Rational(1, 4)*sin(atan2(b, a)/2)/2) + + assert sqrt(a**2).as_real_imag() == (sqrt(a**2), 0) + i = symbols('i', imaginary=True) + assert sqrt(i**2).as_real_imag() == (0, abs(i)) + + assert ((1 + I)/(1 - I)).as_real_imag() == (0, 1) + assert ((1 + I)**3/(1 - I)).as_real_imag() == (-2, 0) + + +@XFAIL +def test_sign_issue_3068(): + n = pi**1000 + i = int(n) + x = Symbol('x') + assert (n - i).round() == 1 # doesn't hang + assert sign(n - i) == 1 + # perhaps it's not possible to get the sign right when + # only 1 digit is being requested for this situation; + # 2 digits works + assert (n - x).n(1, subs={x: i}) > 0 + assert (n - x).n(2, subs={x: i}) > 0 + + +def test_Abs(): + raises(TypeError, lambda: Abs(Interval(2, 3))) # issue 8717 + + x, y = symbols('x,y') + assert sign(sign(x)) == sign(x) + assert sign(x*y).func is sign + assert Abs(0) == 0 + assert Abs(1) == 1 + assert Abs(-1) == 1 + assert Abs(I) == 1 + assert Abs(-I) == 1 + assert Abs(nan) is nan + assert Abs(zoo) is oo + assert Abs(I * pi) == pi + assert Abs(-I * pi) == pi + assert Abs(I * x) == Abs(x) + assert Abs(-I * x) == Abs(x) + assert Abs(-2*x) == 2*Abs(x) + assert Abs(-2.0*x) == 2.0*Abs(x) + assert Abs(2*pi*x*y) == 2*pi*Abs(x*y) + assert Abs(conjugate(x)) == Abs(x) + assert conjugate(Abs(x)) == Abs(x) + assert Abs(x).expand(complex=True) == sqrt(re(x)**2 + im(x)**2) + + a = Symbol('a', positive=True) + assert Abs(2*pi*x*a) == 2*pi*a*Abs(x) + assert Abs(2*pi*I*x*a) == 2*pi*a*Abs(x) + + x = Symbol('x', real=True) + n = Symbol('n', integer=True) + assert Abs((-1)**n) == 1 + assert x**(2*n) == Abs(x)**(2*n) + assert Abs(x).diff(x) == sign(x) + assert abs(x) == Abs(x) # Python built-in + assert Abs(x)**3 == x**2*Abs(x) + assert Abs(x)**4 == x**4 + assert ( + Abs(x)**(3*n)).args == (Abs(x), 3*n) # leave symbolic odd unchanged + assert (1/Abs(x)).args == (Abs(x), -1) + assert 1/Abs(x)**3 == 1/(x**2*Abs(x)) + assert Abs(x)**-3 == 
Abs(x)/(x**4) + assert Abs(x**3) == x**2*Abs(x) + assert Abs(I**I) == exp(-pi/2) + assert Abs((4 + 5*I)**(6 + 7*I)) == 68921*exp(-7*atan(Rational(5, 4))) + y = Symbol('y', real=True) + assert Abs(I**y) == 1 + y = Symbol('y') + assert Abs(I**y) == exp(-pi*im(y)/2) + + x = Symbol('x', imaginary=True) + assert Abs(x).diff(x) == -sign(x) + + eq = -sqrt(10 + 6*sqrt(3)) + sqrt(1 + sqrt(3)) + sqrt(3 + 3*sqrt(3)) + # if there is a fast way to know when you can and when you cannot prove an + # expression like this is zero then the equality to zero is ok + assert abs(eq).func is Abs or abs(eq) == 0 + # but sometimes it's hard to do this so it's better not to load + # abs down with tests that will be very slow + q = 1 + sqrt(2) - 2*sqrt(3) + 1331*sqrt(6) + p = expand(q**3)**Rational(1, 3) + d = p - q + assert abs(d).func is Abs or abs(d) == 0 + + assert Abs(4*exp(pi*I/4)) == 4 + assert Abs(3**(2 + I)) == 9 + assert Abs((-3)**(1 - I)) == 3*exp(pi) + + assert Abs(oo) is oo + assert Abs(-oo) is oo + assert Abs(oo + I) is oo + assert Abs(oo + I*oo) is oo + + a = Symbol('a', algebraic=True) + t = Symbol('t', transcendental=True) + x = Symbol('x') + assert re(a).is_algebraic + assert re(x).is_algebraic is None + assert re(t).is_algebraic is False + assert Abs(x).fdiff() == sign(x) + raises(ArgumentIndexError, lambda: Abs(x).fdiff(2)) + + # doesn't have recursion error + arg = sqrt(acos(1 - I)*acos(1 + I)) + assert abs(arg) == arg + + # special handling to put Abs in denom + assert abs(1/x) == 1/Abs(x) + e = abs(2/x**2) + assert e.is_Mul and e == 2/Abs(x**2) + assert unchanged(Abs, y/x) + assert unchanged(Abs, x/(x + 1)) + assert unchanged(Abs, x*y) + p = Symbol('p', positive=True) + assert abs(x/p) == abs(x)/p + + # coverage + assert unchanged(Abs, Symbol('x', real=True)**y) + # issue 19627 + f = Function('f', positive=True) + assert sqrt(f(x)**2) == f(x) + # issue 21625 + assert unchanged(Abs, S("im(acos(-i + acosh(-g + i)))")) + + +def test_Abs_rewrite(): + x = Symbol('x', real=True) + a = Abs(x).rewrite(Heaviside).expand() + assert a == x*Heaviside(x) - x*Heaviside(-x) + for i in [-2, -1, 0, 1, 2]: + assert a.subs(x, i) == abs(i) + y = Symbol('y') + assert Abs(y).rewrite(Heaviside) == Abs(y) + + x, y = Symbol('x', real=True), Symbol('y') + assert Abs(x).rewrite(Piecewise) == Piecewise((x, x >= 0), (-x, True)) + assert Abs(y).rewrite(Piecewise) == Abs(y) + assert Abs(y).rewrite(sign) == y/sign(y) + + i = Symbol('i', imaginary=True) + assert abs(i).rewrite(Piecewise) == Piecewise((I*i, I*i >= 0), (-I*i, True)) + + + assert Abs(y).rewrite(conjugate) == sqrt(y*conjugate(y)) + assert Abs(i).rewrite(conjugate) == sqrt(-i**2) # == -I*i + + y = Symbol('y', extended_real=True) + assert (Abs(exp(-I*x)-exp(-I*y))**2).rewrite(conjugate) == \ + -exp(I*x)*exp(-I*y) + 2 - exp(-I*x)*exp(I*y) + + +def test_Abs_real(): + # test some properties of abs that only apply + # to real numbers + x = Symbol('x', complex=True) + assert sqrt(x**2) != Abs(x) + assert Abs(x**2) != x**2 + + x = Symbol('x', real=True) + assert sqrt(x**2) == Abs(x) + assert Abs(x**2) == x**2 + + # if the symbol is zero, the following will still apply + nn = Symbol('nn', nonnegative=True, real=True) + np = Symbol('np', nonpositive=True, real=True) + assert Abs(nn) == nn + assert Abs(np) == -np + + +def test_Abs_properties(): + x = Symbol('x') + assert Abs(x).is_real is None + assert Abs(x).is_extended_real is True + assert Abs(x).is_rational is None + assert Abs(x).is_positive is None + assert Abs(x).is_nonnegative is None + assert 
Abs(x).is_extended_positive is None + assert Abs(x).is_extended_nonnegative is True + + f = Symbol('x', finite=True) + assert Abs(f).is_real is True + assert Abs(f).is_extended_real is True + assert Abs(f).is_rational is None + assert Abs(f).is_positive is None + assert Abs(f).is_nonnegative is True + assert Abs(f).is_extended_positive is None + assert Abs(f).is_extended_nonnegative is True + + z = Symbol('z', complex=True, zero=False) + assert Abs(z).is_real is True # since complex implies finite + assert Abs(z).is_extended_real is True + assert Abs(z).is_rational is None + assert Abs(z).is_positive is True + assert Abs(z).is_extended_positive is True + assert Abs(z).is_zero is False + + p = Symbol('p', positive=True) + assert Abs(p).is_real is True + assert Abs(p).is_extended_real is True + assert Abs(p).is_rational is None + assert Abs(p).is_positive is True + assert Abs(p).is_zero is False + + q = Symbol('q', rational=True) + assert Abs(q).is_real is True + assert Abs(q).is_rational is True + assert Abs(q).is_integer is None + assert Abs(q).is_positive is None + assert Abs(q).is_nonnegative is True + + i = Symbol('i', integer=True) + assert Abs(i).is_real is True + assert Abs(i).is_integer is True + assert Abs(i).is_positive is None + assert Abs(i).is_nonnegative is True + + e = Symbol('n', even=True) + ne = Symbol('ne', real=True, even=False) + assert Abs(e).is_even is True + assert Abs(ne).is_even is False + assert Abs(i).is_even is None + + o = Symbol('n', odd=True) + no = Symbol('no', real=True, odd=False) + assert Abs(o).is_odd is True + assert Abs(no).is_odd is False + assert Abs(i).is_odd is None + + +def test_abs(): + # this tests that abs calls Abs; don't rename to + # test_Abs since that test is already above + a = Symbol('a', positive=True) + assert abs(I*(1 + a)**2) == (1 + a)**2 + + +def test_arg(): + assert arg(0) is nan + assert arg(1) == 0 + assert arg(-1) == pi + assert arg(I) == pi/2 + assert arg(-I) == -pi/2 + assert arg(1 + I) == pi/4 + assert arg(-1 + I) == pi*Rational(3, 4) + assert arg(1 - I) == -pi/4 + assert arg(exp_polar(4*pi*I)) == 4*pi + assert arg(exp_polar(-7*pi*I)) == -7*pi + assert arg(exp_polar(5 - 3*pi*I/4)) == pi*Rational(-3, 4) + + assert arg(exp(I*pi/7)) == pi/7 # issue 17300 + assert arg(exp(16*I)) == 16 - 6*pi + assert arg(exp(13*I*pi/12)) == -11*pi/12 + assert arg(exp(123 - 5*I)) == -5 + 2*pi + assert arg(exp(sin(1 + 3*I))) == -2*pi + cos(1)*sinh(3) + r = Symbol('r', real=True) + assert arg(exp(r - 2*I)) == -2 + + f = Function('f') + assert not arg(f(0) + I*f(1)).atoms(re) + + # check nesting + x = Symbol('x') + assert arg(arg(arg(x))) is not S.NaN + assert arg(arg(arg(arg(x)))) is S.NaN + r = Symbol('r', extended_real=True) + assert arg(arg(r)) is not S.NaN + assert arg(arg(arg(r))) is S.NaN + + p = Function('p', extended_positive=True) + assert arg(p(x)) == 0 + assert arg((3 + I)*p(x)) == arg(3 + I) + + p = Symbol('p', positive=True) + assert arg(p) == 0 + assert arg(p*I) == pi/2 + + n = Symbol('n', negative=True) + assert arg(n) == pi + assert arg(n*I) == -pi/2 + + x = Symbol('x') + assert conjugate(arg(x)) == arg(x) + + e = p + I*p**2 + assert arg(e) == arg(1 + p*I) + # make sure sign doesn't swap + e = -2*p + 4*I*p**2 + assert arg(e) == arg(-1 + 2*p*I) + # make sure sign isn't lost + x = symbols('x', real=True) # could be zero + e = x + I*x + assert arg(e) == arg(x*(1 + I)) + assert arg(e/p) == arg(x*(1 + I)) + e = p*cos(p) + I*log(p)*exp(p) + assert arg(e).args[0] == e + # keep it simple -- let the user do more advanced cancellation + e = 
(p + 1) + I*(p**2 - 1) + assert arg(e).args[0] == e + + f = Function('f') + e = 2*x*(f(0) - 1) - 2*x*f(0) + assert arg(e) == arg(-2*x) + assert arg(f(0)).func == arg and arg(f(0)).args == (f(0),) + + +def test_arg_rewrite(): + assert arg(1 + I) == atan2(1, 1) + + x = Symbol('x', real=True) + y = Symbol('y', real=True) + assert arg(x + I*y).rewrite(atan2) == atan2(y, x) + + +def test_arg_leading_term_and_series(): + x = Symbol('x') + assert arg(x).as_leading_term(x, cdir = 1) == 0 + assert arg(x).as_leading_term(x, cdir = -1) == pi + raises(PoleError, lambda: arg(x + I).as_leading_term(x, cdir = 1)) + raises(PoleError, lambda: arg(2*x).as_leading_term(x, cdir = I)) + + assert arg(x).nseries(x) == 0 + assert arg(x).nseries(x, n=0) == Order(1) + + +def test_adjoint(): + a = Symbol('a', antihermitian=True) + b = Symbol('b', hermitian=True) + assert adjoint(a) == -a + assert adjoint(I*a) == I*a + assert adjoint(b) == b + assert adjoint(I*b) == -I*b + assert adjoint(a*b) == -b*a + assert adjoint(I*a*b) == I*b*a + + x, y = symbols('x y') + assert adjoint(adjoint(x)) == x + assert adjoint(x + y) == conjugate(x) + conjugate(y) + assert adjoint(x - y) == conjugate(x) - conjugate(y) + assert adjoint(x * y) == conjugate(x) * conjugate(y) + assert adjoint(x / y) == conjugate(x) / conjugate(y) + assert adjoint(-x) == -conjugate(x) + + x, y = symbols('x y', commutative=False) + assert adjoint(adjoint(x)) == x + assert adjoint(x + y) == adjoint(x) + adjoint(y) + assert adjoint(x - y) == adjoint(x) - adjoint(y) + assert adjoint(x * y) == adjoint(y) * adjoint(x) + assert adjoint(x / y) == 1 / adjoint(y) * adjoint(x) + assert adjoint(-x) == -adjoint(x) + + +def test_conjugate(): + a = Symbol('a', real=True) + b = Symbol('b', imaginary=True) + assert conjugate(a) == a + assert conjugate(I*a) == -I*a + assert conjugate(b) == -b + assert conjugate(I*b) == I*b + assert conjugate(a*b) == -a*b + assert conjugate(I*a*b) == I*a*b + + x, y = symbols('x y') + assert conjugate(conjugate(x)) == x + assert conjugate(x).inverse() == conjugate + assert conjugate(x + y) == conjugate(x) + conjugate(y) + assert conjugate(x - y) == conjugate(x) - conjugate(y) + assert conjugate(x * y) == conjugate(x) * conjugate(y) + assert conjugate(x / y) == conjugate(x) / conjugate(y) + assert conjugate(-x) == -conjugate(x) + + a = Symbol('a', algebraic=True) + t = Symbol('t', transcendental=True) + assert re(a).is_algebraic + assert re(x).is_algebraic is None + assert re(t).is_algebraic is False + + +def test_conjugate_transpose(): + x = Symbol('x', commutative=False) + assert conjugate(transpose(x)) == adjoint(x) + assert transpose(conjugate(x)) == adjoint(x) + assert adjoint(transpose(x)) == conjugate(x) + assert transpose(adjoint(x)) == conjugate(x) + assert adjoint(conjugate(x)) == transpose(x) + assert conjugate(adjoint(x)) == transpose(x) + + x = Symbol('x') + assert conjugate(x) == adjoint(x) + assert transpose(x) == x + + +def test_transpose(): + a = Symbol('a', complex=True) + assert transpose(a) == a + assert transpose(I*a) == I*a + + x, y = symbols('x y') + assert transpose(transpose(x)) == x + assert transpose(x + y) == x + y + assert transpose(x - y) == x - y + assert transpose(x * y) == x * y + assert transpose(x / y) == x / y + assert transpose(-x) == -x + + x, y = symbols('x y', commutative=False) + assert transpose(transpose(x)) == x + assert transpose(x + y) == transpose(x) + transpose(y) + assert transpose(x - y) == transpose(x) - transpose(y) + assert transpose(x * y) == transpose(y) * transpose(x) + assert transpose(x 
/ y) == 1 / transpose(y) * transpose(x) + assert transpose(-x) == -transpose(x) + + +@_both_exp_pow +def test_polarify(): + from sympy.functions.elementary.complexes import (polar_lift, polarify) + x = Symbol('x') + z = Symbol('z', polar=True) + f = Function('f') + ES = {} + + assert polarify(-1) == (polar_lift(-1), ES) + assert polarify(1 + I) == (polar_lift(1 + I), ES) + + assert polarify(exp(x), subs=False) == exp(x) + assert polarify(1 + x, subs=False) == 1 + x + assert polarify(f(I) + x, subs=False) == f(polar_lift(I)) + x + + assert polarify(x, lift=True) == polar_lift(x) + assert polarify(z, lift=True) == z + assert polarify(f(x), lift=True) == f(polar_lift(x)) + assert polarify(1 + x, lift=True) == polar_lift(1 + x) + assert polarify(1 + f(x), lift=True) == polar_lift(1 + f(polar_lift(x))) + + newex, subs = polarify(f(x) + z) + assert newex.subs(subs) == f(x) + z + + mu = Symbol("mu") + sigma = Symbol("sigma", positive=True) + + # Make sure polarify(lift=True) doesn't try to lift the integration + # variable + assert polarify( + Integral(sqrt(2)*x*exp(-(-mu + x)**2/(2*sigma**2))/(2*sqrt(pi)*sigma), + (x, -oo, oo)), lift=True) == Integral(sqrt(2)*(sigma*exp_polar(0))**exp_polar(I*pi)* + exp((sigma*exp_polar(0))**(2*exp_polar(I*pi))*exp_polar(I*pi)*polar_lift(-mu + x)** + (2*exp_polar(0))/2)*exp_polar(0)*polar_lift(x)/(2*sqrt(pi)), (x, -oo, oo)) + + +def test_unpolarify(): + from sympy.functions.elementary.complexes import (polar_lift, principal_branch, unpolarify) + from sympy.core.relational import Ne + from sympy.functions.elementary.hyperbolic import tanh + from sympy.functions.special.error_functions import erf + from sympy.functions.special.gamma_functions import (gamma, uppergamma) + from sympy.abc import x + p = exp_polar(7*I) + 1 + u = exp(7*I) + 1 + + assert unpolarify(1) == 1 + assert unpolarify(p) == u + assert unpolarify(p**2) == u**2 + assert unpolarify(p**x) == p**x + assert unpolarify(p*x) == u*x + assert unpolarify(p + x) == u + x + assert unpolarify(sqrt(sin(p))) == sqrt(sin(u)) + + # Test reduction to principal branch 2*pi. + t = principal_branch(x, 2*pi) + assert unpolarify(t) == x + assert unpolarify(sqrt(t)) == sqrt(t) + + # Test exponents_only. + assert unpolarify(p**p, exponents_only=True) == p**u + assert unpolarify(uppergamma(x, p**p)) == uppergamma(x, p**u) + + # Test functions. 
+ assert unpolarify(sin(p)) == sin(u) + assert unpolarify(tanh(p)) == tanh(u) + assert unpolarify(gamma(p)) == gamma(u) + assert unpolarify(erf(p)) == erf(u) + assert unpolarify(uppergamma(x, p)) == uppergamma(x, p) + + assert unpolarify(uppergamma(sin(p), sin(p + exp_polar(0)))) == \ + uppergamma(sin(u), sin(u + 1)) + assert unpolarify(uppergamma(polar_lift(0), 2*exp_polar(0))) == \ + uppergamma(0, 2) + + assert unpolarify(Eq(p, 0)) == Eq(u, 0) + assert unpolarify(Ne(p, 0)) == Ne(u, 0) + assert unpolarify(polar_lift(x) > 0) == (x > 0) + + # Test bools + assert unpolarify(True) is True + + +def test_issue_4035(): + x = Symbol('x') + assert Abs(x).expand(trig=True) == Abs(x) + assert sign(x).expand(trig=True) == sign(x) + assert arg(x).expand(trig=True) == arg(x) + + +def test_issue_3206(): + x = Symbol('x') + assert Abs(Abs(x)) == Abs(x) + + +def test_issue_4754_derivative_conjugate(): + x = Symbol('x', real=True) + y = Symbol('y', imaginary=True) + f = Function('f') + assert (f(x).conjugate()).diff(x) == (f(x).diff(x)).conjugate() + assert (f(y).conjugate()).diff(y) == -(f(y).diff(y)).conjugate() + + +def test_derivatives_issue_4757(): + x = Symbol('x', real=True) + y = Symbol('y', imaginary=True) + f = Function('f') + assert re(f(x)).diff(x) == re(f(x).diff(x)) + assert im(f(x)).diff(x) == im(f(x).diff(x)) + assert re(f(y)).diff(y) == -I*im(f(y).diff(y)) + assert im(f(y)).diff(y) == -I*re(f(y).diff(y)) + assert Abs(f(x)).diff(x).subs(f(x), 1 + I*x).doit() == x/sqrt(1 + x**2) + assert arg(f(x)).diff(x).subs(f(x), 1 + I*x**2).doit() == 2*x/(1 + x**4) + assert Abs(f(y)).diff(y).subs(f(y), 1 + y).doit() == -y/sqrt(1 - y**2) + assert arg(f(y)).diff(y).subs(f(y), I + y**2).doit() == 2*y/(1 + y**4) + + +def test_issue_11413(): + from sympy.simplify.simplify import simplify + v0 = Symbol('v0') + v1 = Symbol('v1') + v2 = Symbol('v2') + V = Matrix([[v0],[v1],[v2]]) + U = V.normalized() + assert U == Matrix([ + [v0/sqrt(Abs(v0)**2 + Abs(v1)**2 + Abs(v2)**2)], + [v1/sqrt(Abs(v0)**2 + Abs(v1)**2 + Abs(v2)**2)], + [v2/sqrt(Abs(v0)**2 + Abs(v1)**2 + Abs(v2)**2)]]) + U.norm = sqrt(v0**2/(v0**2 + v1**2 + v2**2) + v1**2/(v0**2 + v1**2 + v2**2) + v2**2/(v0**2 + v1**2 + v2**2)) + assert simplify(U.norm) == 1 + + +def test_periodic_argument(): + from sympy.functions.elementary.complexes import (periodic_argument, polar_lift, principal_branch, unbranched_argument) + x = Symbol('x') + p = Symbol('p', positive=True) + + assert unbranched_argument(2 + I) == periodic_argument(2 + I, oo) + assert unbranched_argument(1 + x) == periodic_argument(1 + x, oo) + assert N_equals(unbranched_argument((1 + I)**2), pi/2) + assert N_equals(unbranched_argument((1 - I)**2), -pi/2) + assert N_equals(periodic_argument((1 + I)**2, 3*pi), pi/2) + assert N_equals(periodic_argument((1 - I)**2, 3*pi), -pi/2) + + assert unbranched_argument(principal_branch(x, pi)) == \ + periodic_argument(x, pi) + + assert unbranched_argument(polar_lift(2 + I)) == unbranched_argument(2 + I) + assert periodic_argument(polar_lift(2 + I), 2*pi) == \ + periodic_argument(2 + I, 2*pi) + assert periodic_argument(polar_lift(2 + I), 3*pi) == \ + periodic_argument(2 + I, 3*pi) + assert periodic_argument(polar_lift(2 + I), pi) == \ + periodic_argument(polar_lift(2 + I), pi) + + assert unbranched_argument(polar_lift(1 + I)) == pi/4 + assert periodic_argument(2*p, p) == periodic_argument(p, p) + assert periodic_argument(pi*p, p) == periodic_argument(p, p) + + assert Abs(polar_lift(1 + I)) == Abs(1 + I) + + +@XFAIL +def test_principal_branch_fail(): + # TODO XXX why 
does abs(x)._eval_evalf() not fall back to global evalf? + from sympy.functions.elementary.complexes import principal_branch + assert N_equals(principal_branch((1 + I)**2, pi/2), 0) + + +def test_principal_branch(): + from sympy.functions.elementary.complexes import (polar_lift, principal_branch) + p = Symbol('p', positive=True) + x = Symbol('x') + neg = Symbol('x', negative=True) + + assert principal_branch(polar_lift(x), p) == principal_branch(x, p) + assert principal_branch(polar_lift(2 + I), p) == principal_branch(2 + I, p) + assert principal_branch(2*x, p) == 2*principal_branch(x, p) + assert principal_branch(1, pi) == exp_polar(0) + assert principal_branch(-1, 2*pi) == exp_polar(I*pi) + assert principal_branch(-1, pi) == exp_polar(0) + assert principal_branch(exp_polar(3*pi*I)*x, 2*pi) == \ + principal_branch(exp_polar(I*pi)*x, 2*pi) + assert principal_branch(neg*exp_polar(pi*I), 2*pi) == neg*exp_polar(-I*pi) + # related to issue #14692 + assert principal_branch(exp_polar(-I*pi/2)/polar_lift(neg), 2*pi) == \ + exp_polar(-I*pi/2)/neg + + assert N_equals(principal_branch((1 + I)**2, 2*pi), 2*I) + assert N_equals(principal_branch((1 + I)**2, 3*pi), 2*I) + assert N_equals(principal_branch((1 + I)**2, 1*pi), 2*I) + + # test argument sanitization + assert principal_branch(x, I).func is principal_branch + assert principal_branch(x, -4).func is principal_branch + assert principal_branch(x, -oo).func is principal_branch + assert principal_branch(x, zoo).func is principal_branch + + +@XFAIL +def test_issue_6167_6151(): + n = pi**1000 + i = int(n) + assert sign(n - i) == 1 + assert abs(n - i) == n - i + x = Symbol('x') + eps = pi**-1500 + big = pi**1000 + one = cos(x)**2 + sin(x)**2 + e = big*one - big + eps + from sympy.simplify.simplify import simplify + assert sign(simplify(e)) == 1 + for xi in (111, 11, 1, Rational(1, 10)): + assert sign(e.subs(x, xi)) == 1 + + +def test_issue_14216(): + from sympy.functions.elementary.complexes import unpolarify + A = MatrixSymbol("A", 2, 2) + assert unpolarify(A[0, 0]) == A[0, 0] + assert unpolarify(A[0, 0]*A[1, 0]) == A[0, 0]*A[1, 0] + + +def test_issue_14238(): + # doesn't cause recursion error + r = Symbol('r', real=True) + assert Abs(r + Piecewise((0, r > 0), (1 - r, True))) + + +def test_issue_22189(): + x = Symbol('x') + for a in (sqrt(7 - 2*x) - 2, 1 - x): + assert Abs(a) - Abs(-a) == 0, a + + +def test_zero_assumptions(): + nr = Symbol('nonreal', real=False, finite=True) + ni = Symbol('nonimaginary', imaginary=False) + # imaginary implies not zero + nzni = Symbol('nonzerononimaginary', zero=False, imaginary=False) + + assert re(nr).is_zero is None + assert im(nr).is_zero is False + + assert re(ni).is_zero is None + assert im(ni).is_zero is None + + assert re(nzni).is_zero is False + assert im(nzni).is_zero is None + + +@_both_exp_pow +def test_issue_15893(): + f = Function('f', real=True) + x = Symbol('x', real=True) + eq = Derivative(Abs(f(x)), f(x)) + assert eq.doit() == sign(f(x)) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/elementary/tests/test_miscellaneous.py b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/elementary/tests/test_miscellaneous.py new file mode 100644 index 0000000000000000000000000000000000000000..374c4fb50eaae54a9884015c124c245385e1761e --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/elementary/tests/test_miscellaneous.py @@ -0,0 +1,504 @@ +import itertools as it + +from sympy.core.expr import unchanged +from sympy.core.function import Function 
+from sympy.core.numbers import I, oo, Rational +from sympy.core.power import Pow +from sympy.core.singleton import S +from sympy.core.symbol import Symbol +from sympy.external import import_module +from sympy.functions.elementary.exponential import log +from sympy.functions.elementary.integers import floor, ceiling +from sympy.functions.elementary.miscellaneous import (sqrt, cbrt, root, Min, + Max, real_root, Rem) +from sympy.functions.elementary.trigonometric import cos, sin +from sympy.functions.special.delta_functions import Heaviside + +from sympy.utilities.lambdify import lambdify +from sympy.testing.pytest import raises, skip, ignore_warnings + +def test_Min(): + from sympy.abc import x, y, z + n = Symbol('n', negative=True) + n_ = Symbol('n_', negative=True) + nn = Symbol('nn', nonnegative=True) + nn_ = Symbol('nn_', nonnegative=True) + p = Symbol('p', positive=True) + p_ = Symbol('p_', positive=True) + np = Symbol('np', nonpositive=True) + np_ = Symbol('np_', nonpositive=True) + r = Symbol('r', real=True) + + assert Min(5, 4) == 4 + assert Min(-oo, -oo) is -oo + assert Min(-oo, n) is -oo + assert Min(n, -oo) is -oo + assert Min(-oo, np) is -oo + assert Min(np, -oo) is -oo + assert Min(-oo, 0) is -oo + assert Min(0, -oo) is -oo + assert Min(-oo, nn) is -oo + assert Min(nn, -oo) is -oo + assert Min(-oo, p) is -oo + assert Min(p, -oo) is -oo + assert Min(-oo, oo) is -oo + assert Min(oo, -oo) is -oo + assert Min(n, n) == n + assert unchanged(Min, n, np) + assert Min(np, n) == Min(n, np) + assert Min(n, 0) == n + assert Min(0, n) == n + assert Min(n, nn) == n + assert Min(nn, n) == n + assert Min(n, p) == n + assert Min(p, n) == n + assert Min(n, oo) == n + assert Min(oo, n) == n + assert Min(np, np) == np + assert Min(np, 0) == np + assert Min(0, np) == np + assert Min(np, nn) == np + assert Min(nn, np) == np + assert Min(np, p) == np + assert Min(p, np) == np + assert Min(np, oo) == np + assert Min(oo, np) == np + assert Min(0, 0) == 0 + assert Min(0, nn) == 0 + assert Min(nn, 0) == 0 + assert Min(0, p) == 0 + assert Min(p, 0) == 0 + assert Min(0, oo) == 0 + assert Min(oo, 0) == 0 + assert Min(nn, nn) == nn + assert unchanged(Min, nn, p) + assert Min(p, nn) == Min(nn, p) + assert Min(nn, oo) == nn + assert Min(oo, nn) == nn + assert Min(p, p) == p + assert Min(p, oo) == p + assert Min(oo, p) == p + assert Min(oo, oo) is oo + + assert Min(n, n_).func is Min + assert Min(nn, nn_).func is Min + assert Min(np, np_).func is Min + assert Min(p, p_).func is Min + + # lists + assert Min() is S.Infinity + assert Min(x) == x + assert Min(x, y) == Min(y, x) + assert Min(x, y, z) == Min(z, y, x) + assert Min(x, Min(y, z)) == Min(z, y, x) + assert Min(x, Max(y, -oo)) == Min(x, y) + assert Min(p, oo, n, p, p, p_) == n + assert Min(p_, n_, p) == n_ + assert Min(n, oo, -7, p, p, 2) == Min(n, -7) + assert Min(2, x, p, n, oo, n_, p, 2, -2, -2) == Min(-2, x, n, n_) + assert Min(0, x, 1, y) == Min(0, x, y) + assert Min(1000, 100, -100, x, p, n) == Min(n, x, -100) + assert unchanged(Min, sin(x), cos(x)) + assert Min(sin(x), cos(x)) == Min(cos(x), sin(x)) + assert Min(cos(x), sin(x)).subs(x, 1) == cos(1) + assert Min(cos(x), sin(x)).subs(x, S.Half) == sin(S.Half) + raises(ValueError, lambda: Min(cos(x), sin(x)).subs(x, I)) + raises(ValueError, lambda: Min(I)) + raises(ValueError, lambda: Min(I, x)) + raises(ValueError, lambda: Min(S.ComplexInfinity, x)) + + assert Min(1, x).diff(x) == Heaviside(1 - x) + assert Min(x, 1).diff(x) == Heaviside(1 - x) + assert Min(0, -x, 1 - 2*x).diff(x) == -Heaviside(x + 
Min(0, -2*x + 1)) \ + - 2*Heaviside(2*x + Min(0, -x) - 1) + + # issue 7619 + f = Function('f') + assert Min(1, 2*Min(f(1), 2)) # doesn't fail + + # issue 7233 + e = Min(0, x) + assert e.n().args == (0, x) + + # issue 8643 + m = Min(n, p_, n_, r) + assert m.is_positive is False + assert m.is_nonnegative is False + assert m.is_negative is True + + m = Min(p, p_) + assert m.is_positive is True + assert m.is_nonnegative is True + assert m.is_negative is False + + m = Min(p, nn_, p_) + assert m.is_positive is None + assert m.is_nonnegative is True + assert m.is_negative is False + + m = Min(nn, p, r) + assert m.is_positive is None + assert m.is_nonnegative is None + assert m.is_negative is None + + +def test_Max(): + from sympy.abc import x, y, z + n = Symbol('n', negative=True) + n_ = Symbol('n_', negative=True) + nn = Symbol('nn', nonnegative=True) + p = Symbol('p', positive=True) + p_ = Symbol('p_', positive=True) + r = Symbol('r', real=True) + + assert Max(5, 4) == 5 + + # lists + + assert Max() is S.NegativeInfinity + assert Max(x) == x + assert Max(x, y) == Max(y, x) + assert Max(x, y, z) == Max(z, y, x) + assert Max(x, Max(y, z)) == Max(z, y, x) + assert Max(x, Min(y, oo)) == Max(x, y) + assert Max(n, -oo, n_, p, 2) == Max(p, 2) + assert Max(n, -oo, n_, p) == p + assert Max(2, x, p, n, -oo, S.NegativeInfinity, n_, p, 2) == Max(2, x, p) + assert Max(0, x, 1, y) == Max(1, x, y) + assert Max(r, r + 1, r - 1) == 1 + r + assert Max(1000, 100, -100, x, p, n) == Max(p, x, 1000) + assert Max(cos(x), sin(x)) == Max(sin(x), cos(x)) + assert Max(cos(x), sin(x)).subs(x, 1) == sin(1) + assert Max(cos(x), sin(x)).subs(x, S.Half) == cos(S.Half) + raises(ValueError, lambda: Max(cos(x), sin(x)).subs(x, I)) + raises(ValueError, lambda: Max(I)) + raises(ValueError, lambda: Max(I, x)) + raises(ValueError, lambda: Max(S.ComplexInfinity, 1)) + assert Max(n, -oo, n_, p, 2) == Max(p, 2) + assert Max(n, -oo, n_, p, 1000) == Max(p, 1000) + + assert Max(1, x).diff(x) == Heaviside(x - 1) + assert Max(x, 1).diff(x) == Heaviside(x - 1) + assert Max(x**2, 1 + x, 1).diff(x) == \ + 2*x*Heaviside(x**2 - Max(1, x + 1)) \ + + Heaviside(x - Max(1, x**2) + 1) + + e = Max(0, x) + assert e.n().args == (0, x) + + # issue 8643 + m = Max(p, p_, n, r) + assert m.is_positive is True + assert m.is_nonnegative is True + assert m.is_negative is False + + m = Max(n, n_) + assert m.is_positive is False + assert m.is_nonnegative is False + assert m.is_negative is True + + m = Max(n, n_, r) + assert m.is_positive is None + assert m.is_nonnegative is None + assert m.is_negative is None + + m = Max(n, nn, r) + assert m.is_positive is None + assert m.is_nonnegative is True + assert m.is_negative is False + + +def test_minmax_assumptions(): + r = Symbol('r', real=True) + a = Symbol('a', real=True, algebraic=True) + t = Symbol('t', real=True, transcendental=True) + q = Symbol('q', rational=True) + p = Symbol('p', irrational=True) + n = Symbol('n', rational=True, integer=False) + i = Symbol('i', integer=True) + o = Symbol('o', odd=True) + e = Symbol('e', even=True) + k = Symbol('k', prime=True) + reals = [r, a, t, q, p, n, i, o, e, k] + + for ext in (Max, Min): + for x, y in it.product(reals, repeat=2): + + # Must be real + assert ext(x, y).is_real + + # Algebraic? + if x.is_algebraic and y.is_algebraic: + assert ext(x, y).is_algebraic + elif x.is_transcendental and y.is_transcendental: + assert ext(x, y).is_transcendental + else: + assert ext(x, y).is_algebraic is None + + # Rational? 
+ if x.is_rational and y.is_rational: + assert ext(x, y).is_rational + elif x.is_irrational and y.is_irrational: + assert ext(x, y).is_irrational + else: + assert ext(x, y).is_rational is None + + # Integer? + if x.is_integer and y.is_integer: + assert ext(x, y).is_integer + elif x.is_noninteger and y.is_noninteger: + assert ext(x, y).is_noninteger + else: + assert ext(x, y).is_integer is None + + # Odd? + if x.is_odd and y.is_odd: + assert ext(x, y).is_odd + elif x.is_odd is False and y.is_odd is False: + assert ext(x, y).is_odd is False + else: + assert ext(x, y).is_odd is None + + # Even? + if x.is_even and y.is_even: + assert ext(x, y).is_even + elif x.is_even is False and y.is_even is False: + assert ext(x, y).is_even is False + else: + assert ext(x, y).is_even is None + + # Prime? + if x.is_prime and y.is_prime: + assert ext(x, y).is_prime + elif x.is_prime is False and y.is_prime is False: + assert ext(x, y).is_prime is False + else: + assert ext(x, y).is_prime is None + + +def test_issue_8413(): + x = Symbol('x', real=True) + # we can't evaluate in general because non-reals are not + # comparable: Min(floor(3.2 + I), 3.2 + I) -> ValueError + assert Min(floor(x), x) == floor(x) + assert Min(ceiling(x), x) == x + assert Max(floor(x), x) == x + assert Max(ceiling(x), x) == ceiling(x) + + +def test_root(): + from sympy.abc import x + n = Symbol('n', integer=True) + k = Symbol('k', integer=True) + + assert root(2, 2) == sqrt(2) + assert root(2, 1) == 2 + assert root(2, 3) == 2**Rational(1, 3) + assert root(2, 3) == cbrt(2) + assert root(2, -5) == 2**Rational(4, 5)/2 + + assert root(-2, 1) == -2 + + assert root(-2, 2) == sqrt(2)*I + assert root(-2, 1) == -2 + + assert root(x, 2) == sqrt(x) + assert root(x, 1) == x + assert root(x, 3) == x**Rational(1, 3) + assert root(x, 3) == cbrt(x) + assert root(x, -5) == x**Rational(-1, 5) + + assert root(x, n) == x**(1/n) + assert root(x, -n) == x**(-1/n) + + assert root(x, n, k) == (-1)**(2*k/n)*x**(1/n) + + +def test_real_root(): + assert real_root(-8, 3) == -2 + assert real_root(-16, 4) == root(-16, 4) + r = root(-7, 4) + assert real_root(r) == r + r1 = root(-1, 3) + r2 = r1**2 + r3 = root(-1, 4) + assert real_root(r1 + r2 + r3) == -1 + r2 + r3 + assert real_root(root(-2, 3)) == -root(2, 3) + assert real_root(-8., 3) == -2.0 + x = Symbol('x') + n = Symbol('n') + g = real_root(x, n) + assert g.subs({"x": -8, "n": 3}) == -2 + assert g.subs({"x": 8, "n": 3}) == 2 + # give principle root if there is no real root -- if this is not desired + # then maybe a Root class is needed to raise an error instead + assert g.subs({"x": I, "n": 3}) == cbrt(I) + assert g.subs({"x": -8, "n": 2}) == sqrt(-8) + assert g.subs({"x": I, "n": 2}) == sqrt(I) + + +def test_issue_11463(): + numpy = import_module('numpy') + if not numpy: + skip("numpy not installed.") + x = Symbol('x') + f = lambdify(x, real_root((log(x/(x-2))), 3), 'numpy') + # numpy.select evaluates all options before considering conditions, + # so it raises a warning about root of negative number which does + # not affect the outcome. 
This warning is suppressed here + with ignore_warnings(RuntimeWarning): + assert f(numpy.array(-1)) < -1 + + +def test_rewrite_MaxMin_as_Heaviside(): + from sympy.abc import x + assert Max(0, x).rewrite(Heaviside) == x*Heaviside(x) + assert Max(3, x).rewrite(Heaviside) == x*Heaviside(x - 3) + \ + 3*Heaviside(-x + 3) + assert Max(0, x+2, 2*x).rewrite(Heaviside) == \ + 2*x*Heaviside(2*x)*Heaviside(x - 2) + \ + (x + 2)*Heaviside(-x + 2)*Heaviside(x + 2) + + assert Min(0, x).rewrite(Heaviside) == x*Heaviside(-x) + assert Min(3, x).rewrite(Heaviside) == x*Heaviside(-x + 3) + \ + 3*Heaviside(x - 3) + assert Min(x, -x, -2).rewrite(Heaviside) == \ + x*Heaviside(-2*x)*Heaviside(-x - 2) - \ + x*Heaviside(2*x)*Heaviside(x - 2) \ + - 2*Heaviside(-x + 2)*Heaviside(x + 2) + + +def test_rewrite_MaxMin_as_Piecewise(): + from sympy.core.symbol import symbols + from sympy.functions.elementary.piecewise import Piecewise + x, y, z, a, b = symbols('x y z a b', real=True) + vx, vy, va = symbols('vx vy va') + assert Max(a, b).rewrite(Piecewise) == Piecewise((a, a >= b), (b, True)) + assert Max(x, y, z).rewrite(Piecewise) == Piecewise((x, (x >= y) & (x >= z)), (y, y >= z), (z, True)) + assert Max(x, y, a, b).rewrite(Piecewise) == Piecewise((a, (a >= b) & (a >= x) & (a >= y)), + (b, (b >= x) & (b >= y)), (x, x >= y), (y, True)) + assert Min(a, b).rewrite(Piecewise) == Piecewise((a, a <= b), (b, True)) + assert Min(x, y, z).rewrite(Piecewise) == Piecewise((x, (x <= y) & (x <= z)), (y, y <= z), (z, True)) + assert Min(x, y, a, b).rewrite(Piecewise) == Piecewise((a, (a <= b) & (a <= x) & (a <= y)), + (b, (b <= x) & (b <= y)), (x, x <= y), (y, True)) + + # Piecewise rewriting of Min/Max does also takes place for not explicitly real arguments + assert Max(vx, vy).rewrite(Piecewise) == Piecewise((vx, vx >= vy), (vy, True)) + assert Min(va, vx, vy).rewrite(Piecewise) == Piecewise((va, (va <= vx) & (va <= vy)), (vx, vx <= vy), (vy, True)) + + +def test_issue_11099(): + from sympy.abc import x, y + # some fixed value tests + fixed_test_data = {x: -2, y: 3} + assert Min(x, y).evalf(subs=fixed_test_data) == \ + Min(x, y).subs(fixed_test_data).evalf() + assert Max(x, y).evalf(subs=fixed_test_data) == \ + Max(x, y).subs(fixed_test_data).evalf() + # randomly generate some test data + from sympy.core.random import randint + for i in range(20): + random_test_data = {x: randint(-100, 100), y: randint(-100, 100)} + assert Min(x, y).evalf(subs=random_test_data) == \ + Min(x, y).subs(random_test_data).evalf() + assert Max(x, y).evalf(subs=random_test_data) == \ + Max(x, y).subs(random_test_data).evalf() + + +def test_issue_12638(): + from sympy.abc import a, b, c + assert Min(a, b, c, Max(a, b)) == Min(a, b, c) + assert Min(a, b, Max(a, b, c)) == Min(a, b) + assert Min(a, b, Max(a, c)) == Min(a, b) + +def test_issue_21399(): + from sympy.abc import a, b, c + assert Max(Min(a, b), Min(a, b, c)) == Min(a, b) + + +def test_instantiation_evaluation(): + from sympy.abc import v, w, x, y, z + assert Min(1, Max(2, x)) == 1 + assert Max(3, Min(2, x)) == 3 + assert Min(Max(x, y), Max(x, z)) == Max(x, Min(y, z)) + assert set(Min(Max(w, x), Max(y, z)).args) == { + Max(w, x), Max(y, z)} + assert Min(Max(x, y), Max(x, z), w) == Min( + w, Max(x, Min(y, z))) + A, B = Min, Max + for i in range(2): + assert A(x, B(x, y)) == x + assert A(x, B(y, A(x, w, z))) == A(x, B(y, A(w, z))) + A, B = B, A + assert Min(w, Max(x, y), Max(v, x, z)) == Min( + w, Max(x, Min(y, Max(v, z)))) + +def test_rewrite_as_Abs(): + from itertools import permutations + from 
sympy.functions.elementary.complexes import Abs + from sympy.abc import x, y, z, w + def test(e): + free = e.free_symbols + a = e.rewrite(Abs) + assert not a.has(Min, Max) + for i in permutations(range(len(free))): + reps = dict(zip(free, i)) + assert a.xreplace(reps) == e.xreplace(reps) + test(Min(x, y)) + test(Max(x, y)) + test(Min(x, y, z)) + test(Min(Max(w, x), Max(y, z))) + +def test_issue_14000(): + assert isinstance(sqrt(4, evaluate=False), Pow) == True + assert isinstance(cbrt(3.5, evaluate=False), Pow) == True + assert isinstance(root(16, 4, evaluate=False), Pow) == True + + assert sqrt(4, evaluate=False) == Pow(4, S.Half, evaluate=False) + assert cbrt(3.5, evaluate=False) == Pow(3.5, Rational(1, 3), evaluate=False) + assert root(4, 2, evaluate=False) == Pow(4, S.Half, evaluate=False) + + assert root(16, 4, 2, evaluate=False).has(Pow) == True + assert real_root(-8, 3, evaluate=False).has(Pow) == True + +def test_issue_6899(): + from sympy.core.function import Lambda + x = Symbol('x') + eqn = Lambda(x, x) + assert eqn.func(*eqn.args) == eqn + +def test_Rem(): + from sympy.abc import x, y + assert Rem(5, 3) == 2 + assert Rem(-5, 3) == -2 + assert Rem(5, -3) == 2 + assert Rem(-5, -3) == -2 + assert Rem(x**3, y) == Rem(x**3, y) + assert Rem(Rem(-5, 3) + 3, 3) == 1 + + +def test_minmax_no_evaluate(): + from sympy import evaluate + p = Symbol('p', positive=True) + + assert Max(1, 3) == 3 + assert Max(1, 3).args == () + assert Max(0, p) == p + assert Max(0, p).args == () + assert Min(0, p) == 0 + assert Min(0, p).args == () + + assert Max(1, 3, evaluate=False) != 3 + assert Max(1, 3, evaluate=False).args == (1, 3) + assert Max(0, p, evaluate=False) != p + assert Max(0, p, evaluate=False).args == (0, p) + assert Min(0, p, evaluate=False) != 0 + assert Min(0, p, evaluate=False).args == (0, p) + + with evaluate(False): + assert Max(1, 3) != 3 + assert Max(1, 3).args == (1, 3) + assert Max(0, p) != p + assert Max(0, p).args == (0, p) + assert Min(0, p) != 0 + assert Min(0, p).args == (0, p) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/elementary/trigonometric.py b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/elementary/trigonometric.py new file mode 100644 index 0000000000000000000000000000000000000000..24e5db81f17a215f5b344291f0e9bf4752e5317d --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/elementary/trigonometric.py @@ -0,0 +1,3627 @@ +from __future__ import annotations +from sympy.core.add import Add +from sympy.core.cache import cacheit +from sympy.core.expr import Expr +from sympy.core.function import DefinedFunction, ArgumentIndexError, PoleError, expand_mul +from sympy.core.logic import fuzzy_not, fuzzy_or, FuzzyBool, fuzzy_and +from sympy.core.mod import Mod +from sympy.core.numbers import Rational, pi, Integer, Float, equal_valued +from sympy.core.relational import Ne, Eq +from sympy.core.singleton import S +from sympy.core.symbol import Symbol, Dummy +from sympy.core.sympify import sympify +from sympy.functions.combinatorial.factorials import factorial, RisingFactorial +from sympy.functions.combinatorial.numbers import bernoulli, euler +from sympy.functions.elementary.complexes import arg as arg_f, im, re +from sympy.functions.elementary.exponential import log, exp +from sympy.functions.elementary.integers import floor +from sympy.functions.elementary.miscellaneous import sqrt, Min, Max +from sympy.functions.elementary.piecewise import Piecewise +from sympy.functions.elementary._trigonometric_special 
import ( + cos_table, ipartfrac, fermat_coords) +from sympy.logic.boolalg import And +from sympy.ntheory import factorint +from sympy.polys.specialpolys import symmetric_poly +from sympy.utilities.iterables import numbered_symbols + + +############################################################################### +########################## UTILITIES ########################################## +############################################################################### + + +def _imaginary_unit_as_coefficient(arg): + """ Helper to extract symbolic coefficient for imaginary unit """ + if isinstance(arg, Float): + return None + else: + return arg.as_coefficient(S.ImaginaryUnit) + +############################################################################### +########################## TRIGONOMETRIC FUNCTIONS ############################ +############################################################################### + + +class TrigonometricFunction(DefinedFunction): + """Base class for trigonometric functions. """ + + unbranched = True + _singularities = (S.ComplexInfinity,) + + def _eval_is_rational(self): + s = self.func(*self.args) + if s.func == self.func: + if s.args[0].is_rational and fuzzy_not(s.args[0].is_zero): + return False + else: + return s.is_rational + + def _eval_is_algebraic(self): + s = self.func(*self.args) + if s.func == self.func: + if fuzzy_not(self.args[0].is_zero) and self.args[0].is_algebraic: + return False + pi_coeff = _pi_coeff(self.args[0]) + if pi_coeff is not None and pi_coeff.is_rational: + return True + else: + return s.is_algebraic + + def _eval_expand_complex(self, deep=True, **hints): + re_part, im_part = self.as_real_imag(deep=deep, **hints) + return re_part + im_part*S.ImaginaryUnit + + def _as_real_imag(self, deep=True, **hints): + if self.args[0].is_extended_real: + if deep: + hints['complex'] = False + return (self.args[0].expand(deep, **hints), S.Zero) + else: + return (self.args[0], S.Zero) + if deep: + re, im = self.args[0].expand(deep, **hints).as_real_imag() + else: + re, im = self.args[0].as_real_imag() + return (re, im) + + def _period(self, general_period, symbol=None): + f = expand_mul(self.args[0]) + if symbol is None: + symbol = tuple(f.free_symbols)[0] + + if not f.has(symbol): + return S.Zero + + if f == symbol: + return general_period + + if symbol in f.free_symbols: + if f.is_Mul: + g, h = f.as_independent(symbol) + if h == symbol: + return general_period/abs(g) + + if f.is_Add: + a, h = f.as_independent(symbol) + g, h = h.as_independent(symbol, as_Add=False) + if h == symbol: + return general_period/abs(g) + + raise NotImplementedError("Use the periodicity function instead.") + + +@cacheit +def _table2(): + # If nested sqrt's are worse than un-evaluation + # you can require q to be in (1, 2, 3, 4, 6, 12) + # q <= 12, q=15, q=20, q=24, q=30, q=40, q=60, q=120 return + # expressions with 2 or fewer sqrt nestings. + return { + 12: (3, 4), + 20: (4, 5), + 30: (5, 6), + 15: (6, 10), + 24: (6, 8), + 40: (8, 10), + 60: (20, 30), + 120: (40, 60) + } + + +def _peeloff_pi(arg): + r""" + Split ARG into two parts, a "rest" and a multiple of $\pi$. + This assumes ARG to be an Add. + The multiple of $\pi$ returned in the second position is always a Rational. 
+ + Examples + ======== + + >>> from sympy.functions.elementary.trigonometric import _peeloff_pi + >>> from sympy import pi + >>> from sympy.abc import x, y + >>> _peeloff_pi(x + pi/2) + (x, 1/2) + >>> _peeloff_pi(x + 2*pi/3 + pi*y) + (x + pi*y + pi/6, 1/2) + + """ + pi_coeff = S.Zero + rest_terms = [] + for a in Add.make_args(arg): + K = a.coeff(pi) + if K and K.is_rational: + pi_coeff += K + else: + rest_terms.append(a) + + if pi_coeff is S.Zero: + return arg, S.Zero + + m1 = (pi_coeff % S.Half) + m2 = pi_coeff - m1 + if m2.is_integer or ((2*m2).is_integer and m2.is_even is False): + return Add(*(rest_terms + [m1*pi])), m2 + return arg, S.Zero + + +def _pi_coeff(arg: Expr, cycles: int = 1) -> Expr | None: + r""" + When arg is a Number times $\pi$ (e.g. $3\pi/2$) then return the Number + normalized to be in the range $[0, 2]$, else `None`. + + When an even multiple of $\pi$ is encountered, if it is multiplying + something with known parity then the multiple is returned as 0 otherwise + as 2. + + Examples + ======== + + >>> from sympy.functions.elementary.trigonometric import _pi_coeff + >>> from sympy import pi, Dummy + >>> from sympy.abc import x + >>> _pi_coeff(3*x*pi) + 3*x + >>> _pi_coeff(11*pi/7) + 11/7 + >>> _pi_coeff(-11*pi/7) + 3/7 + >>> _pi_coeff(4*pi) + 0 + >>> _pi_coeff(5*pi) + 1 + >>> _pi_coeff(5.0*pi) + 1 + >>> _pi_coeff(5.5*pi) + 3/2 + >>> _pi_coeff(2 + pi) + + >>> _pi_coeff(2*Dummy(integer=True)*pi) + 2 + >>> _pi_coeff(2*Dummy(even=True)*pi) + 0 + + """ + if arg is pi: + return S.One + elif not arg: + return S.Zero + elif arg.is_Mul: + cx = arg.coeff(pi) + if cx: + c, x = cx.as_coeff_Mul() # pi is not included as coeff + if c.is_Float: + # recast exact binary fractions to Rationals + f = abs(c) % 1 + if f != 0: + p = -int(round(log(f, 2).evalf())) + m = 2**p + cm = c*m + i = int(cm) + if equal_valued(i, cm): + c = Rational(i, m) + cx = c*x + else: + c = Rational(int(c)) + cx = c*x + if x.is_integer: + c2 = c % 2 + if c2 == 1: + return x + elif not c2: + if x.is_even is not None: # known parity + return S.Zero + return Integer(2) + else: + return c2*x + return cx + elif arg.is_zero: + return S.Zero + return None + + +class sin(TrigonometricFunction): + r""" + The sine function. + + Returns the sine of x (measured in radians). + + Explanation + =========== + + This function will evaluate automatically in the + case $x/\pi$ is some rational number [4]_. For example, + if $x$ is a multiple of $\pi$, $\pi/2$, $\pi/3$, $\pi/4$, and $\pi/6$. + + Examples + ======== + + >>> from sympy import sin, pi + >>> from sympy.abc import x + >>> sin(x**2).diff(x) + 2*x*cos(x**2) + >>> sin(1).diff(x) + 0 + >>> sin(pi) + 0 + >>> sin(pi/2) + 1 + >>> sin(pi/6) + 1/2 + >>> sin(pi/12) + -sqrt(2)/4 + sqrt(6)/4 + + + See Also + ======== + + csc, cos, sec, tan, cot + asin, acsc, acos, asec, atan, acot, atan2 + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Trigonometric_functions + .. [2] https://dlmf.nist.gov/4.14 + .. [3] https://functions.wolfram.com/ElementaryFunctions/Sin + .. 
[4] https://mathworld.wolfram.com/TrigonometryAngles.html + + """ + + def period(self, symbol=None): + return self._period(2*pi, symbol) + + def fdiff(self, argindex=1): + if argindex == 1: + return cos(self.args[0]) + else: + raise ArgumentIndexError(self, argindex) + + @classmethod + def eval(cls, arg): + from sympy.calculus.accumulationbounds import AccumBounds + from sympy.sets.setexpr import SetExpr + if arg.is_Number: + if arg is S.NaN: + return S.NaN + elif arg.is_zero: + return S.Zero + elif arg in (S.Infinity, S.NegativeInfinity): + return AccumBounds(-1, 1) + + if arg is S.ComplexInfinity: + return S.NaN + + if isinstance(arg, AccumBounds): + from sympy.sets.sets import FiniteSet + min, max = arg.min, arg.max + d = floor(min/(2*pi)) + if min is not S.NegativeInfinity: + min = min - d*2*pi + if max is not S.Infinity: + max = max - d*2*pi + if AccumBounds(min, max).intersection(FiniteSet(pi/2, pi*Rational(5, 2))) \ + is not S.EmptySet and \ + AccumBounds(min, max).intersection(FiniteSet(pi*Rational(3, 2), + pi*Rational(7, 2))) is not S.EmptySet: + return AccumBounds(-1, 1) + elif AccumBounds(min, max).intersection(FiniteSet(pi/2, pi*Rational(5, 2))) \ + is not S.EmptySet: + return AccumBounds(Min(sin(min), sin(max)), 1) + elif AccumBounds(min, max).intersection(FiniteSet(pi*Rational(3, 2), pi*Rational(8, 2))) \ + is not S.EmptySet: + return AccumBounds(-1, Max(sin(min), sin(max))) + else: + return AccumBounds(Min(sin(min), sin(max)), + Max(sin(min), sin(max))) + elif isinstance(arg, SetExpr): + return arg._eval_func(cls) + + if arg.could_extract_minus_sign(): + return -cls(-arg) + + i_coeff = _imaginary_unit_as_coefficient(arg) + if i_coeff is not None: + from sympy.functions.elementary.hyperbolic import sinh + return S.ImaginaryUnit*sinh(i_coeff) + + pi_coeff = _pi_coeff(arg) + if pi_coeff is not None: + if pi_coeff.is_integer: + return S.Zero + + if (2*pi_coeff).is_integer: + # is_even-case handled above as then pi_coeff.is_integer, + # so check if known to be not even + if pi_coeff.is_even is False: + return S.NegativeOne**(pi_coeff - S.Half) + + if not pi_coeff.is_Rational: + narg = pi_coeff*pi + if narg != arg: + return cls(narg) + return None + + # https://github.com/sympy/sympy/issues/6048 + # transform a sine to a cosine, to avoid redundant code + if pi_coeff.is_Rational: + x = pi_coeff % 2 + if x > 1: + return -cls((x % 1)*pi) + if 2*x > 1: + return cls((1 - x)*pi) + narg = ((pi_coeff + Rational(3, 2)) % 2)*pi + result = cos(narg) + if not isinstance(result, cos): + return result + if pi_coeff*pi != arg: + return cls(pi_coeff*pi) + return None + + if arg.is_Add: + x, m = _peeloff_pi(arg) + if m: + m = m*pi + return sin(m)*cos(x) + cos(m)*sin(x) + + if arg.is_zero: + return S.Zero + + if isinstance(arg, asin): + return arg.args[0] + + if isinstance(arg, atan): + x = arg.args[0] + return x/sqrt(1 + x**2) + + if isinstance(arg, atan2): + y, x = arg.args + return y/sqrt(x**2 + y**2) + + if isinstance(arg, acos): + x = arg.args[0] + return sqrt(1 - x**2) + + if isinstance(arg, acot): + x = arg.args[0] + return 1/(sqrt(1 + 1/x**2)*x) + + if isinstance(arg, acsc): + x = arg.args[0] + return 1/x + + if isinstance(arg, asec): + x = arg.args[0] + return sqrt(1 - 1/x**2) + + @staticmethod + @cacheit + def taylor_term(n, x, *previous_terms): + if n < 0 or n % 2 == 0: + return S.Zero + else: + x = sympify(x) + + if len(previous_terms) > 2: + p = previous_terms[-2] + return -p*x**2/(n*(n - 1)) + else: + return S.NegativeOne**(n//2)*x**n/factorial(n) + + def _eval_nseries(self, x, n, 
logx, cdir=0): + arg = self.args[0] + if logx is not None: + arg = arg.subs(log(x), logx) + if arg.subs(x, 0).has(S.NaN, S.ComplexInfinity): + raise PoleError("Cannot expand %s around 0" % (self)) + return super()._eval_nseries(x, n=n, logx=logx, cdir=cdir) + + def _eval_rewrite_as_exp(self, arg, **kwargs): + from sympy.functions.elementary.hyperbolic import HyperbolicFunction + I = S.ImaginaryUnit + if isinstance(arg, (TrigonometricFunction, HyperbolicFunction)): + arg = arg.func(arg.args[0]).rewrite(exp) + return (exp(arg*I) - exp(-arg*I))/(2*I) + + def _eval_rewrite_as_Pow(self, arg, **kwargs): + if isinstance(arg, log): + I = S.ImaginaryUnit + x = arg.args[0] + return I*x**-I/2 - I*x**I /2 + + def _eval_rewrite_as_cos(self, arg, **kwargs): + return cos(arg - pi/2, evaluate=False) + + def _eval_rewrite_as_tan(self, arg, **kwargs): + tan_half = tan(S.Half*arg) + return 2*tan_half/(1 + tan_half**2) + + def _eval_rewrite_as_sincos(self, arg, **kwargs): + return sin(arg)*cos(arg)/cos(arg) + + def _eval_rewrite_as_cot(self, arg, **kwargs): + cot_half = cot(S.Half*arg) + return Piecewise((0, And(Eq(im(arg), 0), Eq(Mod(arg, pi), 0))), + (2*cot_half/(1 + cot_half**2), True)) + + def _eval_rewrite_as_pow(self, arg, **kwargs): + return self.rewrite(cos, **kwargs).rewrite(pow, **kwargs) + + def _eval_rewrite_as_sqrt(self, arg, **kwargs): + return self.rewrite(cos, **kwargs).rewrite(sqrt, **kwargs) + + def _eval_rewrite_as_csc(self, arg, **kwargs): + return 1/csc(arg) + + def _eval_rewrite_as_sec(self, arg, **kwargs): + return 1/sec(arg - pi/2, evaluate=False) + + def _eval_rewrite_as_sinc(self, arg, **kwargs): + return arg*sinc(arg) + + def _eval_rewrite_as_besselj(self, arg, **kwargs): + from sympy.functions.special.bessel import besselj + return sqrt(pi*arg/2)*besselj(S.Half, arg) + + def _eval_conjugate(self): + return self.func(self.args[0].conjugate()) + + def as_real_imag(self, deep=True, **hints): + from sympy.functions.elementary.hyperbolic import cosh, sinh + re, im = self._as_real_imag(deep=deep, **hints) + return (sin(re)*cosh(im), cos(re)*sinh(im)) + + def _eval_expand_trig(self, **hints): + from sympy.functions.special.polynomials import chebyshevt, chebyshevu + arg = self.args[0] + x = None + if arg.is_Add: # TODO, implement more if deep stuff here + # TODO: Do this more efficiently for more than two terms + x, y = arg.as_two_terms() + sx = sin(x, evaluate=False)._eval_expand_trig() + sy = sin(y, evaluate=False)._eval_expand_trig() + cx = cos(x, evaluate=False)._eval_expand_trig() + cy = cos(y, evaluate=False)._eval_expand_trig() + return sx*cy + sy*cx + elif arg.is_Mul: + n, x = arg.as_coeff_Mul(rational=True) + if n.is_Integer: # n will be positive because of .eval + # canonicalization + + # See https://mathworld.wolfram.com/Multiple-AngleFormulas.html + if n.is_odd: + return S.NegativeOne**((n - 1)/2)*chebyshevt(n, sin(x)) + else: + return expand_mul(S.NegativeOne**(n/2 - 1)*cos(x)* + chebyshevu(n - 1, sin(x)), deep=False) + return sin(arg) + + def _eval_as_leading_term(self, x, logx, cdir): + from sympy.calculus.accumulationbounds import AccumBounds + arg = self.args[0] + x0 = arg.subs(x, 0).cancel() + n = x0/pi + if n.is_integer: + lt = (arg - n*pi).as_leading_term(x) + return (S.NegativeOne**n)*lt + if x0 is S.ComplexInfinity: + x0 = arg.limit(x, 0, dir='-' if re(cdir).is_negative else '+') + if x0 in [S.Infinity, S.NegativeInfinity]: + return AccumBounds(-1, 1) + return self.func(x0) if x0.is_finite else self + + def _eval_is_extended_real(self): + if 
self.args[0].is_extended_real: + return True + + def _eval_is_finite(self): + arg = self.args[0] + if arg.is_extended_real: + return True + + def _eval_is_zero(self): + rest, pi_mult = _peeloff_pi(self.args[0]) + if rest.is_zero: + return pi_mult.is_integer + + def _eval_is_complex(self): + if self.args[0].is_extended_real \ + or self.args[0].is_complex: + return True + + +class cos(TrigonometricFunction): + """ + The cosine function. + + Returns the cosine of x (measured in radians). + + Explanation + =========== + + See :func:`sin` for notes about automatic evaluation. + + Examples + ======== + + >>> from sympy import cos, pi + >>> from sympy.abc import x + >>> cos(x**2).diff(x) + -2*x*sin(x**2) + >>> cos(1).diff(x) + 0 + >>> cos(pi) + -1 + >>> cos(pi/2) + 0 + >>> cos(2*pi/3) + -1/2 + >>> cos(pi/12) + sqrt(2)/4 + sqrt(6)/4 + + See Also + ======== + + sin, csc, sec, tan, cot + asin, acsc, acos, asec, atan, acot, atan2 + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Trigonometric_functions + .. [2] https://dlmf.nist.gov/4.14 + .. [3] https://functions.wolfram.com/ElementaryFunctions/Cos + + """ + + def period(self, symbol=None): + return self._period(2*pi, symbol) + + def fdiff(self, argindex=1): + if argindex == 1: + return -sin(self.args[0]) + else: + raise ArgumentIndexError(self, argindex) + + @classmethod + def eval(cls, arg): + from sympy.functions.special.polynomials import chebyshevt + from sympy.calculus.accumulationbounds import AccumBounds + from sympy.sets.setexpr import SetExpr + if arg.is_Number: + if arg is S.NaN: + return S.NaN + elif arg.is_zero: + return S.One + elif arg in (S.Infinity, S.NegativeInfinity): + # In this case it is better to return AccumBounds(-1, 1) + # rather than returning S.NaN, since AccumBounds(-1, 1) + # preserves the information that sin(oo) is between + # -1 and 1, where S.NaN does not do that. + return AccumBounds(-1, 1) + + if arg is S.ComplexInfinity: + return S.NaN + + if isinstance(arg, AccumBounds): + return sin(arg + pi/2) + elif isinstance(arg, SetExpr): + return arg._eval_func(cls) + + if arg.is_extended_real and arg.is_finite is False: + return AccumBounds(-1, 1) + + if arg.could_extract_minus_sign(): + return cls(-arg) + + i_coeff = _imaginary_unit_as_coefficient(arg) + if i_coeff is not None: + from sympy.functions.elementary.hyperbolic import cosh + return cosh(i_coeff) + + pi_coeff = _pi_coeff(arg) + if pi_coeff is not None: + if pi_coeff.is_integer: + return (S.NegativeOne)**pi_coeff + + if (2*pi_coeff).is_integer: + # is_even-case handled above as then pi_coeff.is_integer, + # so check if known to be not even + if pi_coeff.is_even is False: + return S.Zero + + if not pi_coeff.is_Rational: + narg = pi_coeff*pi + if narg != arg: + return cls(narg) + return None + + # cosine formula ##################### + # https://github.com/sympy/sympy/issues/6048 + # explicit calculations are performed for + # cos(k pi/n) for n = 8,10,12,15,20,24,30,40,60,120 + # Some other exact values like cos(k pi/240) can be + # calculated using a partial-fraction decomposition + # by calling cos( X ).rewrite(sqrt) + if pi_coeff.is_Rational: + q = pi_coeff.q + p = pi_coeff.p % (2*q) + if p > q: + narg = (pi_coeff - 1)*pi + return -cls(narg) + if 2*p > q: + narg = (1 - pi_coeff)*pi + return -cls(narg) + + # If nested sqrt's are worse than un-evaluation + # you can require q to be in (1, 2, 3, 4, 6, 12) + # q <= 12, q=15, q=20, q=24, q=30, q=40, q=60, q=120 return + # expressions with 2 or fewer sqrt nestings. 
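+ # Illustrative note (editorial, not from upstream SymPy): each pair (a, b) returned by _table2() is assumed to satisfy 1/a - 1/b == 1/q (e.g. a pair like (3, 4) for q == 12), so with a' = p*pi/a and b' = p*pi/b the product below is just the angle-difference identity cos(a')*cos(b') + sin(a')*sin(b') == cos(a' - b') == cos(p*pi/q).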
+ table2 = _table2() + if q in table2: + a, b = table2[q] + a, b = p*pi/a, p*pi/b + nvala, nvalb = cls(a), cls(b) + if None in (nvala, nvalb): + return None + return nvala*nvalb + cls(pi/2 - a)*cls(pi/2 - b) + + if q > 12: + return None + + cst_table_some = { + 3: S.Half, + 5: (sqrt(5) + 1) / 4, + } + if q in cst_table_some: + cts = cst_table_some[pi_coeff.q] + return chebyshevt(pi_coeff.p, cts).expand() + + if 0 == q % 2: + narg = (pi_coeff*2)*pi + nval = cls(narg) + if None == nval: + return None + x = (2*pi_coeff + 1)/2 + sign_cos = (-1)**((-1 if x < 0 else 1)*int(abs(x))) + return sign_cos*sqrt( (1 + nval)/2 ) + return None + + if arg.is_Add: + x, m = _peeloff_pi(arg) + if m: + m = m*pi + return cos(m)*cos(x) - sin(m)*sin(x) + + if arg.is_zero: + return S.One + + if isinstance(arg, acos): + return arg.args[0] + + if isinstance(arg, atan): + x = arg.args[0] + return 1/sqrt(1 + x**2) + + if isinstance(arg, atan2): + y, x = arg.args + return x/sqrt(x**2 + y**2) + + if isinstance(arg, asin): + x = arg.args[0] + return sqrt(1 - x ** 2) + + if isinstance(arg, acot): + x = arg.args[0] + return 1/sqrt(1 + 1/x**2) + + if isinstance(arg, acsc): + x = arg.args[0] + return sqrt(1 - 1/x**2) + + if isinstance(arg, asec): + x = arg.args[0] + return 1/x + + @staticmethod + @cacheit + def taylor_term(n, x, *previous_terms): + if n < 0 or n % 2 == 1: + return S.Zero + else: + x = sympify(x) + + if len(previous_terms) > 2: + p = previous_terms[-2] + return -p*x**2/(n*(n - 1)) + else: + return S.NegativeOne**(n//2)*x**n/factorial(n) + + def _eval_nseries(self, x, n, logx, cdir=0): + arg = self.args[0] + if logx is not None: + arg = arg.subs(log(x), logx) + if arg.subs(x, 0).has(S.NaN, S.ComplexInfinity): + raise PoleError("Cannot expand %s around 0" % (self)) + return super()._eval_nseries(x, n=n, logx=logx, cdir=cdir) + + def _eval_rewrite_as_exp(self, arg, **kwargs): + I = S.ImaginaryUnit + from sympy.functions.elementary.hyperbolic import HyperbolicFunction + if isinstance(arg, (TrigonometricFunction, HyperbolicFunction)): + arg = arg.func(arg.args[0]).rewrite(exp, **kwargs) + return (exp(arg*I) + exp(-arg*I))/2 + + def _eval_rewrite_as_Pow(self, arg, **kwargs): + if isinstance(arg, log): + I = S.ImaginaryUnit + x = arg.args[0] + return x**I/2 + x**-I/2 + + def _eval_rewrite_as_sin(self, arg, **kwargs): + return sin(arg + pi/2, evaluate=False) + + def _eval_rewrite_as_tan(self, arg, **kwargs): + tan_half = tan(S.Half*arg)**2 + return (1 - tan_half)/(1 + tan_half) + + def _eval_rewrite_as_sincos(self, arg, **kwargs): + return sin(arg)*cos(arg)/sin(arg) + + def _eval_rewrite_as_cot(self, arg, **kwargs): + cot_half = cot(S.Half*arg)**2 + return Piecewise((1, And(Eq(im(arg), 0), Eq(Mod(arg, 2*pi), 0))), + ((cot_half - 1)/(cot_half + 1), True)) + + def _eval_rewrite_as_pow(self, arg, **kwargs): + return self._eval_rewrite_as_sqrt(arg, **kwargs) + + def _eval_rewrite_as_sqrt(self, arg: Expr, **kwargs): + from sympy.functions.special.polynomials import chebyshevt + + pi_coeff = _pi_coeff(arg) + if pi_coeff is None: + return None + + if isinstance(pi_coeff, Integer): + return None + + if not isinstance(pi_coeff, Rational): + return None + + cst_table_some = cos_table() + + if pi_coeff.q in cst_table_some: + rv = chebyshevt(pi_coeff.p, cst_table_some[pi_coeff.q]()) + if pi_coeff.q < 257: + rv = rv.expand() + return rv + + if not pi_coeff.q % 2: # recursively remove factors of 2 + pico2 = pi_coeff * 2 + nval = cos(pico2 * pi).rewrite(sqrt, **kwargs) + x = (pico2 + 1) / 2 + sign_cos = -1 if int(x) % 2 else 1 + 
return sign_cos * sqrt((1 + nval) / 2) + + FC = fermat_coords(pi_coeff.q) + if FC: + denoms = FC + else: + denoms = [b**e for b, e in factorint(pi_coeff.q).items()] + + apart = ipartfrac(*denoms) + decomp = (pi_coeff.p * Rational(n, d) for n, d in zip(apart, denoms)) + X = [(x[1], x[0]*pi) for x in zip(decomp, numbered_symbols('z'))] + pcls = cos(sum(x[0] for x in X))._eval_expand_trig().subs(X) + + if not FC or len(FC) == 1: + return pcls + return pcls.rewrite(sqrt, **kwargs) + + def _eval_rewrite_as_sec(self, arg, **kwargs): + return 1/sec(arg) + + def _eval_rewrite_as_csc(self, arg, **kwargs): + return 1/sec(arg).rewrite(csc, **kwargs) + + def _eval_rewrite_as_besselj(self, arg, **kwargs): + from sympy.functions.special.bessel import besselj + return Piecewise( + (sqrt(pi*arg/2)*besselj(-S.Half, arg), Ne(arg, 0)), + (1, True) + ) + + def _eval_conjugate(self): + return self.func(self.args[0].conjugate()) + + def as_real_imag(self, deep=True, **hints): + from sympy.functions.elementary.hyperbolic import cosh, sinh + re, im = self._as_real_imag(deep=deep, **hints) + return (cos(re)*cosh(im), -sin(re)*sinh(im)) + + def _eval_expand_trig(self, **hints): + from sympy.functions.special.polynomials import chebyshevt + arg = self.args[0] + x = None + if arg.is_Add: # TODO: Do this more efficiently for more than two terms + x, y = arg.as_two_terms() + sx = sin(x, evaluate=False)._eval_expand_trig() + sy = sin(y, evaluate=False)._eval_expand_trig() + cx = cos(x, evaluate=False)._eval_expand_trig() + cy = cos(y, evaluate=False)._eval_expand_trig() + return cx*cy - sx*sy + elif arg.is_Mul: + coeff, terms = arg.as_coeff_Mul(rational=True) + if coeff.is_Integer: + return chebyshevt(coeff, cos(terms)) + return cos(arg) + + def _eval_as_leading_term(self, x, logx, cdir): + from sympy.calculus.accumulationbounds import AccumBounds + arg = self.args[0] + x0 = arg.subs(x, 0).cancel() + n = (x0 + pi/2)/pi + if n.is_integer: + lt = (arg - n*pi + pi/2).as_leading_term(x) + return (S.NegativeOne**n)*lt + if x0 is S.ComplexInfinity: + x0 = arg.limit(x, 0, dir='-' if re(cdir).is_negative else '+') + if x0 in [S.Infinity, S.NegativeInfinity]: + return AccumBounds(-1, 1) + return self.func(x0) if x0.is_finite else self + + def _eval_is_extended_real(self): + if self.args[0].is_extended_real: + return True + + def _eval_is_finite(self): + arg = self.args[0] + + if arg.is_extended_real: + return True + + def _eval_is_complex(self): + if self.args[0].is_extended_real \ + or self.args[0].is_complex: + return True + + def _eval_is_zero(self): + rest, pi_mult = _peeloff_pi(self.args[0]) + if rest.is_zero and pi_mult: + return (pi_mult - S.Half).is_integer + + +class tan(TrigonometricFunction): + """ + The tangent function. + + Returns the tangent of x (measured in radians). + + Explanation + =========== + + See :class:`sin` for notes about automatic evaluation. + + Examples + ======== + + >>> from sympy import tan, pi + >>> from sympy.abc import x + >>> tan(x**2).diff(x) + 2*x*(tan(x**2)**2 + 1) + >>> tan(1).diff(x) + 0 + >>> tan(pi/8).expand() + -1 + sqrt(2) + + See Also + ======== + + sin, csc, cos, sec, cot + asin, acsc, acos, asec, atan, acot, atan2 + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Trigonometric_functions + .. [2] https://dlmf.nist.gov/4.14 + .. 
[3] https://functions.wolfram.com/ElementaryFunctions/Tan + + """ + + def period(self, symbol=None): + return self._period(pi, symbol) + + def fdiff(self, argindex=1): + if argindex == 1: + return S.One + self**2 + else: + raise ArgumentIndexError(self, argindex) + + def inverse(self, argindex=1): + """ + Returns the inverse of this function. + """ + return atan + + @classmethod + def eval(cls, arg): + from sympy.calculus.accumulationbounds import AccumBounds + if arg.is_Number: + if arg is S.NaN: + return S.NaN + elif arg.is_zero: + return S.Zero + elif arg in (S.Infinity, S.NegativeInfinity): + return AccumBounds(S.NegativeInfinity, S.Infinity) + + if arg is S.ComplexInfinity: + return S.NaN + + if isinstance(arg, AccumBounds): + min, max = arg.min, arg.max + d = floor(min/pi) + if min is not S.NegativeInfinity: + min = min - d*pi + if max is not S.Infinity: + max = max - d*pi + from sympy.sets.sets import FiniteSet + if AccumBounds(min, max).intersection(FiniteSet(pi/2, pi*Rational(3, 2))): + return AccumBounds(S.NegativeInfinity, S.Infinity) + else: + return AccumBounds(tan(min), tan(max)) + + if arg.could_extract_minus_sign(): + return -cls(-arg) + + i_coeff = _imaginary_unit_as_coefficient(arg) + if i_coeff is not None: + from sympy.functions.elementary.hyperbolic import tanh + return S.ImaginaryUnit*tanh(i_coeff) + + pi_coeff = _pi_coeff(arg, 2) + if pi_coeff is not None: + if pi_coeff.is_integer: + return S.Zero + + if not pi_coeff.is_Rational: + narg = pi_coeff*pi + if narg != arg: + return cls(narg) + return None + + if pi_coeff.is_Rational: + q = pi_coeff.q + p = pi_coeff.p % q + # ensure simplified results are returned for n*pi/5, n*pi/10 + table10 = { + 1: sqrt(1 - 2*sqrt(5)/5), + 2: sqrt(5 - 2*sqrt(5)), + 3: sqrt(1 + 2*sqrt(5)/5), + 4: sqrt(5 + 2*sqrt(5)) + } + if q in (5, 10): + n = 10*p/q + if n > 5: + n = 10 - n + return -table10[n] + else: + return table10[n] + if not pi_coeff.q % 2: + narg = pi_coeff*pi*2 + cresult, sresult = cos(narg), cos(narg - pi/2) + if not isinstance(cresult, cos) \ + and not isinstance(sresult, cos): + if sresult == 0: + return S.ComplexInfinity + return 1/sresult - cresult/sresult + + table2 = _table2() + if q in table2: + a, b = table2[q] + nvala, nvalb = cls(p*pi/a), cls(p*pi/b) + if None in (nvala, nvalb): + return None + return (nvala - nvalb)/(1 + nvala*nvalb) + narg = ((pi_coeff + S.Half) % 1 - S.Half)*pi + # see cos() to specify which expressions should be + # expanded automatically in terms of radicals + cresult, sresult = cos(narg), cos(narg - pi/2) + if not isinstance(cresult, cos) \ + and not isinstance(sresult, cos): + if cresult == 0: + return S.ComplexInfinity + return (sresult/cresult) + if narg != arg: + return cls(narg) + + if arg.is_Add: + x, m = _peeloff_pi(arg) + if m: + tanm = tan(m*pi) + if tanm is S.ComplexInfinity: + return -cot(x) + else: # tanm == 0 + return tan(x) + + if arg.is_zero: + return S.Zero + + if isinstance(arg, atan): + return arg.args[0] + + if isinstance(arg, atan2): + y, x = arg.args + return y/x + + if isinstance(arg, asin): + x = arg.args[0] + return x/sqrt(1 - x**2) + + if isinstance(arg, acos): + x = arg.args[0] + return sqrt(1 - x**2)/x + + if isinstance(arg, acot): + x = arg.args[0] + return 1/x + + if isinstance(arg, acsc): + x = arg.args[0] + return 1/(sqrt(1 - 1/x**2)*x) + + if isinstance(arg, asec): + x = arg.args[0] + return sqrt(1 - 1/x**2)*x + + @staticmethod + @cacheit + def taylor_term(n, x, *previous_terms): + if n < 0 or n % 2 == 0: + return S.Zero + else: + x = sympify(x) + + a, b = ((n - 
1)//2), 2**(n + 1) + + B = bernoulli(n + 1) + F = factorial(n + 1) + + return S.NegativeOne**a*b*(b - 1)*B/F*x**n + + def _eval_nseries(self, x, n, logx, cdir=0): + i = self.args[0].limit(x, 0)*2/pi + if i and i.is_Integer: + return self.rewrite(cos)._eval_nseries(x, n=n, logx=logx) + return super()._eval_nseries(x, n=n, logx=logx) + + def _eval_rewrite_as_Pow(self, arg, **kwargs): + if isinstance(arg, log): + I = S.ImaginaryUnit + x = arg.args[0] + return I*(x**-I - x**I)/(x**-I + x**I) + + def _eval_conjugate(self): + return self.func(self.args[0].conjugate()) + + def as_real_imag(self, deep=True, **hints): + re, im = self._as_real_imag(deep=deep, **hints) + if im: + from sympy.functions.elementary.hyperbolic import cosh, sinh + denom = cos(2*re) + cosh(2*im) + return (sin(2*re)/denom, sinh(2*im)/denom) + else: + return (self.func(re), S.Zero) + + def _eval_expand_trig(self, **hints): + arg = self.args[0] + x = None + if arg.is_Add: + n = len(arg.args) + TX = [] + for x in arg.args: + tx = tan(x, evaluate=False)._eval_expand_trig() + TX.append(tx) + + Yg = numbered_symbols('Y') + Y = [ next(Yg) for i in range(n) ] + + p = [0, 0] + for i in range(n + 1): + p[1 - i % 2] += symmetric_poly(i, Y)*(-1)**((i % 4)//2) + return (p[0]/p[1]).subs(list(zip(Y, TX))) + + elif arg.is_Mul: + coeff, terms = arg.as_coeff_Mul(rational=True) + if coeff.is_Integer and coeff > 1: + I = S.ImaginaryUnit + z = Symbol('dummy', real=True) + P = ((1 + I*z)**coeff).expand() + return (im(P)/re(P)).subs([(z, tan(terms))]) + return tan(arg) + + def _eval_rewrite_as_exp(self, arg, **kwargs): + I = S.ImaginaryUnit + from sympy.functions.elementary.hyperbolic import HyperbolicFunction + if isinstance(arg, (TrigonometricFunction, HyperbolicFunction)): + arg = arg.func(arg.args[0]).rewrite(exp) + neg_exp, pos_exp = exp(-arg*I), exp(arg*I) + return I*(neg_exp - pos_exp)/(neg_exp + pos_exp) + + def _eval_rewrite_as_sin(self, x, **kwargs): + return 2*sin(x)**2/sin(2*x) + + def _eval_rewrite_as_cos(self, x, **kwargs): + return cos(x - pi/2, evaluate=False)/cos(x) + + def _eval_rewrite_as_sincos(self, arg, **kwargs): + return sin(arg)/cos(arg) + + def _eval_rewrite_as_cot(self, arg, **kwargs): + return 1/cot(arg) + + def _eval_rewrite_as_sec(self, arg, **kwargs): + sin_in_sec_form = sin(arg).rewrite(sec, **kwargs) + cos_in_sec_form = cos(arg).rewrite(sec, **kwargs) + return sin_in_sec_form/cos_in_sec_form + + def _eval_rewrite_as_csc(self, arg, **kwargs): + sin_in_csc_form = sin(arg).rewrite(csc, **kwargs) + cos_in_csc_form = cos(arg).rewrite(csc, **kwargs) + return sin_in_csc_form/cos_in_csc_form + + def _eval_rewrite_as_pow(self, arg, **kwargs): + y = self.rewrite(cos, **kwargs).rewrite(pow, **kwargs) + if y.has(cos): + return None + return y + + def _eval_rewrite_as_sqrt(self, arg, **kwargs): + y = self.rewrite(cos, **kwargs).rewrite(sqrt, **kwargs) + if y.has(cos): + return None + return y + + def _eval_rewrite_as_besselj(self, arg, **kwargs): + from sympy.functions.special.bessel import besselj + return besselj(S.Half, arg)/besselj(-S.Half, arg) + + def _eval_as_leading_term(self, x, logx, cdir): + from sympy.calculus.accumulationbounds import AccumBounds + from sympy.functions.elementary.complexes import re + arg = self.args[0] + x0 = arg.subs(x, 0).cancel() + n = 2*x0/pi + if n.is_integer: + lt = (arg - n*pi/2).as_leading_term(x) + return lt if n.is_even else -1/lt + if x0 is S.ComplexInfinity: + x0 = arg.limit(x, 0, dir='-' if re(cdir).is_negative else '+') + if x0 in (S.Infinity, S.NegativeInfinity): + return 
AccumBounds(S.NegativeInfinity, S.Infinity) + return self.func(x0) if x0.is_finite else self + + def _eval_is_extended_real(self): + # FIXME: currently tan(pi/2) return zoo + return self.args[0].is_extended_real + + def _eval_is_real(self): + arg = self.args[0] + if arg.is_real and (arg/pi - S.Half).is_integer is False: + return True + + def _eval_is_finite(self): + arg = self.args[0] + + if arg.is_real and (arg/pi - S.Half).is_integer is False: + return True + + if arg.is_imaginary: + return True + + def _eval_is_zero(self): + rest, pi_mult = _peeloff_pi(self.args[0]) + if rest.is_zero: + return pi_mult.is_integer + + def _eval_is_complex(self): + arg = self.args[0] + + if arg.is_real and (arg/pi - S.Half).is_integer is False: + return True + + +class cot(TrigonometricFunction): + """ + The cotangent function. + + Returns the cotangent of x (measured in radians). + + Explanation + =========== + + See :class:`sin` for notes about automatic evaluation. + + Examples + ======== + + >>> from sympy import cot, pi + >>> from sympy.abc import x + >>> cot(x**2).diff(x) + 2*x*(-cot(x**2)**2 - 1) + >>> cot(1).diff(x) + 0 + >>> cot(pi/12) + sqrt(3) + 2 + + See Also + ======== + + sin, csc, cos, sec, tan + asin, acsc, acos, asec, atan, acot, atan2 + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Trigonometric_functions + .. [2] https://dlmf.nist.gov/4.14 + .. [3] https://functions.wolfram.com/ElementaryFunctions/Cot + + """ + + def period(self, symbol=None): + return self._period(pi, symbol) + + def fdiff(self, argindex=1): + if argindex == 1: + return S.NegativeOne - self**2 + else: + raise ArgumentIndexError(self, argindex) + + def inverse(self, argindex=1): + """ + Returns the inverse of this function. + """ + return acot + + @classmethod + def eval(cls, arg): + from sympy.calculus.accumulationbounds import AccumBounds + if arg.is_Number: + if arg is S.NaN: + return S.NaN + if arg.is_zero: + return S.ComplexInfinity + elif arg in (S.Infinity, S.NegativeInfinity): + return AccumBounds(S.NegativeInfinity, S.Infinity) + + if arg is S.ComplexInfinity: + return S.NaN + + if isinstance(arg, AccumBounds): + return -tan(arg + pi/2) + + if arg.could_extract_minus_sign(): + return -cls(-arg) + + i_coeff = _imaginary_unit_as_coefficient(arg) + if i_coeff is not None: + from sympy.functions.elementary.hyperbolic import coth + return -S.ImaginaryUnit*coth(i_coeff) + + pi_coeff = _pi_coeff(arg, 2) + if pi_coeff is not None: + if pi_coeff.is_integer: + return S.ComplexInfinity + + if not pi_coeff.is_Rational: + narg = pi_coeff*pi + if narg != arg: + return cls(narg) + return None + + if pi_coeff.is_Rational: + if pi_coeff.q in (5, 10): + return tan(pi/2 - arg) + if pi_coeff.q > 2 and not pi_coeff.q % 2: + narg = pi_coeff*pi*2 + cresult, sresult = cos(narg), cos(narg - pi/2) + if not isinstance(cresult, cos) \ + and not isinstance(sresult, cos): + return 1/sresult + cresult/sresult + q = pi_coeff.q + p = pi_coeff.p % q + table2 = _table2() + if q in table2: + a, b = table2[q] + nvala, nvalb = cls(p*pi/a), cls(p*pi/b) + if None in (nvala, nvalb): + return None + return (1 + nvala*nvalb)/(nvalb - nvala) + narg = (((pi_coeff + S.Half) % 1) - S.Half)*pi + # see cos() to specify which expressions should be + # expanded automatically in terms of radicals + cresult, sresult = cos(narg), cos(narg - pi/2) + if not isinstance(cresult, cos) \ + and not isinstance(sresult, cos): + if sresult == 0: + return S.ComplexInfinity + return cresult/sresult + if narg != arg: + return cls(narg) + + if arg.is_Add: + 
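+ # Illustrative note (editorial): _peeloff_pi splits the argument into x plus an integer or half-odd-integer multiple m of pi; since cot has period pi, cot(x + m*pi) == cot(x) when m is an integer (cot(m*pi) is zoo) and cot(x + m*pi) == -tan(x) when m is a half-odd integer (cot(m*pi) == 0), which is the branching handled below.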
x, m = _peeloff_pi(arg) + if m: + cotm = cot(m*pi) + if cotm is S.ComplexInfinity: + return cot(x) + else: # cotm == 0 + return -tan(x) + + if arg.is_zero: + return S.ComplexInfinity + + if isinstance(arg, acot): + return arg.args[0] + + if isinstance(arg, atan): + x = arg.args[0] + return 1/x + + if isinstance(arg, atan2): + y, x = arg.args + return x/y + + if isinstance(arg, asin): + x = arg.args[0] + return sqrt(1 - x**2)/x + + if isinstance(arg, acos): + x = arg.args[0] + return x/sqrt(1 - x**2) + + if isinstance(arg, acsc): + x = arg.args[0] + return sqrt(1 - 1/x**2)*x + + if isinstance(arg, asec): + x = arg.args[0] + return 1/(sqrt(1 - 1/x**2)*x) + + @staticmethod + @cacheit + def taylor_term(n, x, *previous_terms): + if n == 0: + return 1/sympify(x) + elif n < 0 or n % 2 == 0: + return S.Zero + else: + x = sympify(x) + + B = bernoulli(n + 1) + F = factorial(n + 1) + + return S.NegativeOne**((n + 1)//2)*2**(n + 1)*B/F*x**n + + def _eval_nseries(self, x, n, logx, cdir=0): + i = self.args[0].limit(x, 0)/pi + if i and i.is_Integer: + return self.rewrite(cos)._eval_nseries(x, n=n, logx=logx) + return self.rewrite(tan)._eval_nseries(x, n=n, logx=logx) + + def _eval_conjugate(self): + return self.func(self.args[0].conjugate()) + + def as_real_imag(self, deep=True, **hints): + re, im = self._as_real_imag(deep=deep, **hints) + if im: + from sympy.functions.elementary.hyperbolic import cosh, sinh + denom = cos(2*re) - cosh(2*im) + return (-sin(2*re)/denom, sinh(2*im)/denom) + else: + return (self.func(re), S.Zero) + + def _eval_rewrite_as_exp(self, arg, **kwargs): + from sympy.functions.elementary.hyperbolic import HyperbolicFunction + I = S.ImaginaryUnit + if isinstance(arg, (TrigonometricFunction, HyperbolicFunction)): + arg = arg.func(arg.args[0]).rewrite(exp, **kwargs) + neg_exp, pos_exp = exp(-arg*I), exp(arg*I) + return I*(pos_exp + neg_exp)/(pos_exp - neg_exp) + + def _eval_rewrite_as_Pow(self, arg, **kwargs): + if isinstance(arg, log): + I = S.ImaginaryUnit + x = arg.args[0] + return -I*(x**-I + x**I)/(x**-I - x**I) + + def _eval_rewrite_as_sin(self, x, **kwargs): + return sin(2*x)/(2*(sin(x)**2)) + + def _eval_rewrite_as_cos(self, x, **kwargs): + return cos(x)/cos(x - pi/2, evaluate=False) + + def _eval_rewrite_as_sincos(self, arg, **kwargs): + return cos(arg)/sin(arg) + + def _eval_rewrite_as_tan(self, arg, **kwargs): + return 1/tan(arg) + + def _eval_rewrite_as_sec(self, arg, **kwargs): + cos_in_sec_form = cos(arg).rewrite(sec, **kwargs) + sin_in_sec_form = sin(arg).rewrite(sec, **kwargs) + return cos_in_sec_form/sin_in_sec_form + + def _eval_rewrite_as_csc(self, arg, **kwargs): + cos_in_csc_form = cos(arg).rewrite(csc, **kwargs) + sin_in_csc_form = sin(arg).rewrite(csc, **kwargs) + return cos_in_csc_form/sin_in_csc_form + + def _eval_rewrite_as_pow(self, arg, **kwargs): + y = self.rewrite(cos, **kwargs).rewrite(pow, **kwargs) + if y.has(cos): + return None + return y + + def _eval_rewrite_as_sqrt(self, arg, **kwargs): + y = self.rewrite(cos, **kwargs).rewrite(sqrt, **kwargs) + if y.has(cos): + return None + return y + + def _eval_rewrite_as_besselj(self, arg, **kwargs): + from sympy.functions.special.bessel import besselj + return besselj(-S.Half, arg)/besselj(S.Half, arg) + + def _eval_as_leading_term(self, x, logx, cdir): + from sympy.calculus.accumulationbounds import AccumBounds + from sympy.functions.elementary.complexes import re + arg = self.args[0] + x0 = arg.subs(x, 0).cancel() + n = 2*x0/pi + if n.is_integer: + lt = (arg - n*pi/2).as_leading_term(x) + return 1/lt if 
n.is_even else -lt + if x0 is S.ComplexInfinity: + x0 = arg.limit(x, 0, dir='-' if re(cdir).is_negative else '+') + if x0 in (S.Infinity, S.NegativeInfinity): + return AccumBounds(S.NegativeInfinity, S.Infinity) + return self.func(x0) if x0.is_finite else self + + def _eval_is_extended_real(self): + return self.args[0].is_extended_real + + def _eval_expand_trig(self, **hints): + arg = self.args[0] + x = None + if arg.is_Add: + n = len(arg.args) + CX = [] + for x in arg.args: + cx = cot(x, evaluate=False)._eval_expand_trig() + CX.append(cx) + + Yg = numbered_symbols('Y') + Y = [ next(Yg) for i in range(n) ] + + p = [0, 0] + for i in range(n, -1, -1): + p[(n - i) % 2] += symmetric_poly(i, Y)*(-1)**(((n - i) % 4)//2) + return (p[0]/p[1]).subs(list(zip(Y, CX))) + elif arg.is_Mul: + coeff, terms = arg.as_coeff_Mul(rational=True) + if coeff.is_Integer and coeff > 1: + I = S.ImaginaryUnit + z = Symbol('dummy', real=True) + P = ((z + I)**coeff).expand() + return (re(P)/im(P)).subs([(z, cot(terms))]) + return cot(arg) # XXX sec and csc return 1/cos and 1/sin + + def _eval_is_finite(self): + arg = self.args[0] + if arg.is_real and (arg/pi).is_integer is False: + return True + if arg.is_imaginary: + return True + + def _eval_is_real(self): + arg = self.args[0] + if arg.is_real and (arg/pi).is_integer is False: + return True + + def _eval_is_complex(self): + arg = self.args[0] + if arg.is_real and (arg/pi).is_integer is False: + return True + + def _eval_is_zero(self): + rest, pimult = _peeloff_pi(self.args[0]) + if pimult and rest.is_zero: + return (pimult - S.Half).is_integer + + def _eval_subs(self, old, new): + arg = self.args[0] + argnew = arg.subs(old, new) + if arg != argnew and (argnew/pi).is_integer: + return S.ComplexInfinity + return cot(argnew) + + +class ReciprocalTrigonometricFunction(TrigonometricFunction): + """Base class for reciprocal functions of trigonometric functions. """ + + _reciprocal_of = None # mandatory, to be defined in subclass + _singularities = (S.ComplexInfinity,) + + # _is_even and _is_odd are used for correct evaluation of csc(-x), sec(-x) + # TODO refactor into TrigonometricFunction common parts of + # trigonometric functions eval() like even/odd, func(x+2*k*pi), etc. 
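+ # Illustrative note (editorial): concretely, sec sets _is_even = True because sec(-x) == sec(x), while csc sets _is_odd = True because csc(-x) == -csc(x); eval() below uses these flags to pull a minus sign out of the argument.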
+ + # optional, to be defined in subclasses: + _is_even: FuzzyBool = None + _is_odd: FuzzyBool = None + + @classmethod + def eval(cls, arg): + if arg.could_extract_minus_sign(): + if cls._is_even: + return cls(-arg) + if cls._is_odd: + return -cls(-arg) + + pi_coeff = _pi_coeff(arg) + if (pi_coeff is not None + and not (2*pi_coeff).is_integer + and pi_coeff.is_Rational): + q = pi_coeff.q + p = pi_coeff.p % (2*q) + if p > q: + narg = (pi_coeff - 1)*pi + return -cls(narg) + if 2*p > q: + narg = (1 - pi_coeff)*pi + if cls._is_odd: + return cls(narg) + elif cls._is_even: + return -cls(narg) + + if hasattr(arg, 'inverse') and arg.inverse() == cls: + return arg.args[0] + + t = cls._reciprocal_of.eval(arg) + if t is None: + return t + elif any(isinstance(i, cos) for i in (t, -t)): + return (1/t).rewrite(sec) + elif any(isinstance(i, sin) for i in (t, -t)): + return (1/t).rewrite(csc) + else: + return 1/t + + def _call_reciprocal(self, method_name, *args, **kwargs): + # Calls method_name on _reciprocal_of + o = self._reciprocal_of(self.args[0]) + return getattr(o, method_name)(*args, **kwargs) + + def _calculate_reciprocal(self, method_name, *args, **kwargs): + # If calling method_name on _reciprocal_of returns a value != None + # then return the reciprocal of that value + t = self._call_reciprocal(method_name, *args, **kwargs) + return 1/t if t is not None else t + + def _rewrite_reciprocal(self, method_name, arg): + # Special handling for rewrite functions. If reciprocal rewrite returns + # unmodified expression, then return None + t = self._call_reciprocal(method_name, arg) + if t is not None and t != self._reciprocal_of(arg): + return 1/t + + def _period(self, symbol): + f = expand_mul(self.args[0]) + return self._reciprocal_of(f).period(symbol) + + def fdiff(self, argindex=1): + return -self._calculate_reciprocal("fdiff", argindex)/self**2 + + def _eval_rewrite_as_exp(self, arg, **kwargs): + return self._rewrite_reciprocal("_eval_rewrite_as_exp", arg) + + def _eval_rewrite_as_Pow(self, arg, **kwargs): + return self._rewrite_reciprocal("_eval_rewrite_as_Pow", arg) + + def _eval_rewrite_as_sin(self, arg, **kwargs): + return self._rewrite_reciprocal("_eval_rewrite_as_sin", arg) + + def _eval_rewrite_as_cos(self, arg, **kwargs): + return self._rewrite_reciprocal("_eval_rewrite_as_cos", arg) + + def _eval_rewrite_as_tan(self, arg, **kwargs): + return self._rewrite_reciprocal("_eval_rewrite_as_tan", arg) + + def _eval_rewrite_as_pow(self, arg, **kwargs): + return self._rewrite_reciprocal("_eval_rewrite_as_pow", arg) + + def _eval_rewrite_as_sqrt(self, arg, **kwargs): + return self._rewrite_reciprocal("_eval_rewrite_as_sqrt", arg) + + def _eval_conjugate(self): + return self.func(self.args[0].conjugate()) + + def as_real_imag(self, deep=True, **hints): + return (1/self._reciprocal_of(self.args[0])).as_real_imag(deep, + **hints) + + def _eval_expand_trig(self, **hints): + return self._calculate_reciprocal("_eval_expand_trig", **hints) + + def _eval_is_extended_real(self): + return self._reciprocal_of(self.args[0])._eval_is_extended_real() + + def _eval_as_leading_term(self, x, logx, cdir): + return (1/self._reciprocal_of(self.args[0]))._eval_as_leading_term(x, logx=logx, cdir=cdir) + + def _eval_is_finite(self): + return (1/self._reciprocal_of(self.args[0])).is_finite + + def _eval_nseries(self, x, n, logx, cdir=0): + return (1/self._reciprocal_of(self.args[0]))._eval_nseries(x, n, logx) + + +class sec(ReciprocalTrigonometricFunction): + """ + The secant function. 
+ + Returns the secant of x (measured in radians). + + Explanation + =========== + + See :class:`sin` for notes about automatic evaluation. + + Examples + ======== + + >>> from sympy import sec + >>> from sympy.abc import x + >>> sec(x**2).diff(x) + 2*x*tan(x**2)*sec(x**2) + >>> sec(1).diff(x) + 0 + + See Also + ======== + + sin, csc, cos, tan, cot + asin, acsc, acos, asec, atan, acot, atan2 + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Trigonometric_functions + .. [2] https://dlmf.nist.gov/4.14 + .. [3] https://functions.wolfram.com/ElementaryFunctions/Sec + + """ + + _reciprocal_of = cos + _is_even = True + + def period(self, symbol=None): + return self._period(symbol) + + def _eval_rewrite_as_cot(self, arg, **kwargs): + cot_half_sq = cot(arg/2)**2 + return (cot_half_sq + 1)/(cot_half_sq - 1) + + def _eval_rewrite_as_cos(self, arg, **kwargs): + return (1/cos(arg)) + + def _eval_rewrite_as_sincos(self, arg, **kwargs): + return sin(arg)/(cos(arg)*sin(arg)) + + def _eval_rewrite_as_sin(self, arg, **kwargs): + return (1/cos(arg).rewrite(sin, **kwargs)) + + def _eval_rewrite_as_tan(self, arg, **kwargs): + return (1/cos(arg).rewrite(tan, **kwargs)) + + def _eval_rewrite_as_csc(self, arg, **kwargs): + return csc(pi/2 - arg, evaluate=False) + + def fdiff(self, argindex=1): + if argindex == 1: + return tan(self.args[0])*sec(self.args[0]) + else: + raise ArgumentIndexError(self, argindex) + + def _eval_rewrite_as_besselj(self, arg, **kwargs): + from sympy.functions.special.bessel import besselj + return Piecewise( + (1/(sqrt(pi*arg)/(sqrt(2))*besselj(-S.Half, arg)), Ne(arg, 0)), + (1, True) + ) + + def _eval_is_complex(self): + arg = self.args[0] + + if arg.is_complex and (arg/pi - S.Half).is_integer is False: + return True + + @staticmethod + @cacheit + def taylor_term(n, x, *previous_terms): + # Reference Formula: + # https://functions.wolfram.com/ElementaryFunctions/Sec/06/01/02/01/ + if n < 0 or n % 2 == 1: + return S.Zero + else: + x = sympify(x) + k = n//2 + return S.NegativeOne**k*euler(2*k)/factorial(2*k)*x**(2*k) + + def _eval_as_leading_term(self, x, logx, cdir): + from sympy.calculus.accumulationbounds import AccumBounds + from sympy.functions.elementary.complexes import re + arg = self.args[0] + x0 = arg.subs(x, 0).cancel() + n = (x0 + pi/2)/pi + if n.is_integer: + lt = (arg - n*pi + pi/2).as_leading_term(x) + return (S.NegativeOne**n)/lt + if x0 is S.ComplexInfinity: + x0 = arg.limit(x, 0, dir='-' if re(cdir).is_negative else '+') + if x0 in (S.Infinity, S.NegativeInfinity): + return AccumBounds(S.NegativeInfinity, S.Infinity) + return self.func(x0) if x0.is_finite else self + + +class csc(ReciprocalTrigonometricFunction): + """ + The cosecant function. + + Returns the cosecant of x (measured in radians). + + Explanation + =========== + + See :func:`sin` for notes about automatic evaluation. + + Examples + ======== + + >>> from sympy import csc + >>> from sympy.abc import x + >>> csc(x**2).diff(x) + -2*x*cot(x**2)*csc(x**2) + >>> csc(1).diff(x) + 0 + + See Also + ======== + + sin, cos, sec, tan, cot + asin, acsc, acos, asec, atan, acot, atan2 + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Trigonometric_functions + .. [2] https://dlmf.nist.gov/4.14 + .. 
[3] https://functions.wolfram.com/ElementaryFunctions/Csc + + """ + + _reciprocal_of = sin + _is_odd = True + + def period(self, symbol=None): + return self._period(symbol) + + def _eval_rewrite_as_sin(self, arg, **kwargs): + return (1/sin(arg)) + + def _eval_rewrite_as_sincos(self, arg, **kwargs): + return cos(arg)/(sin(arg)*cos(arg)) + + def _eval_rewrite_as_cot(self, arg, **kwargs): + cot_half = cot(arg/2) + return (1 + cot_half**2)/(2*cot_half) + + def _eval_rewrite_as_cos(self, arg, **kwargs): + return 1/sin(arg).rewrite(cos, **kwargs) + + def _eval_rewrite_as_sec(self, arg, **kwargs): + return sec(pi/2 - arg, evaluate=False) + + def _eval_rewrite_as_tan(self, arg, **kwargs): + return (1/sin(arg).rewrite(tan, **kwargs)) + + def _eval_rewrite_as_besselj(self, arg, **kwargs): + from sympy.functions.special.bessel import besselj + return sqrt(2/pi)*(1/(sqrt(arg)*besselj(S.Half, arg))) + + def fdiff(self, argindex=1): + if argindex == 1: + return -cot(self.args[0])*csc(self.args[0]) + else: + raise ArgumentIndexError(self, argindex) + + def _eval_is_complex(self): + arg = self.args[0] + if arg.is_real and (arg/pi).is_integer is False: + return True + + @staticmethod + @cacheit + def taylor_term(n, x, *previous_terms): + if n == 0: + return 1/sympify(x) + elif n < 0 or n % 2 == 0: + return S.Zero + else: + x = sympify(x) + k = n//2 + 1 + return (S.NegativeOne**(k - 1)*2*(2**(2*k - 1) - 1)* + bernoulli(2*k)*x**(2*k - 1)/factorial(2*k)) + + def _eval_as_leading_term(self, x, logx, cdir): + from sympy.calculus.accumulationbounds import AccumBounds + from sympy.functions.elementary.complexes import re + arg = self.args[0] + x0 = arg.subs(x, 0).cancel() + n = x0/pi + if n.is_integer: + lt = (arg - n*pi).as_leading_term(x) + return (S.NegativeOne**n)/lt + if x0 is S.ComplexInfinity: + x0 = arg.limit(x, 0, dir='-' if re(cdir).is_negative else '+') + if x0 in (S.Infinity, S.NegativeInfinity): + return AccumBounds(S.NegativeInfinity, S.Infinity) + return self.func(x0) if x0.is_finite else self + + +class sinc(DefinedFunction): + r""" + Represents an unnormalized sinc function: + + .. math:: + + \operatorname{sinc}(x) = + \begin{cases} + \frac{\sin x}{x} & \qquad x \neq 0 \\ + 1 & \qquad x = 0 + \end{cases} + + Examples + ======== + + >>> from sympy import sinc, oo, jn + >>> from sympy.abc import x + >>> sinc(x) + sinc(x) + + * Automated Evaluation + + >>> sinc(0) + 1 + >>> sinc(oo) + 0 + + * Differentiation + + >>> sinc(x).diff() + cos(x)/x - sin(x)/x**2 + + * Series Expansion + + >>> sinc(x).series() + 1 - x**2/6 + x**4/120 + O(x**6) + + * As zero'th order spherical Bessel Function + + >>> sinc(x).rewrite(jn) + jn(0, x) + + See also + ======== + + sin + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Sinc_function + + """ + _singularities = (S.ComplexInfinity,) + + def fdiff(self, argindex=1): + x = self.args[0] + if argindex == 1: + # We would like to return the Piecewise here, but Piecewise.diff + # currently can't handle removable singularities, meaning things + # like sinc(x).diff(x, 2) give the wrong answer at x = 0. See + # https://github.com/sympy/sympy/issues/11402. 
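+ # Illustrative note (editorial): away from x = 0 the expression returned below is d/dx (sin(x)/x) = cos(x)/x - sin(x)/x**2; the only difference from the Piecewise form is the removable singularity at 0, where the true derivative of sinc is 0 but the quotient form is undefined.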
+ # + # return Piecewise(((x*cos(x) - sin(x))/x**2, Ne(x, S.Zero)), (S.Zero, S.true)) + return cos(x)/x - sin(x)/x**2 + else: + raise ArgumentIndexError(self, argindex) + + @classmethod + def eval(cls, arg): + if arg.is_zero: + return S.One + if arg.is_Number: + if arg in [S.Infinity, S.NegativeInfinity]: + return S.Zero + elif arg is S.NaN: + return S.NaN + + if arg is S.ComplexInfinity: + return S.NaN + + if arg.could_extract_minus_sign(): + return cls(-arg) + + pi_coeff = _pi_coeff(arg) + if pi_coeff is not None: + if pi_coeff.is_integer: + if fuzzy_not(arg.is_zero): + return S.Zero + elif (2*pi_coeff).is_integer: + return S.NegativeOne**(pi_coeff - S.Half)/arg + + def _eval_nseries(self, x, n, logx, cdir=0): + x = self.args[0] + return (sin(x)/x)._eval_nseries(x, n, logx) + + def _eval_rewrite_as_jn(self, arg, **kwargs): + from sympy.functions.special.bessel import jn + return jn(0, arg) + + def _eval_rewrite_as_sin(self, arg, **kwargs): + return Piecewise((sin(arg)/arg, Ne(arg, S.Zero)), (S.One, S.true)) + + def _eval_is_zero(self): + if self.args[0].is_infinite: + return True + rest, pi_mult = _peeloff_pi(self.args[0]) + if rest.is_zero: + return fuzzy_and([pi_mult.is_integer, pi_mult.is_nonzero]) + if rest.is_Number and pi_mult.is_integer: + return False + + def _eval_is_real(self): + if self.args[0].is_extended_real or self.args[0].is_imaginary: + return True + + _eval_is_finite = _eval_is_real + + +############################################################################### +########################### TRIGONOMETRIC INVERSES ############################ +############################################################################### + + +class InverseTrigonometricFunction(DefinedFunction): + """Base class for inverse trigonometric functions.""" + _singularities: tuple[Expr, ...] = (S.One, S.NegativeOne, S.Zero, S.ComplexInfinity) + + @staticmethod + @cacheit + def _asin_table(): + # Only keys with could_extract_minus_sign() == False + # are actually needed. + return { + sqrt(3)/2: pi/3, + sqrt(2)/2: pi/4, + 1/sqrt(2): pi/4, + sqrt((5 - sqrt(5))/8): pi/5, + sqrt(2)*sqrt(5 - sqrt(5))/4: pi/5, + sqrt((5 + sqrt(5))/8): pi*Rational(2, 5), + sqrt(2)*sqrt(5 + sqrt(5))/4: pi*Rational(2, 5), + S.Half: pi/6, + sqrt(2 - sqrt(2))/2: pi/8, + sqrt(S.Half - sqrt(2)/4): pi/8, + sqrt(2 + sqrt(2))/2: pi*Rational(3, 8), + sqrt(S.Half + sqrt(2)/4): pi*Rational(3, 8), + (sqrt(5) - 1)/4: pi/10, + (1 - sqrt(5))/4: -pi/10, + (sqrt(5) + 1)/4: pi*Rational(3, 10), + sqrt(6)/4 - sqrt(2)/4: pi/12, + -sqrt(6)/4 + sqrt(2)/4: -pi/12, + (sqrt(3) - 1)/sqrt(8): pi/12, + (1 - sqrt(3))/sqrt(8): -pi/12, + sqrt(6)/4 + sqrt(2)/4: pi*Rational(5, 12), + (1 + sqrt(3))/sqrt(8): pi*Rational(5, 12) + } + + + @staticmethod + @cacheit + def _atan_table(): + # Only keys with could_extract_minus_sign() == False + # are actually needed. + return { + sqrt(3)/3: pi/6, + 1/sqrt(3): pi/6, + sqrt(3): pi/3, + sqrt(2) - 1: pi/8, + 1 - sqrt(2): -pi/8, + 1 + sqrt(2): pi*Rational(3, 8), + sqrt(5 - 2*sqrt(5)): pi/5, + sqrt(5 + 2*sqrt(5)): pi*Rational(2, 5), + sqrt(1 - 2*sqrt(5)/5): pi/10, + sqrt(1 + 2*sqrt(5)/5): pi*Rational(3, 10), + 2 - sqrt(3): pi/12, + -2 + sqrt(3): -pi/12, + 2 + sqrt(3): pi*Rational(5, 12) + } + + @staticmethod + @cacheit + def _acsc_table(): + # Keys for which could_extract_minus_sign() + # will obviously return True are omitted. 
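+ # Illustrative note (editorial): the keys are exact cosecant values, e.g. csc(pi/6) == 2, csc(pi/4) == sqrt(2) and csc(pi/3) == 2*sqrt(3)/3, so acsc.eval can recover pi/6, pi/4 and pi/3 by a plain dictionary lookup.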
+ return { + 2*sqrt(3)/3: pi/3, + sqrt(2): pi/4, + sqrt(2 + 2*sqrt(5)/5): pi/5, + 1/sqrt(Rational(5, 8) - sqrt(5)/8): pi/5, + sqrt(2 - 2*sqrt(5)/5): pi*Rational(2, 5), + 1/sqrt(Rational(5, 8) + sqrt(5)/8): pi*Rational(2, 5), + 2: pi/6, + sqrt(4 + 2*sqrt(2)): pi/8, + 2/sqrt(2 - sqrt(2)): pi/8, + sqrt(4 - 2*sqrt(2)): pi*Rational(3, 8), + 2/sqrt(2 + sqrt(2)): pi*Rational(3, 8), + 1 + sqrt(5): pi/10, + sqrt(5) - 1: pi*Rational(3, 10), + -(sqrt(5) - 1): pi*Rational(-3, 10), + sqrt(6) + sqrt(2): pi/12, + sqrt(6) - sqrt(2): pi*Rational(5, 12), + -(sqrt(6) - sqrt(2)): pi*Rational(-5, 12) + } + + +class asin(InverseTrigonometricFunction): + r""" + The inverse sine function. + + Returns the arcsine of x in radians. + + Explanation + =========== + + ``asin(x)`` will evaluate automatically in the cases + $x \in \{\infty, -\infty, 0, 1, -1\}$ and for some instances when the + result is a rational multiple of $\pi$ (see the ``eval`` class method). + + A purely imaginary argument will lead to an asinh expression. + + Examples + ======== + + >>> from sympy import asin, oo + >>> asin(1) + pi/2 + >>> asin(-1) + -pi/2 + >>> asin(-oo) + oo*I + >>> asin(oo) + -oo*I + + See Also + ======== + + sin, csc, cos, sec, tan, cot + acsc, acos, asec, atan, acot, atan2 + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Inverse_trigonometric_functions + .. [2] https://dlmf.nist.gov/4.23 + .. [3] https://functions.wolfram.com/ElementaryFunctions/ArcSin + + """ + + def fdiff(self, argindex=1): + if argindex == 1: + return 1/sqrt(1 - self.args[0]**2) + else: + raise ArgumentIndexError(self, argindex) + + def _eval_is_rational(self): + s = self.func(*self.args) + if s.func == self.func: + if s.args[0].is_rational: + return False + else: + return s.is_rational + + def _eval_is_positive(self): + return self._eval_is_extended_real() and self.args[0].is_positive + + def _eval_is_negative(self): + return self._eval_is_extended_real() and self.args[0].is_negative + + @classmethod + def eval(cls, arg): + if arg.is_Number: + if arg is S.NaN: + return S.NaN + elif arg is S.Infinity: + return S.NegativeInfinity*S.ImaginaryUnit + elif arg is S.NegativeInfinity: + return S.Infinity*S.ImaginaryUnit + elif arg.is_zero: + return S.Zero + elif arg is S.One: + return pi/2 + elif arg is S.NegativeOne: + return -pi/2 + + if arg is S.ComplexInfinity: + return S.ComplexInfinity + + if arg.could_extract_minus_sign(): + return -cls(-arg) + + if arg.is_number: + asin_table = cls._asin_table() + if arg in asin_table: + return asin_table[arg] + + i_coeff = _imaginary_unit_as_coefficient(arg) + if i_coeff is not None: + from sympy.functions.elementary.hyperbolic import asinh + return S.ImaginaryUnit*asinh(i_coeff) + + if arg.is_zero: + return S.Zero + + if isinstance(arg, sin): + ang = arg.args[0] + if ang.is_comparable: + ang %= 2*pi # restrict to [0,2*pi) + if ang > pi: # restrict to (-pi,pi] + ang = pi - ang + + # restrict to [-pi/2,pi/2] + if ang > pi/2: + ang = pi - ang + if ang < -pi/2: + ang = -pi - ang + + return ang + + if isinstance(arg, cos): # acos(x) + asin(x) = pi/2 + ang = arg.args[0] + if ang.is_comparable: + return pi/2 - acos(arg) + + @staticmethod + @cacheit + def taylor_term(n, x, *previous_terms): + if n < 0 or n % 2 == 0: + return S.Zero + else: + x = sympify(x) + if len(previous_terms) >= 2 and n > 2: + p = previous_terms[-2] + return p*(n - 2)**2/(n*(n - 1))*x**2 + else: + k = (n - 1) // 2 + R = RisingFactorial(S.Half, k) + F = factorial(k) + return R/F*x**n/n + + def _eval_as_leading_term(self, x, logx, cdir): 
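+ # Illustrative note (editorial): near x0 == 0, asin(z) behaves like z; at the branch points 1, -1 and zoo the log rewrite is used; on the branch cuts (-oo, -1) and (1, oo) the value is replaced by -pi - asin(x0) or pi - asin(x0) depending on the sign of the imaginary part of the approach direction.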
+ arg = self.args[0] + x0 = arg.subs(x, 0).cancel() + if x0 is S.NaN: + return self.func(arg.as_leading_term(x)) + if x0.is_zero: + return arg.as_leading_term(x) + + # Handling branch points + if x0 in (-S.One, S.One, S.ComplexInfinity): + return self.rewrite(log)._eval_as_leading_term(x, logx=logx, cdir=cdir).expand() + # Handling points lying on branch cuts (-oo, -1) U (1, oo) + if (1 - x0**2).is_negative: + ndir = arg.dir(x, cdir if cdir else 1) + if im(ndir).is_negative: + if x0.is_negative: + return -pi - self.func(x0) + elif im(ndir).is_positive: + if x0.is_positive: + return pi - self.func(x0) + else: + return self.rewrite(log)._eval_as_leading_term(x, logx=logx, cdir=cdir).expand() + return self.func(x0) + + def _eval_nseries(self, x, n, logx, cdir=0): # asin + from sympy.series.order import O + arg0 = self.args[0].subs(x, 0) + # Handling branch points + if arg0 is S.One: + t = Dummy('t', positive=True) + ser = asin(S.One - t**2).rewrite(log).nseries(t, 0, 2*n) + arg1 = S.One - self.args[0] + f = arg1.as_leading_term(x) + g = (arg1 - f)/ f + if not g.is_meromorphic(x, 0): # cannot be expanded + return O(1) if n == 0 else pi/2 + O(sqrt(x)) + res1 = sqrt(S.One + g)._eval_nseries(x, n=n, logx=logx) + res = (res1.removeO()*sqrt(f)).expand() + return ser.removeO().subs(t, res).expand().powsimp() + O(x**n, x) + + if arg0 is S.NegativeOne: + t = Dummy('t', positive=True) + ser = asin(S.NegativeOne + t**2).rewrite(log).nseries(t, 0, 2*n) + arg1 = S.One + self.args[0] + f = arg1.as_leading_term(x) + g = (arg1 - f)/ f + if not g.is_meromorphic(x, 0): # cannot be expanded + return O(1) if n == 0 else -pi/2 + O(sqrt(x)) + res1 = sqrt(S.One + g)._eval_nseries(x, n=n, logx=logx) + res = (res1.removeO()*sqrt(f)).expand() + return ser.removeO().subs(t, res).expand().powsimp() + O(x**n, x) + + res = super()._eval_nseries(x, n=n, logx=logx) + if arg0 is S.ComplexInfinity: + return res + # Handling points lying on branch cuts (-oo, -1) U (1, oo) + if (1 - arg0**2).is_negative: + ndir = self.args[0].dir(x, cdir if cdir else 1) + if im(ndir).is_negative: + if arg0.is_negative: + return -pi - res + elif im(ndir).is_positive: + if arg0.is_positive: + return pi - res + else: + return self.rewrite(log)._eval_nseries(x, n, logx=logx, cdir=cdir) + return res + + def _eval_rewrite_as_acos(self, x, **kwargs): + return pi/2 - acos(x) + + def _eval_rewrite_as_atan(self, x, **kwargs): + return 2*atan(x/(1 + sqrt(1 - x**2))) + + def _eval_rewrite_as_log(self, x, **kwargs): + return -S.ImaginaryUnit*log(S.ImaginaryUnit*x + sqrt(1 - x**2)) + + _eval_rewrite_as_tractable = _eval_rewrite_as_log + + def _eval_rewrite_as_acot(self, arg, **kwargs): + return 2*acot((1 + sqrt(1 - arg**2))/arg) + + def _eval_rewrite_as_asec(self, arg, **kwargs): + return pi/2 - asec(1/arg) + + def _eval_rewrite_as_acsc(self, arg, **kwargs): + return acsc(1/arg) + + def _eval_is_extended_real(self): + x = self.args[0] + return x.is_extended_real and (1 - abs(x)).is_nonnegative + + def inverse(self, argindex=1): + """ + Returns the inverse of this function. + """ + return sin + + +class acos(InverseTrigonometricFunction): + r""" + The inverse cosine function. + + Explanation + =========== + + Returns the arc cosine of x (measured in radians). + + ``acos(x)`` will evaluate automatically in the cases + $x \in \{\infty, -\infty, 0, 1, -1\}$ and for some instances when + the result is a rational multiple of $\pi$ (see the eval class method). 
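+ For instance, ``acos(S.Half)`` evaluates to ``pi/3`` and ``acos(-S.Half)`` to ``2*pi/3``.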
+ + ``acos(zoo)`` evaluates to ``zoo`` + (see note in :class:`sympy.functions.elementary.trigonometric.asec`) + + A purely imaginary argument will be rewritten to asinh. + + Examples + ======== + + >>> from sympy import acos, oo + >>> acos(1) + 0 + >>> acos(0) + pi/2 + >>> acos(oo) + oo*I + + See Also + ======== + + sin, csc, cos, sec, tan, cot + asin, acsc, asec, atan, acot, atan2 + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Inverse_trigonometric_functions + .. [2] https://dlmf.nist.gov/4.23 + .. [3] https://functions.wolfram.com/ElementaryFunctions/ArcCos + + """ + + def fdiff(self, argindex=1): + if argindex == 1: + return -1/sqrt(1 - self.args[0]**2) + else: + raise ArgumentIndexError(self, argindex) + + def _eval_is_rational(self): + s = self.func(*self.args) + if s.func == self.func: + if s.args[0].is_rational: + return False + else: + return s.is_rational + + @classmethod + def eval(cls, arg): + if arg.is_Number: + if arg is S.NaN: + return S.NaN + elif arg is S.Infinity: + return S.Infinity*S.ImaginaryUnit + elif arg is S.NegativeInfinity: + return S.NegativeInfinity*S.ImaginaryUnit + elif arg.is_zero: + return pi/2 + elif arg is S.One: + return S.Zero + elif arg is S.NegativeOne: + return pi + + if arg is S.ComplexInfinity: + return S.ComplexInfinity + + if arg.is_number: + asin_table = cls._asin_table() + if arg in asin_table: + return pi/2 - asin_table[arg] + elif -arg in asin_table: + return pi/2 + asin_table[-arg] + + i_coeff = _imaginary_unit_as_coefficient(arg) + if i_coeff is not None: + return pi/2 - asin(arg) + + if arg.is_Mul and len(arg.args) == 2 and arg.args[0] == -1: + narg = arg.args[1] + minus = True + else: + narg = arg + minus = False + + if isinstance(narg, cos): + # acos(cos(x)) = x or acos(-cos(x)) = pi - x + ang = narg.args[0] + if ang.is_comparable: + if minus: + ang = pi - ang + ang %= 2*pi # restrict to [0,2*pi) + if ang > pi: # restrict to [0,pi] + ang = 2*pi - ang + return ang + + if isinstance(narg, sin): # acos(x) + asin(x) = pi/2 + ang = narg.args[0] + if ang.is_comparable: + if minus: + return pi/2 + asin(narg) + return pi/2 - asin(narg) + + @staticmethod + @cacheit + def taylor_term(n, x, *previous_terms): + if n == 0: + return pi/2 + elif n < 0 or n % 2 == 0: + return S.Zero + else: + x = sympify(x) + if len(previous_terms) >= 2 and n > 2: + p = previous_terms[-2] + return p*(n - 2)**2/(n*(n - 1))*x**2 + else: + k = (n - 1) // 2 + R = RisingFactorial(S.Half, k) + F = factorial(k) + return -R/F*x**n/n + + def _eval_as_leading_term(self, x, logx, cdir): + arg = self.args[0] + x0 = arg.subs(x, 0).cancel() + if x0 is S.NaN: + return self.func(arg.as_leading_term(x)) + # Handling branch points + if x0 == 1: + return sqrt(2)*sqrt((S.One - arg).as_leading_term(x)) + if x0 in (-S.One, S.ComplexInfinity): + return self.rewrite(log)._eval_as_leading_term(x, logx=logx, cdir=cdir) + # Handling points lying on branch cuts (-oo, -1) U (1, oo) + if (1 - x0**2).is_negative: + ndir = arg.dir(x, cdir if cdir else 1) + if im(ndir).is_negative: + if x0.is_negative: + return 2*pi - self.func(x0) + elif im(ndir).is_positive: + if x0.is_positive: + return -self.func(x0) + else: + return self.rewrite(log)._eval_as_leading_term(x, logx=logx, cdir=cdir).expand() + return self.func(x0) + + def _eval_is_extended_real(self): + x = self.args[0] + return x.is_extended_real and (1 - abs(x)).is_nonnegative + + def _eval_is_nonnegative(self): + return self._eval_is_extended_real() + + def _eval_nseries(self, x, n, logx, cdir=0): # acos + from 
sympy.series.order import O + arg0 = self.args[0].subs(x, 0) + # Handling branch points + if arg0 is S.One: + t = Dummy('t', positive=True) + ser = acos(S.One - t**2).rewrite(log).nseries(t, 0, 2*n) + arg1 = S.One - self.args[0] + f = arg1.as_leading_term(x) + g = (arg1 - f)/ f + if not g.is_meromorphic(x, 0): # cannot be expanded + return O(1) if n == 0 else O(sqrt(x)) + res1 = sqrt(S.One + g)._eval_nseries(x, n=n, logx=logx) + res = (res1.removeO()*sqrt(f)).expand() + return ser.removeO().subs(t, res).expand().powsimp() + O(x**n, x) + + if arg0 is S.NegativeOne: + t = Dummy('t', positive=True) + ser = acos(S.NegativeOne + t**2).rewrite(log).nseries(t, 0, 2*n) + arg1 = S.One + self.args[0] + f = arg1.as_leading_term(x) + g = (arg1 - f)/ f + if not g.is_meromorphic(x, 0): # cannot be expanded + return O(1) if n == 0 else pi + O(sqrt(x)) + res1 = sqrt(S.One + g)._eval_nseries(x, n=n, logx=logx) + res = (res1.removeO()*sqrt(f)).expand() + return ser.removeO().subs(t, res).expand().powsimp() + O(x**n, x) + + res = super()._eval_nseries(x, n=n, logx=logx) + if arg0 is S.ComplexInfinity: + return res + # Handling points lying on branch cuts (-oo, -1) U (1, oo) + if (1 - arg0**2).is_negative: + ndir = self.args[0].dir(x, cdir if cdir else 1) + if im(ndir).is_negative: + if arg0.is_negative: + return 2*pi - res + elif im(ndir).is_positive: + if arg0.is_positive: + return -res + else: + return self.rewrite(log)._eval_nseries(x, n, logx=logx, cdir=cdir) + return res + + def _eval_rewrite_as_log(self, x, **kwargs): + return pi/2 + S.ImaginaryUnit*\ + log(S.ImaginaryUnit*x + sqrt(1 - x**2)) + + _eval_rewrite_as_tractable = _eval_rewrite_as_log + + def _eval_rewrite_as_asin(self, x, **kwargs): + return pi/2 - asin(x) + + def _eval_rewrite_as_atan(self, x, **kwargs): + return atan(sqrt(1 - x**2)/x) + (pi/2)*(1 - x*sqrt(1/x**2)) + + def inverse(self, argindex=1): + """ + Returns the inverse of this function. + """ + return cos + + def _eval_rewrite_as_acot(self, arg, **kwargs): + return pi/2 - 2*acot((1 + sqrt(1 - arg**2))/arg) + + def _eval_rewrite_as_asec(self, arg, **kwargs): + return asec(1/arg) + + def _eval_rewrite_as_acsc(self, arg, **kwargs): + return pi/2 - acsc(1/arg) + + def _eval_conjugate(self): + z = self.args[0] + r = self.func(self.args[0].conjugate()) + if z.is_extended_real is False: + return r + elif z.is_extended_real and (z + 1).is_nonnegative and (z - 1).is_nonpositive: + return r + + +class atan(InverseTrigonometricFunction): + r""" + The inverse tangent function. + + Returns the arc tangent of x (measured in radians). + + Explanation + =========== + + ``atan(x)`` will evaluate automatically in the cases + $x \in \{\infty, -\infty, 0, 1, -1\}$ and for some instances when the + result is a rational multiple of $\pi$ (see the eval class method). + + Examples + ======== + + >>> from sympy import atan, oo + >>> atan(0) + 0 + >>> atan(1) + pi/4 + >>> atan(oo) + pi/2 + + See Also + ======== + + sin, csc, cos, sec, tan, cot + asin, acsc, acos, asec, acot, atan2 + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Inverse_trigonometric_functions + .. [2] https://dlmf.nist.gov/4.23 + .. 
[3] https://functions.wolfram.com/ElementaryFunctions/ArcTan + + """ + + args: tuple[Expr] + + _singularities = (S.ImaginaryUnit, -S.ImaginaryUnit) + + def fdiff(self, argindex=1): + if argindex == 1: + return 1/(1 + self.args[0]**2) + else: + raise ArgumentIndexError(self, argindex) + + def _eval_is_rational(self): + s = self.func(*self.args) + if s.func == self.func: + if s.args[0].is_rational: + return False + else: + return s.is_rational + + def _eval_is_positive(self): + return self.args[0].is_extended_positive + + def _eval_is_nonnegative(self): + return self.args[0].is_extended_nonnegative + + def _eval_is_zero(self): + return self.args[0].is_zero + + def _eval_is_real(self): + return self.args[0].is_extended_real + + @classmethod + def eval(cls, arg): + if arg.is_Number: + if arg is S.NaN: + return S.NaN + elif arg is S.Infinity: + return pi/2 + elif arg is S.NegativeInfinity: + return -pi/2 + elif arg.is_zero: + return S.Zero + elif arg is S.One: + return pi/4 + elif arg is S.NegativeOne: + return -pi/4 + + if arg is S.ComplexInfinity: + from sympy.calculus.accumulationbounds import AccumBounds + return AccumBounds(-pi/2, pi/2) + + if arg.could_extract_minus_sign(): + return -cls(-arg) + + if arg.is_number: + atan_table = cls._atan_table() + if arg in atan_table: + return atan_table[arg] + + i_coeff = _imaginary_unit_as_coefficient(arg) + if i_coeff is not None: + from sympy.functions.elementary.hyperbolic import atanh + return S.ImaginaryUnit*atanh(i_coeff) + + if arg.is_zero: + return S.Zero + + if isinstance(arg, tan): + ang = arg.args[0] + if ang.is_comparable: + ang %= pi # restrict to [0,pi) + if ang > pi/2: # restrict to [-pi/2,pi/2] + ang -= pi + + return ang + + if isinstance(arg, cot): # atan(x) + acot(x) = pi/2 + ang = arg.args[0] + if ang.is_comparable: + ang = pi/2 - acot(arg) + if ang > pi/2: # restrict to [-pi/2,pi/2] + ang -= pi + return ang + + @staticmethod + @cacheit + def taylor_term(n, x, *previous_terms): + if n < 0 or n % 2 == 0: + return S.Zero + else: + x = sympify(x) + return S.NegativeOne**((n - 1)//2)*x**n/n + + def _eval_as_leading_term(self, x, logx, cdir): + arg = self.args[0] + x0 = arg.subs(x, 0).cancel() + if x0 is S.NaN: + return self.func(arg.as_leading_term(x)) + if x0.is_zero: + return arg.as_leading_term(x) + # Handling branch points + if x0 in (-S.ImaginaryUnit, S.ImaginaryUnit, S.ComplexInfinity): + return self.rewrite(log)._eval_as_leading_term(x, logx=logx, cdir=cdir).expand() + # Handling points lying on branch cuts (-I*oo, -I) U (I, I*oo) + if (1 + x0**2).is_negative: + ndir = arg.dir(x, cdir if cdir else 1) + if re(ndir).is_negative: + if im(x0).is_positive: + return self.func(x0) - pi + elif re(ndir).is_positive: + if im(x0).is_negative: + return self.func(x0) + pi + else: + return self.rewrite(log)._eval_as_leading_term(x, logx=logx, cdir=cdir).expand() + return self.func(x0) + + def _eval_nseries(self, x, n, logx, cdir=0): # atan + arg0 = self.args[0].subs(x, 0) + + # Handling branch points + if arg0 in (S.ImaginaryUnit, S.NegativeOne*S.ImaginaryUnit): + return self.rewrite(log)._eval_nseries(x, n, logx=logx, cdir=cdir) + + res = super()._eval_nseries(x, n=n, logx=logx) + ndir = self.args[0].dir(x, cdir if cdir else 1) + if arg0 is S.ComplexInfinity: + if re(ndir) > 0: + return res - pi + return res + # Handling points lying on branch cuts (-I*oo, -I) U (I, I*oo) + if (1 + arg0**2).is_negative: + if re(ndir).is_negative: + if im(arg0).is_positive: + return res - pi + elif re(ndir).is_positive: + if im(arg0).is_negative: + return res 
+ pi + else: + return self.rewrite(log)._eval_nseries(x, n, logx=logx, cdir=cdir) + return res + + def _eval_rewrite_as_log(self, x, **kwargs): + return S.ImaginaryUnit/2*(log(S.One - S.ImaginaryUnit*x) + - log(S.One + S.ImaginaryUnit*x)) + + _eval_rewrite_as_tractable = _eval_rewrite_as_log + + def _eval_aseries(self, n, args0, x, logx): + if args0[0] in [S.Infinity, S.NegativeInfinity]: + return (pi/2 - atan(1/self.args[0]))._eval_nseries(x, n, logx) + else: + return super()._eval_aseries(n, args0, x, logx) + + def inverse(self, argindex=1): + """ + Returns the inverse of this function. + """ + return tan + + def _eval_rewrite_as_asin(self, arg, **kwargs): + return sqrt(arg**2)/arg*(pi/2 - asin(1/sqrt(1 + arg**2))) + + def _eval_rewrite_as_acos(self, arg, **kwargs): + return sqrt(arg**2)/arg*acos(1/sqrt(1 + arg**2)) + + def _eval_rewrite_as_acot(self, arg, **kwargs): + return acot(1/arg) + + def _eval_rewrite_as_asec(self, arg, **kwargs): + return sqrt(arg**2)/arg*asec(sqrt(1 + arg**2)) + + def _eval_rewrite_as_acsc(self, arg, **kwargs): + return sqrt(arg**2)/arg*(pi/2 - acsc(sqrt(1 + arg**2))) + + +class acot(InverseTrigonometricFunction): + r""" + The inverse cotangent function. + + Returns the arc cotangent of x (measured in radians). + + Explanation + =========== + + ``acot(x)`` will evaluate automatically in the cases + $x \in \{\infty, -\infty, \tilde{\infty}, 0, 1, -1\}$ + and for some instances when the result is a rational multiple of $\pi$ + (see the eval class method). + + A purely imaginary argument will lead to an ``acoth`` expression. + + ``acot(x)`` has a branch cut along $(-i, i)$, hence it is discontinuous + at 0. Its range for real $x$ is $(-\frac{\pi}{2}, \frac{\pi}{2}]$. + + Examples + ======== + + >>> from sympy import acot, sqrt + >>> acot(0) + pi/2 + >>> acot(1) + pi/4 + >>> acot(sqrt(3) - 2) + -5*pi/12 + + See Also + ======== + + sin, csc, cos, sec, tan, cot + asin, acsc, acos, asec, atan, atan2 + + References + ========== + + .. [1] https://dlmf.nist.gov/4.23 + .. 
[2] https://functions.wolfram.com/ElementaryFunctions/ArcCot + + """ + _singularities = (S.ImaginaryUnit, -S.ImaginaryUnit) + + def fdiff(self, argindex=1): + if argindex == 1: + return -1/(1 + self.args[0]**2) + else: + raise ArgumentIndexError(self, argindex) + + def _eval_is_rational(self): + s = self.func(*self.args) + if s.func == self.func: + if s.args[0].is_rational: + return False + else: + return s.is_rational + + def _eval_is_positive(self): + return self.args[0].is_nonnegative + + def _eval_is_negative(self): + return self.args[0].is_negative + + def _eval_is_extended_real(self): + return self.args[0].is_extended_real + + @classmethod + def eval(cls, arg): + if arg.is_Number: + if arg is S.NaN: + return S.NaN + elif arg is S.Infinity: + return S.Zero + elif arg is S.NegativeInfinity: + return S.Zero + elif arg.is_zero: + return pi/ 2 + elif arg is S.One: + return pi/4 + elif arg is S.NegativeOne: + return -pi/4 + + if arg is S.ComplexInfinity: + return S.Zero + + if arg.could_extract_minus_sign(): + return -cls(-arg) + + if arg.is_number: + atan_table = cls._atan_table() + if arg in atan_table: + ang = pi/2 - atan_table[arg] + if ang > pi/2: # restrict to (-pi/2,pi/2] + ang -= pi + return ang + + i_coeff = _imaginary_unit_as_coefficient(arg) + if i_coeff is not None: + from sympy.functions.elementary.hyperbolic import acoth + return -S.ImaginaryUnit*acoth(i_coeff) + + if arg.is_zero: + return pi*S.Half + + if isinstance(arg, cot): + ang = arg.args[0] + if ang.is_comparable: + ang %= pi # restrict to [0,pi) + if ang > pi/2: # restrict to (-pi/2,pi/2] + ang -= pi + return ang + + if isinstance(arg, tan): # atan(x) + acot(x) = pi/2 + ang = arg.args[0] + if ang.is_comparable: + ang = pi/2 - atan(arg) + if ang > pi/2: # restrict to (-pi/2,pi/2] + ang -= pi + return ang + + @staticmethod + @cacheit + def taylor_term(n, x, *previous_terms): + if n == 0: + return pi/2 # FIX THIS + elif n < 0 or n % 2 == 0: + return S.Zero + else: + x = sympify(x) + return S.NegativeOne**((n + 1)//2)*x**n/n + + def _eval_as_leading_term(self, x, logx, cdir): + arg = self.args[0] + x0 = arg.subs(x, 0).cancel() + if x0 is S.NaN: + return self.func(arg.as_leading_term(x)) + if x0 is S.ComplexInfinity: + return (1/arg).as_leading_term(x) + # Handling branch points + if x0 in (-S.ImaginaryUnit, S.ImaginaryUnit, S.Zero): + return self.rewrite(log)._eval_as_leading_term(x, logx=logx, cdir=cdir).expand() + # Handling points lying on branch cuts [-I, I] + if x0.is_imaginary and (1 + x0**2).is_positive: + ndir = arg.dir(x, cdir if cdir else 1) + if re(ndir).is_positive: + if im(x0).is_positive: + return self.func(x0) + pi + elif re(ndir).is_negative: + if im(x0).is_negative: + return self.func(x0) - pi + else: + return self.rewrite(log)._eval_as_leading_term(x, logx=logx, cdir=cdir).expand() + return self.func(x0) + + def _eval_nseries(self, x, n, logx, cdir=0): # acot + arg0 = self.args[0].subs(x, 0) + + # Handling branch points + if arg0 in (S.ImaginaryUnit, S.NegativeOne*S.ImaginaryUnit): + return self.rewrite(log)._eval_nseries(x, n, logx=logx, cdir=cdir) + + res = super()._eval_nseries(x, n=n, logx=logx) + if arg0 is S.ComplexInfinity: + return res + ndir = self.args[0].dir(x, cdir if cdir else 1) + if arg0.is_zero: + if re(ndir) < 0: + return res - pi + return res + # Handling points lying on branch cuts [-I, I] + if arg0.is_imaginary and (1 + arg0**2).is_positive: + if re(ndir).is_positive: + if im(arg0).is_positive: + return res + pi + elif re(ndir).is_negative: + if im(arg0).is_negative: + return res - pi 
+ else: + return self.rewrite(log)._eval_nseries(x, n, logx=logx, cdir=cdir) + return res + + def _eval_aseries(self, n, args0, x, logx): + if args0[0] in [S.Infinity, S.NegativeInfinity]: + return atan(1/self.args[0])._eval_nseries(x, n, logx) + else: + return super()._eval_aseries(n, args0, x, logx) + + def _eval_rewrite_as_log(self, x, **kwargs): + return S.ImaginaryUnit/2*(log(1 - S.ImaginaryUnit/x) + - log(1 + S.ImaginaryUnit/x)) + + _eval_rewrite_as_tractable = _eval_rewrite_as_log + + def inverse(self, argindex=1): + """ + Returns the inverse of this function. + """ + return cot + + def _eval_rewrite_as_asin(self, arg, **kwargs): + return (arg*sqrt(1/arg**2)* + (pi/2 - asin(sqrt(-arg**2)/sqrt(-arg**2 - 1)))) + + def _eval_rewrite_as_acos(self, arg, **kwargs): + return arg*sqrt(1/arg**2)*acos(sqrt(-arg**2)/sqrt(-arg**2 - 1)) + + def _eval_rewrite_as_atan(self, arg, **kwargs): + return atan(1/arg) + + def _eval_rewrite_as_asec(self, arg, **kwargs): + return arg*sqrt(1/arg**2)*asec(sqrt((1 + arg**2)/arg**2)) + + def _eval_rewrite_as_acsc(self, arg, **kwargs): + return arg*sqrt(1/arg**2)*(pi/2 - acsc(sqrt((1 + arg**2)/arg**2))) + + +class asec(InverseTrigonometricFunction): + r""" + The inverse secant function. + + Returns the arc secant of x (measured in radians). + + Explanation + =========== + + ``asec(x)`` will evaluate automatically in the cases + $x \in \{\infty, -\infty, 0, 1, -1\}$ and for some instances when the + result is a rational multiple of $\pi$ (see the eval class method). + + ``asec(x)`` has branch cut in the interval $[-1, 1]$. For complex arguments, + it can be defined [4]_ as + + .. math:: + \operatorname{sec^{-1}}(z) = -i\frac{\log\left(\sqrt{1 - z^2} + 1\right)}{z} + + At ``x = 0``, for positive branch cut, the limit evaluates to ``zoo``. For + negative branch cut, the limit + + .. math:: + \lim_{z \to 0}-i\frac{\log\left(-\sqrt{1 - z^2} + 1\right)}{z} + + simplifies to :math:`-i\log\left(z/2 + O\left(z^3\right)\right)` which + ultimately evaluates to ``zoo``. + + As ``acos(x) = asec(1/x)``, a similar argument can be given for + ``acos(x)``. + + Examples + ======== + + >>> from sympy import asec, oo + >>> asec(1) + 0 + >>> asec(-1) + pi + >>> asec(0) + zoo + >>> asec(-oo) + pi/2 + + See Also + ======== + + sin, csc, cos, sec, tan, cot + asin, acsc, acos, atan, acot, atan2 + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Inverse_trigonometric_functions + .. [2] https://dlmf.nist.gov/4.23 + .. [3] https://functions.wolfram.com/ElementaryFunctions/ArcSec + .. 
[4] https://reference.wolfram.com/language/ref/ArcSec.html + + """ + + @classmethod + def eval(cls, arg): + if arg.is_zero: + return S.ComplexInfinity + if arg.is_Number: + if arg is S.NaN: + return S.NaN + elif arg is S.One: + return S.Zero + elif arg is S.NegativeOne: + return pi + if arg in [S.Infinity, S.NegativeInfinity, S.ComplexInfinity]: + return pi/2 + + if arg.is_number: + acsc_table = cls._acsc_table() + if arg in acsc_table: + return pi/2 - acsc_table[arg] + elif -arg in acsc_table: + return pi/2 + acsc_table[-arg] + + if arg.is_infinite: + return pi/2 + + if arg.is_Mul and len(arg.args) == 2 and arg.args[0] == -1: + narg = arg.args[1] + minus = True + else: + narg = arg + minus = False + + if isinstance(narg, sec): + # asec(sec(x)) = x or asec(-sec(x)) = pi - x + ang = narg.args[0] + if ang.is_comparable: + if minus: + ang = pi - ang + ang %= 2*pi # restrict to [0,2*pi) + if ang > pi: # restrict to [0,pi] + ang = 2*pi - ang + return ang + + if isinstance(narg, csc): # asec(x) + acsc(x) = pi/2 + ang = narg.args[0] + if ang.is_comparable: + if minus: + return pi/2 + acsc(narg) + return pi/2 - acsc(narg) + + def fdiff(self, argindex=1): + if argindex == 1: + return 1/(self.args[0]**2*sqrt(1 - 1/self.args[0]**2)) + else: + raise ArgumentIndexError(self, argindex) + + def inverse(self, argindex=1): + """ + Returns the inverse of this function. + """ + return sec + + @staticmethod + @cacheit + def taylor_term(n, x, *previous_terms): + if n == 0: + return S.ImaginaryUnit*log(2 / x) + elif n < 0 or n % 2 == 1: + return S.Zero + else: + x = sympify(x) + if len(previous_terms) > 2 and n > 2: + p = previous_terms[-2] + return p * ((n - 1)*(n-2)) * x**2/(4 * (n//2)**2) + else: + k = n // 2 + R = RisingFactorial(S.Half, k) * n + F = factorial(k) * n // 2 * n // 2 + return -S.ImaginaryUnit * R / F * x**n / 4 + + def _eval_as_leading_term(self, x, logx, cdir): + arg = self.args[0] + x0 = arg.subs(x, 0).cancel() + if x0 is S.NaN: + return self.func(arg.as_leading_term(x)) + # Handling branch points + if x0 == 1: + return sqrt(2)*sqrt((arg - S.One).as_leading_term(x)) + if x0 in (-S.One, S.Zero): + return self.rewrite(log)._eval_as_leading_term(x, logx=logx, cdir=cdir) + # Handling points lying on branch cuts (-1, 1) + if x0.is_real and (1 - x0**2).is_positive: + ndir = arg.dir(x, cdir if cdir else 1) + if im(ndir).is_negative: + if x0.is_positive: + return -self.func(x0) + elif im(ndir).is_positive: + if x0.is_negative: + return 2*pi - self.func(x0) + else: + return self.rewrite(log)._eval_as_leading_term(x, logx=logx, cdir=cdir).expand() + return self.func(x0) + + def _eval_nseries(self, x, n, logx, cdir=0): # asec + from sympy.series.order import O + arg0 = self.args[0].subs(x, 0) + # Handling branch points + if arg0 is S.One: + t = Dummy('t', positive=True) + ser = asec(S.One + t**2).rewrite(log).nseries(t, 0, 2*n) + arg1 = S.NegativeOne + self.args[0] + f = arg1.as_leading_term(x) + g = (arg1 - f)/ f + res1 = sqrt(S.One + g)._eval_nseries(x, n=n, logx=logx) + res = (res1.removeO()*sqrt(f)).expand() + return ser.removeO().subs(t, res).expand().powsimp() + O(x**n, x) + + if arg0 is S.NegativeOne: + t = Dummy('t', positive=True) + ser = asec(S.NegativeOne - t**2).rewrite(log).nseries(t, 0, 2*n) + arg1 = S.NegativeOne - self.args[0] + f = arg1.as_leading_term(x) + g = (arg1 - f)/ f + res1 = sqrt(S.One + g)._eval_nseries(x, n=n, logx=logx) + res = (res1.removeO()*sqrt(f)).expand() + return ser.removeO().subs(t, res).expand().powsimp() + O(x**n, x) + + res = super()._eval_nseries(x, n=n, logx=logx) +
if arg0 is S.ComplexInfinity: + return res + # Handling points lying on branch cuts (-1, 1) + if arg0.is_real and (1 - arg0**2).is_positive: + ndir = self.args[0].dir(x, cdir if cdir else 1) + if im(ndir).is_negative: + if arg0.is_positive: + return -res + elif im(ndir).is_positive: + if arg0.is_negative: + return 2*pi - res + else: + return self.rewrite(log)._eval_nseries(x, n, logx=logx, cdir=cdir) + return res + + def _eval_is_extended_real(self): + x = self.args[0] + if x.is_extended_real is False: + return False + return fuzzy_or(((x - 1).is_nonnegative, (-x - 1).is_nonnegative)) + + def _eval_rewrite_as_log(self, arg, **kwargs): + return pi/2 + S.ImaginaryUnit*log(S.ImaginaryUnit/arg + sqrt(1 - 1/arg**2)) + + _eval_rewrite_as_tractable = _eval_rewrite_as_log + + def _eval_rewrite_as_asin(self, arg, **kwargs): + return pi/2 - asin(1/arg) + + def _eval_rewrite_as_acos(self, arg, **kwargs): + return acos(1/arg) + + def _eval_rewrite_as_atan(self, x, **kwargs): + sx2x = sqrt(x**2)/x + return pi/2*(1 - sx2x) + sx2x*atan(sqrt(x**2 - 1)) + + def _eval_rewrite_as_acot(self, x, **kwargs): + sx2x = sqrt(x**2)/x + return pi/2*(1 - sx2x) + sx2x*acot(1/sqrt(x**2 - 1)) + + def _eval_rewrite_as_acsc(self, arg, **kwargs): + return pi/2 - acsc(arg) + + +class acsc(InverseTrigonometricFunction): + r""" + The inverse cosecant function. + + Returns the arc cosecant of x (measured in radians). + + Explanation + =========== + + ``acsc(x)`` will evaluate automatically in the cases + $x \in \{\infty, -\infty, 0, 1, -1\}$` and for some instances when the + result is a rational multiple of $\pi$ (see the ``eval`` class method). + + Examples + ======== + + >>> from sympy import acsc, oo + >>> acsc(1) + pi/2 + >>> acsc(-1) + -pi/2 + >>> acsc(oo) + 0 + >>> acsc(-oo) == acsc(oo) + True + >>> acsc(0) + zoo + + See Also + ======== + + sin, csc, cos, sec, tan, cot + asin, acos, asec, atan, acot, atan2 + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Inverse_trigonometric_functions + .. [2] https://dlmf.nist.gov/4.23 + .. [3] https://functions.wolfram.com/ElementaryFunctions/ArcCsc + + """ + + @classmethod + def eval(cls, arg): + if arg.is_zero: + return S.ComplexInfinity + if arg.is_Number: + if arg is S.NaN: + return S.NaN + elif arg is S.One: + return pi/2 + elif arg is S.NegativeOne: + return -pi/2 + if arg in [S.Infinity, S.NegativeInfinity, S.ComplexInfinity]: + return S.Zero + + if arg.could_extract_minus_sign(): + return -cls(-arg) + + if arg.is_infinite: + return S.Zero + + if arg.is_number: + acsc_table = cls._acsc_table() + if arg in acsc_table: + return acsc_table[arg] + + if isinstance(arg, csc): + ang = arg.args[0] + if ang.is_comparable: + ang %= 2*pi # restrict to [0,2*pi) + if ang > pi: # restrict to (-pi,pi] + ang = pi - ang + + # restrict to [-pi/2,pi/2] + if ang > pi/2: + ang = pi - ang + if ang < -pi/2: + ang = -pi - ang + + return ang + + if isinstance(arg, sec): # asec(x) + acsc(x) = pi/2 + ang = arg.args[0] + if ang.is_comparable: + return pi/2 - asec(arg) + + def fdiff(self, argindex=1): + if argindex == 1: + return -1/(self.args[0]**2*sqrt(1 - 1/self.args[0]**2)) + else: + raise ArgumentIndexError(self, argindex) + + def inverse(self, argindex=1): + """ + Returns the inverse of this function. 
+ """ + return csc + + @staticmethod + @cacheit + def taylor_term(n, x, *previous_terms): + if n == 0: + return pi/2 - S.ImaginaryUnit*log(2) + S.ImaginaryUnit*log(x) + elif n < 0 or n % 2 == 1: + return S.Zero + else: + x = sympify(x) + if len(previous_terms) > 2 and n > 2: + p = previous_terms[-2] + return p * ((n - 1)*(n-2)) * x**2/(4 * (n//2)**2) + else: + k = n // 2 + R = RisingFactorial(S.Half, k) * n + F = factorial(k) * n // 2 * n // 2 + return S.ImaginaryUnit * R / F * x**n / 4 + + def _eval_as_leading_term(self, x, logx, cdir): + arg = self.args[0] + x0 = arg.subs(x, 0).cancel() + if x0 is S.NaN: + return self.func(arg.as_leading_term(x)) + # Handling branch points + if x0 in (-S.One, S.One, S.Zero): + return self.rewrite(log)._eval_as_leading_term(x, logx=logx, cdir=cdir).expand() + if x0 is S.ComplexInfinity: + return (1/arg).as_leading_term(x) + # Handling points lying on branch cuts (-1, 1) + if x0.is_real and (1 - x0**2).is_positive: + ndir = arg.dir(x, cdir if cdir else 1) + if im(ndir).is_negative: + if x0.is_positive: + return pi - self.func(x0) + elif im(ndir).is_positive: + if x0.is_negative: + return -pi - self.func(x0) + else: + return self.rewrite(log)._eval_as_leading_term(x, logx=logx, cdir=cdir).expand() + return self.func(x0) + + def _eval_nseries(self, x, n, logx, cdir=0): # acsc + from sympy.series.order import O + arg0 = self.args[0].subs(x, 0) + # Handling branch points + if arg0 is S.One: + t = Dummy('t', positive=True) + ser = acsc(S.One + t**2).rewrite(log).nseries(t, 0, 2*n) + arg1 = S.NegativeOne + self.args[0] + f = arg1.as_leading_term(x) + g = (arg1 - f)/ f + res1 = sqrt(S.One + g)._eval_nseries(x, n=n, logx=logx) + res = (res1.removeO()*sqrt(f)).expand() + return ser.removeO().subs(t, res).expand().powsimp() + O(x**n, x) + + if arg0 is S.NegativeOne: + t = Dummy('t', positive=True) + ser = acsc(S.NegativeOne - t**2).rewrite(log).nseries(t, 0, 2*n) + arg1 = S.NegativeOne - self.args[0] + f = arg1.as_leading_term(x) + g = (arg1 - f)/ f + res1 = sqrt(S.One + g)._eval_nseries(x, n=n, logx=logx) + res = (res1.removeO()*sqrt(f)).expand() + return ser.removeO().subs(t, res).expand().powsimp() + O(x**n, x) + + res = super()._eval_nseries(x, n=n, logx=logx) + if arg0 is S.ComplexInfinity: + return res + # Handling points lying on branch cuts (-1, 1) + if arg0.is_real and (1 - arg0**2).is_positive: + ndir = self.args[0].dir(x, cdir if cdir else 1) + if im(ndir).is_negative: + if arg0.is_positive: + return pi - res + elif im(ndir).is_positive: + if arg0.is_negative: + return -pi - res + else: + return self.rewrite(log)._eval_nseries(x, n, logx=logx, cdir=cdir) + return res + + def _eval_rewrite_as_log(self, arg, **kwargs): + return -S.ImaginaryUnit*log(S.ImaginaryUnit/arg + sqrt(1 - 1/arg**2)) + + _eval_rewrite_as_tractable = _eval_rewrite_as_log + + def _eval_rewrite_as_asin(self, arg, **kwargs): + return asin(1/arg) + + def _eval_rewrite_as_acos(self, arg, **kwargs): + return pi/2 - acos(1/arg) + + def _eval_rewrite_as_atan(self, x, **kwargs): + return sqrt(x**2)/x*(pi/2 - atan(sqrt(x**2 - 1))) + + def _eval_rewrite_as_acot(self, arg, **kwargs): + return sqrt(arg**2)/arg*(pi/2 - acot(1/sqrt(arg**2 - 1))) + + def _eval_rewrite_as_asec(self, arg, **kwargs): + return pi/2 - asec(arg) + + +class atan2(InverseTrigonometricFunction): + r""" + The function ``atan2(y, x)`` computes `\operatorname{atan}(y/x)` taking + two arguments `y` and `x`. Signs of both `y` and `x` are considered to + determine the appropriate quadrant of `\operatorname{atan}(y/x)`. 
+ The range is `(-\pi, \pi]`. The complete definition reads as follows: + + .. math:: + + \operatorname{atan2}(y, x) = + \begin{cases} + \arctan\left(\frac y x\right) & \qquad x > 0 \\ + \arctan\left(\frac y x\right) + \pi& \qquad y \ge 0, x < 0 \\ + \arctan\left(\frac y x\right) - \pi& \qquad y < 0, x < 0 \\ + +\frac{\pi}{2} & \qquad y > 0, x = 0 \\ + -\frac{\pi}{2} & \qquad y < 0, x = 0 \\ + \text{undefined} & \qquad y = 0, x = 0 + \end{cases} + + Attention: Note the role reversal of both arguments. The `y`-coordinate + is the first argument and the `x`-coordinate the second. + + If either `x` or `y` is complex: + + .. math:: + + \operatorname{atan2}(y, x) = + -i\log\left(\frac{x + iy}{\sqrt{x^2 + y^2}}\right) + + Examples + ======== + + Going counter-clock wise around the origin we find the + following angles: + + >>> from sympy import atan2 + >>> atan2(0, 1) + 0 + >>> atan2(1, 1) + pi/4 + >>> atan2(1, 0) + pi/2 + >>> atan2(1, -1) + 3*pi/4 + >>> atan2(0, -1) + pi + >>> atan2(-1, -1) + -3*pi/4 + >>> atan2(-1, 0) + -pi/2 + >>> atan2(-1, 1) + -pi/4 + + which are all correct. Compare this to the results of the ordinary + `\operatorname{atan}` function for the point `(x, y) = (-1, 1)` + + >>> from sympy import atan, S + >>> atan(S(1)/-1) + -pi/4 + >>> atan2(1, -1) + 3*pi/4 + + where only the `\operatorname{atan2}` function returns what we expect. + We can differentiate the function with respect to both arguments: + + >>> from sympy import diff + >>> from sympy.abc import x, y + >>> diff(atan2(y, x), x) + -y/(x**2 + y**2) + + >>> diff(atan2(y, x), y) + x/(x**2 + y**2) + + We can express the `\operatorname{atan2}` function in terms of + complex logarithms: + + >>> from sympy import log + >>> atan2(y, x).rewrite(log) + -I*log((x + I*y)/sqrt(x**2 + y**2)) + + and in terms of `\operatorname(atan)`: + + >>> from sympy import atan + >>> atan2(y, x).rewrite(atan) + Piecewise((2*atan(y/(x + sqrt(x**2 + y**2))), Ne(y, 0)), (pi, re(x) < 0), (0, Ne(x, 0)), (nan, True)) + + but note that this form is undefined on the negative real axis. + + See Also + ======== + + sin, csc, cos, sec, tan, cot + asin, acsc, acos, asec, atan, acot + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Inverse_trigonometric_functions + .. [2] https://en.wikipedia.org/wiki/Atan2 + .. 
[3] https://functions.wolfram.com/ElementaryFunctions/ArcTan2 + + """ + + @classmethod + def eval(cls, y, x): + from sympy.functions.special.delta_functions import Heaviside + if x is S.NegativeInfinity: + if y.is_zero: + # Special case y = 0 because we define Heaviside(0) = 1/2 + return pi + return 2*pi*(Heaviside(re(y))) - pi + elif x is S.Infinity: + return S.Zero + elif x.is_imaginary and y.is_imaginary and x.is_number and y.is_number: + x = im(x) + y = im(y) + + if x.is_extended_real and y.is_extended_real: + if x.is_positive: + return atan(y/x) + elif x.is_negative: + if y.is_negative: + return atan(y/x) - pi + elif y.is_nonnegative: + return atan(y/x) + pi + elif x.is_zero: + if y.is_positive: + return pi/2 + elif y.is_negative: + return -pi/2 + elif y.is_zero: + return S.NaN + if y.is_zero: + if x.is_extended_nonzero: + return pi*(S.One - Heaviside(x)) + if x.is_number: + return Piecewise((pi, re(x) < 0), + (0, Ne(x, 0)), + (S.NaN, True)) + if x.is_number and y.is_number: + return -S.ImaginaryUnit*log( + (x + S.ImaginaryUnit*y)/sqrt(x**2 + y**2)) + + def _eval_rewrite_as_log(self, y, x, **kwargs): + return -S.ImaginaryUnit*log((x + S.ImaginaryUnit*y)/sqrt(x**2 + y**2)) + + def _eval_rewrite_as_atan(self, y, x, **kwargs): + return Piecewise((2*atan(y/(x + sqrt(x**2 + y**2))), Ne(y, 0)), + (pi, re(x) < 0), + (0, Ne(x, 0)), + (S.NaN, True)) + + def _eval_rewrite_as_arg(self, y, x, **kwargs): + if x.is_extended_real and y.is_extended_real: + return arg_f(x + y*S.ImaginaryUnit) + n = x + S.ImaginaryUnit*y + d = x**2 + y**2 + return arg_f(n/sqrt(d)) - S.ImaginaryUnit*log(abs(n)/sqrt(abs(d))) + + def _eval_is_extended_real(self): + return self.args[0].is_extended_real and self.args[1].is_extended_real + + def _eval_conjugate(self): + return self.func(self.args[0].conjugate(), self.args[1].conjugate()) + + def fdiff(self, argindex): + y, x = self.args + if argindex == 1: + # Diff wrt y + return x/(x**2 + y**2) + elif argindex == 2: + # Diff wrt x + return -y/(x**2 + y**2) + else: + raise ArgumentIndexError(self, argindex) + + def _eval_evalf(self, prec): + y, x = self.args + if x.is_extended_real and y.is_extended_real: + return super()._eval_evalf(prec) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/special/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/special/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ab52ace36a8dfbe73179dbf4419a54f7fa1af5fa --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/special/__init__.py @@ -0,0 +1 @@ +# Stub __init__.py for the sympy.functions.special package diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/special/bessel.py b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/special/bessel.py new file mode 100644 index 0000000000000000000000000000000000000000..a24e7dc442d2a5a9bf7047113fd81b36c6b6ba36 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/special/bessel.py @@ -0,0 +1,2208 @@ +from functools import wraps + +from sympy.core import S +from sympy.core.add import Add +from sympy.core.cache import cacheit +from sympy.core.expr import Expr +from sympy.core.function import DefinedFunction, ArgumentIndexError, _mexpand +from sympy.core.logic import fuzzy_or, fuzzy_not +from sympy.core.numbers import Rational, pi, I +from sympy.core.power import Pow +from sympy.core.symbol import Dummy, uniquely_named_symbol, Wild +from sympy.core.sympify import sympify +from 
sympy.functions.combinatorial.factorials import factorial, RisingFactorial +from sympy.functions.elementary.trigonometric import sin, cos, csc, cot +from sympy.functions.elementary.integers import ceiling +from sympy.functions.elementary.exponential import exp, log +from sympy.functions.elementary.miscellaneous import cbrt, sqrt, root +from sympy.functions.elementary.complexes import (Abs, re, im, polar_lift, unpolarify) +from sympy.functions.special.gamma_functions import gamma, digamma, uppergamma +from sympy.functions.special.hyper import hyper +from sympy.polys.orthopolys import spherical_bessel_fn + +from mpmath import mp, workprec + +# TODO +# o Scorer functions G1 and G2 +# o Asymptotic expansions +# These are possible, e.g. for fixed order, but since the bessel type +# functions are oscillatory they are not actually tractable at +# infinity, so this is not particularly useful right now. +# o Nicer series expansions. +# o More rewriting. +# o Add solvers to ode.py (or rather add solvers for the hypergeometric equation). + + +class BesselBase(DefinedFunction): + """ + Abstract base class for Bessel-type functions. + + This class is meant to reduce code duplication. + All Bessel-type functions can 1) be differentiated, with the derivatives + expressed in terms of similar functions, and 2) be rewritten in terms + of other Bessel-type functions. + + Here, Bessel-type functions are assumed to have one complex parameter. + + To use this base class, define class attributes ``_a`` and ``_b`` such that + ``2*F_n' = -_a*F_{n+1} + b*F_{n-1}``. + + """ + + @property + def order(self): + """ The order of the Bessel-type function. """ + return self.args[0] + + @property + def argument(self): + """ The argument of the Bessel-type function. """ + return self.args[1] + + @classmethod + def eval(cls, nu, z): + return + + def fdiff(self, argindex=2): + if argindex != 2: + raise ArgumentIndexError(self, argindex) + return (self._b/2 * self.__class__(self.order - 1, self.argument) - + self._a/2 * self.__class__(self.order + 1, self.argument)) + + def _eval_conjugate(self): + z = self.argument + if z.is_extended_negative is False: + return self.__class__(self.order.conjugate(), z.conjugate()) + + def _eval_is_meromorphic(self, x, a): + nu, z = self.order, self.argument + + if nu.has(x): + return False + if not z._eval_is_meromorphic(x, a): + return None + z0 = z.subs(x, a) + if nu.is_integer: + if isinstance(self, (besselj, besseli, hn1, hn2, jn, yn)) or not nu.is_zero: + return fuzzy_not(z0.is_infinite) + return fuzzy_not(fuzzy_or([z0.is_zero, z0.is_infinite])) + + def _eval_expand_func(self, **hints): + nu, z, f = self.order, self.argument, self.__class__ + if nu.is_real: + if (nu - 1).is_positive: + return (-self._a*self._b*f(nu - 2, z)._eval_expand_func() + + 2*self._a*(nu - 1)*f(nu - 1, z)._eval_expand_func()/z) + elif (nu + 1).is_negative: + return (2*self._b*(nu + 1)*f(nu + 1, z)._eval_expand_func()/z - + self._a*self._b*f(nu + 2, z)._eval_expand_func()) + return self + + def _eval_simplify(self, **kwargs): + from sympy.simplify.simplify import besselsimp + return besselsimp(self) + + +class besselj(BesselBase): + r""" + Bessel function of the first kind. + + Explanation + =========== + + The Bessel $J$ function of order $\nu$ is defined to be the function + satisfying Bessel's differential equation + + .. math :: + z^2 \frac{\mathrm{d}^2 w}{\mathrm{d}z^2} + + z \frac{\mathrm{d}w}{\mathrm{d}z} + (z^2 - \nu^2) w = 0, + + with Laurent expansion + + .. 
math :: + J_\nu(z) = z^\nu \left(\frac{1}{\Gamma(\nu + 1) 2^\nu} + O(z^2) \right), + + if $\nu$ is not a negative integer. If $\nu=-n \in \mathbb{Z}_{<0}$ + *is* a negative integer, then the definition is + + .. math :: + J_{-n}(z) = (-1)^n J_n(z). + + Examples + ======== + + Create a Bessel function object: + + >>> from sympy import besselj, jn + >>> from sympy.abc import z, n + >>> b = besselj(n, z) + + Differentiate it: + + >>> b.diff(z) + besselj(n - 1, z)/2 - besselj(n + 1, z)/2 + + Rewrite in terms of spherical Bessel functions: + + >>> b.rewrite(jn) + sqrt(2)*sqrt(z)*jn(n - 1/2, z)/sqrt(pi) + + Access the parameter and argument: + + >>> b.order + n + >>> b.argument + z + + See Also + ======== + + bessely, besseli, besselk + + References + ========== + + .. [1] Abramowitz, Milton; Stegun, Irene A., eds. (1965), "Chapter 9", + Handbook of Mathematical Functions with Formulas, Graphs, and + Mathematical Tables + .. [2] Luke, Y. L. (1969), The Special Functions and Their + Approximations, Volume 1 + .. [3] https://en.wikipedia.org/wiki/Bessel_function + .. [4] https://functions.wolfram.com/Bessel-TypeFunctions/BesselJ/ + + """ + + _a = S.One + _b = S.One + + @classmethod + def eval(cls, nu, z): + if z.is_zero: + if nu.is_zero: + return S.One + elif (nu.is_integer and nu.is_zero is False) or re(nu).is_positive: + return S.Zero + elif re(nu).is_negative and not (nu.is_integer is True): + return S.ComplexInfinity + elif nu.is_imaginary: + return S.NaN + if z in (S.Infinity, S.NegativeInfinity): + return S.Zero + + if z.could_extract_minus_sign(): + return (z)**nu*(-z)**(-nu)*besselj(nu, -z) + if nu.is_integer: + if nu.could_extract_minus_sign(): + return S.NegativeOne**(-nu)*besselj(-nu, z) + newz = z.extract_multiplicatively(I) + if newz: # NOTE we don't want to change the function if z==0 + return I**(nu)*besseli(nu, newz) + + # branch handling: + if nu.is_integer: + newz = unpolarify(z) + if newz != z: + return besselj(nu, newz) + else: + newz, n = z.extract_branch_factor() + if n != 0: + return exp(2*n*pi*nu*I)*besselj(nu, newz) + nnu = unpolarify(nu) + if nu != nnu: + return besselj(nnu, z) + + def _eval_rewrite_as_besseli(self, nu, z, **kwargs): + return exp(I*pi*nu/2)*besseli(nu, polar_lift(-I)*z) + + def _eval_rewrite_as_bessely(self, nu, z, **kwargs): + if nu.is_integer is False: + return csc(pi*nu)*bessely(-nu, z) - cot(pi*nu)*bessely(nu, z) + + def _eval_rewrite_as_jn(self, nu, z, **kwargs): + return sqrt(2*z/pi)*jn(nu - S.Half, self.argument) + + def _eval_as_leading_term(self, x, logx, cdir): + nu, z = self.args + try: + arg = z.as_leading_term(x) + except NotImplementedError: + return self + c, e = arg.as_coeff_exponent(x) + + if e.is_positive: + return arg**nu/(2**nu*gamma(nu + 1)) + elif e.is_negative: + cdir = 1 if cdir == 0 else cdir + sign = c*cdir**e + if not sign.is_negative: + # Refer Abramowitz and Stegun 1965, p. 364 for more information on + # asymptotic approximation of besselj function. + return sqrt(2)*cos(z - pi*(2*nu + 1)/4)/sqrt(pi*z) + return self + + return super(besselj, self)._eval_as_leading_term(x, logx=logx, cdir=cdir) + + def _eval_is_extended_real(self): + nu, z = self.args + if nu.is_integer and z.is_extended_real: + return True + + def _eval_nseries(self, x, n, logx, cdir=0): + # Refer https://functions.wolfram.com/Bessel-TypeFunctions/BesselJ/06/01/04/01/01/0003/ + # for more information on nseries expansion of besselj function. 
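+ # Editorial note (added here, not part of the original diff): the loop below builds the
+ # ascending series J_nu(z) = Sum_{k>=0} (-1)**k*(z/2)**(nu + 2*k)/(factorial(k)*gamma(nu + k + 1)),
+ # producing each term from the previous one via term *= -t/(k*(nu + k)) with t = (z/2)**2.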
+ from sympy.series.order import Order + nu, z = self.args + + # In case of powers less than 1, number of terms need to be computed + # separately to avoid repeated callings of _eval_nseries with wrong n + try: + _, exp = z.leadterm(x) + except (ValueError, NotImplementedError): + return self + + if exp.is_positive: + newn = ceiling(n/exp) + o = Order(x**n, x) + r = (z/2)._eval_nseries(x, n, logx, cdir).removeO() + if r is S.Zero: + return o + t = (_mexpand(r**2) + o).removeO() + + term = r**nu/gamma(nu + 1) + s = [term] + for k in range(1, (newn + 1)//2): + term *= -t/(k*(nu + k)) + term = (_mexpand(term) + o).removeO() + s.append(term) + return Add(*s) + o + + return super(besselj, self)._eval_nseries(x, n, logx, cdir) + + +class bessely(BesselBase): + r""" + Bessel function of the second kind. + + Explanation + =========== + + The Bessel $Y$ function of order $\nu$ is defined as + + .. math :: + Y_\nu(z) = \lim_{\mu \to \nu} \frac{J_\mu(z) \cos(\pi \mu) + - J_{-\mu}(z)}{\sin(\pi \mu)}, + + where $J_\mu(z)$ is the Bessel function of the first kind. + + It is a solution to Bessel's equation, and linearly independent from + $J_\nu$. + + Examples + ======== + + >>> from sympy import bessely, yn + >>> from sympy.abc import z, n + >>> b = bessely(n, z) + >>> b.diff(z) + bessely(n - 1, z)/2 - bessely(n + 1, z)/2 + >>> b.rewrite(yn) + sqrt(2)*sqrt(z)*yn(n - 1/2, z)/sqrt(pi) + + See Also + ======== + + besselj, besseli, besselk + + References + ========== + + .. [1] https://functions.wolfram.com/Bessel-TypeFunctions/BesselY/ + + """ + + _a = S.One + _b = S.One + + @classmethod + def eval(cls, nu, z): + if z.is_zero: + if nu.is_zero: + return S.NegativeInfinity + elif re(nu).is_zero is False: + return S.ComplexInfinity + elif re(nu).is_zero: + return S.NaN + if z in (S.Infinity, S.NegativeInfinity): + return S.Zero + if z == I*S.Infinity: + return exp(I*pi*(nu + 1)/2) * S.Infinity + if z == I*S.NegativeInfinity: + return exp(-I*pi*(nu + 1)/2) * S.Infinity + + if nu.is_integer: + if nu.could_extract_minus_sign(): + return S.NegativeOne**(-nu)*bessely(-nu, z) + + def _eval_rewrite_as_besselj(self, nu, z, **kwargs): + if nu.is_integer is False: + return csc(pi*nu)*(cos(pi*nu)*besselj(nu, z) - besselj(-nu, z)) + + def _eval_rewrite_as_besseli(self, nu, z, **kwargs): + aj = self._eval_rewrite_as_besselj(*self.args) + if aj: + return aj.rewrite(besseli) + + def _eval_rewrite_as_yn(self, nu, z, **kwargs): + return sqrt(2*z/pi) * yn(nu - S.Half, self.argument) + + def _eval_as_leading_term(self, x, logx, cdir): + nu, z = self.args + try: + arg = z.as_leading_term(x) + except NotImplementedError: + return self + c, e = arg.as_coeff_exponent(x) + + if e.is_positive: + term_one = ((2/pi)*log(z/2)*besselj(nu, z)) + term_two = -(z/2)**(-nu)*factorial(nu - 1)/pi if (nu).is_positive else S.Zero + term_three = -(z/2)**nu/(pi*factorial(nu))*(digamma(nu + 1) - S.EulerGamma) + arg = Add(*[term_one, term_two, term_three]).as_leading_term(x, logx=logx) + return arg + elif e.is_negative: + cdir = 1 if cdir == 0 else cdir + sign = c*cdir**e + if not sign.is_negative: + # Refer Abramowitz and Stegun 1965, p. 364 for more information on + # asymptotic approximation of bessely function. 
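+ # Editorial note (added): since -sin(pi*nu/2 - z + pi/4) == sin(z - pi*nu/2 - pi/4), the value
+ # returned below is the usual large-argument oscillatory form
+ # Y_nu(z) ~ sqrt(2/(pi*z))*sin(z - pi*nu/2 - pi/4) plus a 1/(8*z) correction term.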
+ return sqrt(2)*(-sin(pi*nu/2 - z + pi/4) + 3*cos(pi*nu/2 - z + pi/4)/(8*z))*sqrt(1/z)/sqrt(pi) + return self + + return super(bessely, self)._eval_as_leading_term(x, logx=logx, cdir=cdir) + + def _eval_is_extended_real(self): + nu, z = self.args + if nu.is_integer and z.is_positive: + return True + + def _eval_nseries(self, x, n, logx, cdir=0): + # Refer https://functions.wolfram.com/Bessel-TypeFunctions/BesselY/06/01/04/01/02/0008/ + # for more information on nseries expansion of bessely function. + from sympy.series.order import Order + nu, z = self.args + + # In case of powers less than 1, number of terms need to be computed + # separately to avoid repeated callings of _eval_nseries with wrong n + try: + _, exp = z.leadterm(x) + except (ValueError, NotImplementedError): + return self + + if exp.is_positive and nu.is_integer: + newn = ceiling(n/exp) + bn = besselj(nu, z) + a = ((2/pi)*log(z/2)*bn)._eval_nseries(x, n, logx, cdir) + + b, c = [], [] + o = Order(x**n, x) + r = (z/2)._eval_nseries(x, n, logx, cdir).removeO() + if r is S.Zero: + return o + t = (_mexpand(r**2) + o).removeO() + + if nu > S.Zero: + term = r**(-nu)*factorial(nu - 1)/pi + b.append(term) + for k in range(1, nu): + denom = (nu - k)*k + if denom == S.Zero: + term *= t/k + else: + term *= t/denom + term = (_mexpand(term) + o).removeO() + b.append(term) + + p = r**nu/(pi*factorial(nu)) + term = p*(digamma(nu + 1) - S.EulerGamma) + c.append(term) + for k in range(1, (newn + 1)//2): + p *= -t/(k*(k + nu)) + p = (_mexpand(p) + o).removeO() + term = p*(digamma(k + nu + 1) + digamma(k + 1)) + c.append(term) + return a - Add(*b) - Add(*c) # Order term comes from a + + return super(bessely, self)._eval_nseries(x, n, logx, cdir) + + + class besseli(BesselBase): + r""" + Modified Bessel function of the first kind. + + Explanation + =========== + + The Bessel $I$ function is a solution to the modified Bessel equation + + .. math :: + z^2 \frac{\mathrm{d}^2 w}{\mathrm{d}z^2} + + z \frac{\mathrm{d}w}{\mathrm{d}z} - (z^2 + \nu^2) w = 0. + + It can be defined as + + .. math :: + I_\nu(z) = i^{-\nu} J_\nu(iz), + + where $J_\nu(z)$ is the Bessel function of the first kind. + + Examples + ======== + + >>> from sympy import besseli + >>> from sympy.abc import z, n + >>> besseli(n, z).diff(z) + besseli(n - 1, z)/2 + besseli(n + 1, z)/2 + + See Also + ======== + + besselj, bessely, besselk + + References + ========== + + ..
[1] https://functions.wolfram.com/Bessel-TypeFunctions/BesselI/ + + """ + + _a = -S.One + _b = S.One + + @classmethod + def eval(cls, nu, z): + if z.is_zero: + if nu.is_zero: + return S.One + elif (nu.is_integer and nu.is_zero is False) or re(nu).is_positive: + return S.Zero + elif re(nu).is_negative and not (nu.is_integer is True): + return S.ComplexInfinity + elif nu.is_imaginary: + return S.NaN + if im(z) in (S.Infinity, S.NegativeInfinity): + return S.Zero + if z is S.Infinity: + return S.Infinity + if z is S.NegativeInfinity: + return (-1)**nu*S.Infinity + + if z.could_extract_minus_sign(): + return (z)**nu*(-z)**(-nu)*besseli(nu, -z) + if nu.is_integer: + if nu.could_extract_minus_sign(): + return besseli(-nu, z) + newz = z.extract_multiplicatively(I) + if newz: # NOTE we don't want to change the function if z==0 + return I**(-nu)*besselj(nu, -newz) + + # branch handling: + if nu.is_integer: + newz = unpolarify(z) + if newz != z: + return besseli(nu, newz) + else: + newz, n = z.extract_branch_factor() + if n != 0: + return exp(2*n*pi*nu*I)*besseli(nu, newz) + nnu = unpolarify(nu) + if nu != nnu: + return besseli(nnu, z) + + def _eval_rewrite_as_tractable(self, nu, z, limitvar=None, **kwargs): + if z.is_extended_real: + return exp(z)*_besseli(nu, z) + + def _eval_rewrite_as_besselj(self, nu, z, **kwargs): + return exp(-I*pi*nu/2)*besselj(nu, polar_lift(I)*z) + + def _eval_rewrite_as_bessely(self, nu, z, **kwargs): + aj = self._eval_rewrite_as_besselj(*self.args) + if aj: + return aj.rewrite(bessely) + + def _eval_rewrite_as_jn(self, nu, z, **kwargs): + return self._eval_rewrite_as_besselj(*self.args).rewrite(jn) + + def _eval_is_extended_real(self): + nu, z = self.args + if nu.is_integer and z.is_extended_real: + return True + + def _eval_as_leading_term(self, x, logx, cdir): + nu, z = self.args + try: + arg = z.as_leading_term(x) + except NotImplementedError: + return self + c, e = arg.as_coeff_exponent(x) + + if e.is_positive: + return arg**nu/(2**nu*gamma(nu + 1)) + elif e.is_negative: + cdir = 1 if cdir == 0 else cdir + sign = c*cdir**e + if not sign.is_negative: + # Refer Abramowitz and Stegun 1965, p. 377 for more information on + # asymptotic approximation of besseli function. + return exp(z)/sqrt(2*pi*z) + return self + + return super(besseli, self)._eval_as_leading_term(x, logx=logx, cdir=cdir) + + def _eval_nseries(self, x, n, logx, cdir=0): + # Refer https://functions.wolfram.com/Bessel-TypeFunctions/BesselI/06/01/04/01/01/0003/ + # for more information on nseries expansion of besseli function. 
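+ # Editorial note (added): same recurrence as in besselj._eval_nseries but without the alternating
+ # sign, since I_nu(z) = Sum_{k>=0} (z/2)**(nu + 2*k)/(factorial(k)*gamma(nu + k + 1)); successive
+ # terms therefore differ by the factor t/(k*(nu + k)) with t = (z/2)**2.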
+ from sympy.series.order import Order + nu, z = self.args + + # In case of powers less than 1, number of terms need to be computed + # separately to avoid repeated callings of _eval_nseries with wrong n + try: + _, exp = z.leadterm(x) + except (ValueError, NotImplementedError): + return self + + if exp.is_positive: + newn = ceiling(n/exp) + o = Order(x**n, x) + r = (z/2)._eval_nseries(x, n, logx, cdir).removeO() + if r is S.Zero: + return o + t = (_mexpand(r**2) + o).removeO() + + term = r**nu/gamma(nu + 1) + s = [term] + for k in range(1, (newn + 1)//2): + term *= t/(k*(nu + k)) + term = (_mexpand(term) + o).removeO() + s.append(term) + return Add(*s) + o + + return super(besseli, self)._eval_nseries(x, n, logx, cdir) + + def _eval_aseries(self, n, args0, x, logx): + from sympy.functions.combinatorial.factorials import RisingFactorial + from sympy.series.order import Order + point = args0[1] + + if point in [S.Infinity, S.NegativeInfinity]: + nu, z = self.args + s = [(RisingFactorial(Rational(2*nu - 1, 2), k)*RisingFactorial(Rational(2*nu + 1, 2), k))/\ + ((2)**(k)*z**(Rational(2*k + 1, 2))*factorial(k)) for k in range(n)] + [Order(1/z**(Rational(2*n + 1, 2)), x)] + return exp(z)/sqrt(2*pi) * (Add(*s)) + + return super()._eval_aseries(n, args0, x, logx) + + +class besselk(BesselBase): + r""" + Modified Bessel function of the second kind. + + Explanation + =========== + + The Bessel $K$ function of order $\nu$ is defined as + + .. math :: + K_\nu(z) = \lim_{\mu \to \nu} \frac{\pi}{2} + \frac{I_{-\mu}(z) -I_\mu(z)}{\sin(\pi \mu)}, + + where $I_\mu(z)$ is the modified Bessel function of the first kind. + + It is a solution of the modified Bessel equation, and linearly independent + from $Y_\nu$. + + Examples + ======== + + >>> from sympy import besselk + >>> from sympy.abc import z, n + >>> besselk(n, z).diff(z) + -besselk(n - 1, z)/2 - besselk(n + 1, z)/2 + + See Also + ======== + + besselj, besseli, bessely + + References + ========== + + .. [1] https://functions.wolfram.com/Bessel-TypeFunctions/BesselK/ + + """ + + _a = S.One + _b = -S.One + + @classmethod + def eval(cls, nu, z): + if z.is_zero: + if nu.is_zero: + return S.Infinity + elif re(nu).is_zero is False: + return S.ComplexInfinity + elif re(nu).is_zero: + return S.NaN + if z in (S.Infinity, I*S.Infinity, I*S.NegativeInfinity): + return S.Zero + + if nu.is_integer: + if nu.could_extract_minus_sign(): + return besselk(-nu, z) + + def _eval_rewrite_as_besseli(self, nu, z, **kwargs): + if nu.is_integer is False: + return pi*csc(pi*nu)*(besseli(-nu, z) - besseli(nu, z))/2 + + def _eval_rewrite_as_besselj(self, nu, z, **kwargs): + ai = self._eval_rewrite_as_besseli(*self.args) + if ai: + return ai.rewrite(besselj) + + def _eval_rewrite_as_bessely(self, nu, z, **kwargs): + aj = self._eval_rewrite_as_besselj(*self.args) + if aj: + return aj.rewrite(bessely) + + def _eval_rewrite_as_yn(self, nu, z, **kwargs): + ay = self._eval_rewrite_as_bessely(*self.args) + if ay: + return ay.rewrite(yn) + + def _eval_is_extended_real(self): + nu, z = self.args + if nu.is_integer and z.is_positive: + return True + + def _eval_rewrite_as_tractable(self, nu, z, limitvar=None, **kwargs): + if z.is_extended_real: + return exp(-z)*_besselk(nu, z) + + def _eval_as_leading_term(self, x, logx, cdir): + nu, z = self.args + try: + arg = z.as_leading_term(x) + except NotImplementedError: + return self + _, e = arg.as_coeff_exponent(x) + + if e.is_positive: + if nu.is_zero: + # Equation 9.6.8 of Abramowitz and Stegun (10th ed, 1972). 
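+ # Editorial note (added): that equation gives the small-argument limit
+ # K_0(z) ~ -log(z/2) - EulerGamma, written out below as -log(z) + log(2) - EulerGamma.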
+ term = -log(z) - S.EulerGamma + log(2) + elif nu.is_nonzero: + # Equation 9.6.9 of Abramowitz and Stegun (10th ed, 1972). + term = gamma(Abs(nu))*(z/2)**(-Abs(nu))/2 + else: + raise NotImplementedError(f"Cannot proceed without knowing if {nu} is zero or not.") + + return term.as_leading_term(x, logx=logx) + elif e.is_negative: + # Equation 9.7.2 of Abramowitz and Stegun (10th ed, 1972). + return sqrt(pi)*exp(-arg)/sqrt(2*arg) + else: + return self.func(nu, arg) + + def _eval_nseries(self, x, n, logx, cdir=0): + from sympy.series.order import Order + nu, z = self.args + + try: + _, exp = z.leadterm(x) + except (ValueError, NotImplementedError): + return self + + # In case of powers less than 1, number of terms need to be computed + # separately to avoid repeated callings of _eval_nseries with wrong n + if exp.is_positive: + r = (z/2)._eval_nseries(x, n, logx, cdir).removeO() + if r is S.Zero: + return Order(z**(-nu) + z**nu, x) + + o = Order(x**n, x) + if nu.is_integer: + # Reference: https://functions.wolfram.com/Bessel-TypeFunctions/BesselK/06/01/04/01/02/0008/ (only for integer order) + newn = ceiling(n/exp) + bn = besseli(nu, z) + a = ((-1)**(nu - 1)*log(z/2)*bn)._eval_nseries(x, n, logx, cdir) + + b, c = [], [] + t = _mexpand(r**2) + + if nu > S.Zero: + term = r**(-nu)*factorial(nu - 1)/2 + b.append(term) + for k in range(1, nu): + term *= t/((k - nu)*k) + term = (_mexpand(term) + o).removeO() + b.append(term) + + p = r**nu*(-1)**nu/(2*factorial(nu)) + term = p*(digamma(nu + 1) - S.EulerGamma) + c.append(term) + for k in range(1, (newn + 1)//2): + p *= t/(k*(k + nu)) + p = (_mexpand(p) + o).removeO() + term = p*(digamma(k + nu + 1) + digamma(k + 1)) + c.append(term) + return a + Add(*b) + Add(*c) + o + elif nu.is_noninteger: + # Reference: https://functions.wolfram.com/Bessel-TypeFunctions/BesselK/06/01/04/01/01/0003/ + # (only for non-integer order). + # While the expression in the reference above seems correct + # for non-real order as well, it would need some manipulation + # (not implemented) to be written as a power series in x with + # real exponents [e.g. Dunster 1990. "Bessel functions + # of purely imaginary order, with an application to second-order + # linear differential equations having a large parameter". + # SIAM J. Math. Anal. Vol 21, No. 4, pp 995-1018.]. + newn_a = ceiling((n+nu)/exp) + newn_b = ceiling((n-nu)/exp) + + a, b = [], [] + for k in range((newn_a+1)//2): + term = gamma(nu)*r**(2*k-nu)/(2*RisingFactorial(1-nu, k)*factorial(k)) + a.append(_mexpand(term)) + for k in range((newn_b+1)//2): + term = gamma(-nu)*r**(2*k+nu)/(2*RisingFactorial(nu+1, k)*factorial(k)) + b.append(_mexpand(term)) + return Add(*a) + Add(*b) + o + else: + raise NotImplementedError("besselk expansion is only implemented for real order") + + return super(besselk, self)._eval_nseries(x, n, logx, cdir) + + def _eval_aseries(self, n, args0, x, logx): + from sympy.functions.combinatorial.factorials import RisingFactorial + from sympy.series.order import Order + point = args0[1] + + if point in [S.Infinity, S.NegativeInfinity]: + nu, z = self.args + s = [(RisingFactorial(Rational(2*nu - 1, 2), k)*RisingFactorial(Rational(2*nu + 1, 2), k))/\ + ((-2)**(k)*z**(Rational(2*k + 1, 2))*factorial(k)) for k in range(n)] +[Order(1/z**(Rational(2*n + 1, 2)), x)] + return (exp(-z)*sqrt(pi/2))*Add(*s) + + return super()._eval_aseries(n, args0, x, logx) + + +class hankel1(BesselBase): + r""" + Hankel function of the first kind. + + Explanation + =========== + + This function is defined as + + .. 
math :: + H_\nu^{(1)} = J_\nu(z) + iY_\nu(z), + + where $J_\nu(z)$ is the Bessel function of the first kind, and + $Y_\nu(z)$ is the Bessel function of the second kind. + + It is a solution to Bessel's equation. + + Examples + ======== + + >>> from sympy import hankel1 + >>> from sympy.abc import z, n + >>> hankel1(n, z).diff(z) + hankel1(n - 1, z)/2 - hankel1(n + 1, z)/2 + + See Also + ======== + + hankel2, besselj, bessely + + References + ========== + + .. [1] https://functions.wolfram.com/Bessel-TypeFunctions/HankelH1/ + + """ + + _a = S.One + _b = S.One + + def _eval_conjugate(self): + z = self.argument + if z.is_extended_negative is False: + return hankel2(self.order.conjugate(), z.conjugate()) + + +class hankel2(BesselBase): + r""" + Hankel function of the second kind. + + Explanation + =========== + + This function is defined as + + .. math :: + H_\nu^{(2)} = J_\nu(z) - iY_\nu(z), + + where $J_\nu(z)$ is the Bessel function of the first kind, and + $Y_\nu(z)$ is the Bessel function of the second kind. + + It is a solution to Bessel's equation, and linearly independent from + $H_\nu^{(1)}$. + + Examples + ======== + + >>> from sympy import hankel2 + >>> from sympy.abc import z, n + >>> hankel2(n, z).diff(z) + hankel2(n - 1, z)/2 - hankel2(n + 1, z)/2 + + See Also + ======== + + hankel1, besselj, bessely + + References + ========== + + .. [1] https://functions.wolfram.com/Bessel-TypeFunctions/HankelH2/ + + """ + + _a = S.One + _b = S.One + + def _eval_conjugate(self): + z = self.argument + if z.is_extended_negative is False: + return hankel1(self.order.conjugate(), z.conjugate()) + + +def assume_integer_order(fn): + @wraps(fn) + def g(self, nu, z): + if nu.is_integer: + return fn(self, nu, z) + return g + + +class SphericalBesselBase(BesselBase): + """ + Base class for spherical Bessel functions. + + These are thin wrappers around ordinary Bessel functions, + since spherical Bessel functions differ from the ordinary + ones just by a slight change in order. + + To use this class, define the ``_eval_evalf()`` and ``_expand()`` methods. + + """ + + def _expand(self, **hints): + """ Expand self into a polynomial. Nu is guaranteed to be Integer. """ + raise NotImplementedError('expansion') + + def _eval_expand_func(self, **hints): + if self.order.is_Integer: + return self._expand(**hints) + return self + + def fdiff(self, argindex=2): + if argindex != 2: + raise ArgumentIndexError(self, argindex) + return self.__class__(self.order - 1, self.argument) - \ + self * (self.order + 1)/self.argument + + +def _jn(n, z): + return (spherical_bessel_fn(n, z)*sin(z) + + S.NegativeOne**(n + 1)*spherical_bessel_fn(-n - 1, z)*cos(z)) + + +def _yn(n, z): + # (-1)**(n + 1) * _jn(-n - 1, z) + return (S.NegativeOne**(n + 1) * spherical_bessel_fn(-n - 1, z)*sin(z) - + spherical_bessel_fn(n, z)*cos(z)) + + +class jn(SphericalBesselBase): + r""" + Spherical Bessel function of the first kind. + + Explanation + =========== + + This function is a solution to the spherical Bessel equation + + .. math :: + z^2 \frac{\mathrm{d}^2 w}{\mathrm{d}z^2} + + 2z \frac{\mathrm{d}w}{\mathrm{d}z} + (z^2 - \nu(\nu + 1)) w = 0. + + It can be defined as + + .. math :: + j_\nu(z) = \sqrt{\frac{\pi}{2z}} J_{\nu + \frac{1}{2}}(z), + + where $J_\nu(z)$ is the Bessel function of the first kind. + + The spherical Bessel functions of integral order are + calculated using the formula: + + .. 
math:: j_n(z) = f_n(z) \sin{z} + (-1)^{n+1} f_{-n-1}(z) \cos{z}, + + where the coefficients $f_n(z)$ are available as + :func:`sympy.polys.orthopolys.spherical_bessel_fn`. + + Examples + ======== + + >>> from sympy import Symbol, jn, sin, cos, expand_func, besselj, bessely + >>> z = Symbol("z") + >>> nu = Symbol("nu", integer=True) + >>> print(expand_func(jn(0, z))) + sin(z)/z + >>> expand_func(jn(1, z)) == sin(z)/z**2 - cos(z)/z + True + >>> expand_func(jn(3, z)) + (-6/z**2 + 15/z**4)*sin(z) + (1/z - 15/z**3)*cos(z) + >>> jn(nu, z).rewrite(besselj) + sqrt(2)*sqrt(pi)*sqrt(1/z)*besselj(nu + 1/2, z)/2 + >>> jn(nu, z).rewrite(bessely) + (-1)**nu*sqrt(2)*sqrt(pi)*sqrt(1/z)*bessely(-nu - 1/2, z)/2 + >>> jn(2, 5.2+0.3j).evalf(20) + 0.099419756723640344491 - 0.054525080242173562897*I + + See Also + ======== + + besselj, bessely, besselk, yn + + References + ========== + + .. [1] https://dlmf.nist.gov/10.47 + + """ + @classmethod + def eval(cls, nu, z): + if z.is_zero: + if nu.is_zero: + return S.One + elif nu.is_integer: + if nu.is_positive: + return S.Zero + else: + return S.ComplexInfinity + if z in (S.NegativeInfinity, S.Infinity): + return S.Zero + + def _eval_rewrite_as_besselj(self, nu, z, **kwargs): + return sqrt(pi/(2*z)) * besselj(nu + S.Half, z) + + def _eval_rewrite_as_bessely(self, nu, z, **kwargs): + return S.NegativeOne**nu * sqrt(pi/(2*z)) * bessely(-nu - S.Half, z) + + def _eval_rewrite_as_yn(self, nu, z, **kwargs): + return S.NegativeOne**(nu) * yn(-nu - 1, z) + + def _expand(self, **hints): + return _jn(self.order, self.argument) + + def _eval_evalf(self, prec): + if self.order.is_Integer: + return self.rewrite(besselj)._eval_evalf(prec) + + +class yn(SphericalBesselBase): + r""" + Spherical Bessel function of the second kind. + + Explanation + =========== + + This function is another solution to the spherical Bessel equation, and + linearly independent from $j_n$. It can be defined as + + .. math :: + y_\nu(z) = \sqrt{\frac{\pi}{2z}} Y_{\nu + \frac{1}{2}}(z), + + where $Y_\nu(z)$ is the Bessel function of the second kind. + + For integral orders $n$, $y_n$ is calculated using the formula: + + .. math:: y_n(z) = (-1)^{n+1} j_{-n-1}(z) + + Examples + ======== + + >>> from sympy import Symbol, yn, sin, cos, expand_func, besselj, bessely + >>> z = Symbol("z") + >>> nu = Symbol("nu", integer=True) + >>> print(expand_func(yn(0, z))) + -cos(z)/z + >>> expand_func(yn(1, z)) == -cos(z)/z**2-sin(z)/z + True + >>> yn(nu, z).rewrite(besselj) + (-1)**(nu + 1)*sqrt(2)*sqrt(pi)*sqrt(1/z)*besselj(-nu - 1/2, z)/2 + >>> yn(nu, z).rewrite(bessely) + sqrt(2)*sqrt(pi)*sqrt(1/z)*bessely(nu + 1/2, z)/2 + >>> yn(2, 5.2+0.3j).evalf(20) + 0.18525034196069722536 + 0.014895573969924817587*I + + See Also + ======== + + besselj, bessely, besselk, jn + + References + ========== + + .. 
[1] https://dlmf.nist.gov/10.47 + + """ + @assume_integer_order + def _eval_rewrite_as_besselj(self, nu, z, **kwargs): + return S.NegativeOne**(nu+1) * sqrt(pi/(2*z)) * besselj(-nu - S.Half, z) + + @assume_integer_order + def _eval_rewrite_as_bessely(self, nu, z, **kwargs): + return sqrt(pi/(2*z)) * bessely(nu + S.Half, z) + + def _eval_rewrite_as_jn(self, nu, z, **kwargs): + return S.NegativeOne**(nu + 1) * jn(-nu - 1, z) + + def _expand(self, **hints): + return _yn(self.order, self.argument) + + def _eval_evalf(self, prec): + if self.order.is_Integer: + return self.rewrite(bessely)._eval_evalf(prec) + + +class SphericalHankelBase(SphericalBesselBase): + + @assume_integer_order + def _eval_rewrite_as_besselj(self, nu, z, **kwargs): + # jn +- I*yn + # jn as beeselj: sqrt(pi/(2*z)) * besselj(nu + S.Half, z) + # yn as besselj: (-1)**(nu+1) * sqrt(pi/(2*z)) * besselj(-nu - S.Half, z) + hks = self._hankel_kind_sign + return sqrt(pi/(2*z))*(besselj(nu + S.Half, z) + + hks*I*S.NegativeOne**(nu+1)*besselj(-nu - S.Half, z)) + + @assume_integer_order + def _eval_rewrite_as_bessely(self, nu, z, **kwargs): + # jn +- I*yn + # jn as bessely: (-1)**nu * sqrt(pi/(2*z)) * bessely(-nu - S.Half, z) + # yn as bessely: sqrt(pi/(2*z)) * bessely(nu + S.Half, z) + hks = self._hankel_kind_sign + return sqrt(pi/(2*z))*(S.NegativeOne**nu*bessely(-nu - S.Half, z) + + hks*I*bessely(nu + S.Half, z)) + + def _eval_rewrite_as_yn(self, nu, z, **kwargs): + hks = self._hankel_kind_sign + return jn(nu, z).rewrite(yn) + hks*I*yn(nu, z) + + def _eval_rewrite_as_jn(self, nu, z, **kwargs): + hks = self._hankel_kind_sign + return jn(nu, z) + hks*I*yn(nu, z).rewrite(jn) + + def _eval_expand_func(self, **hints): + if self.order.is_Integer: + return self._expand(**hints) + else: + nu = self.order + z = self.argument + hks = self._hankel_kind_sign + return jn(nu, z) + hks*I*yn(nu, z) + + def _expand(self, **hints): + n = self.order + z = self.argument + hks = self._hankel_kind_sign + + # fully expanded version + # return ((fn(n, z) * sin(z) + + # (-1)**(n + 1) * fn(-n - 1, z) * cos(z)) + # jn + # (hks * I * (-1)**(n + 1) * + # (fn(-n - 1, z) * hk * I * sin(z) + + # (-1)**(-n) * fn(n, z) * I * cos(z))) # +-I*yn + # ) + + return (_jn(n, z) + hks*I*_yn(n, z)).expand() + + def _eval_evalf(self, prec): + if self.order.is_Integer: + return self.rewrite(besselj)._eval_evalf(prec) + + +class hn1(SphericalHankelBase): + r""" + Spherical Hankel function of the first kind. + + Explanation + =========== + + This function is defined as + + .. math:: h_\nu^(1)(z) = j_\nu(z) + i y_\nu(z), + + where $j_\nu(z)$ and $y_\nu(z)$ are the spherical + Bessel function of the first and second kinds. + + For integral orders $n$, $h_n^(1)$ is calculated using the formula: + + .. math:: h_n^(1)(z) = j_{n}(z) + i (-1)^{n+1} j_{-n-1}(z) + + Examples + ======== + + >>> from sympy import Symbol, hn1, hankel1, expand_func, yn, jn + >>> z = Symbol("z") + >>> nu = Symbol("nu", integer=True) + >>> print(expand_func(hn1(nu, z))) + jn(nu, z) + I*yn(nu, z) + >>> print(expand_func(hn1(0, z))) + sin(z)/z - I*cos(z)/z + >>> print(expand_func(hn1(1, z))) + -I*sin(z)/z - cos(z)/z + sin(z)/z**2 - I*cos(z)/z**2 + >>> hn1(nu, z).rewrite(jn) + (-1)**(nu + 1)*I*jn(-nu - 1, z) + jn(nu, z) + >>> hn1(nu, z).rewrite(yn) + (-1)**nu*yn(-nu - 1, z) + I*yn(nu, z) + >>> hn1(nu, z).rewrite(hankel1) + sqrt(2)*sqrt(pi)*sqrt(1/z)*hankel1(nu, z)/2 + + See Also + ======== + + hn2, jn, yn, hankel1, hankel2 + + References + ========== + + .. 
[1] https://dlmf.nist.gov/10.47 + + """ + + _hankel_kind_sign = S.One + + @assume_integer_order + def _eval_rewrite_as_hankel1(self, nu, z, **kwargs): + return sqrt(pi/(2*z))*hankel1(nu, z) + + +class hn2(SphericalHankelBase): + r""" + Spherical Hankel function of the second kind. + + Explanation + =========== + + This function is defined as + + .. math:: h_\nu^(2)(z) = j_\nu(z) - i y_\nu(z), + + where $j_\nu(z)$ and $y_\nu(z)$ are the spherical + Bessel function of the first and second kinds. + + For integral orders $n$, $h_n^(2)$ is calculated using the formula: + + .. math:: h_n^(2)(z) = j_{n} - i (-1)^{n+1} j_{-n-1}(z) + + Examples + ======== + + >>> from sympy import Symbol, hn2, hankel2, expand_func, jn, yn + >>> z = Symbol("z") + >>> nu = Symbol("nu", integer=True) + >>> print(expand_func(hn2(nu, z))) + jn(nu, z) - I*yn(nu, z) + >>> print(expand_func(hn2(0, z))) + sin(z)/z + I*cos(z)/z + >>> print(expand_func(hn2(1, z))) + I*sin(z)/z - cos(z)/z + sin(z)/z**2 + I*cos(z)/z**2 + >>> hn2(nu, z).rewrite(hankel2) + sqrt(2)*sqrt(pi)*sqrt(1/z)*hankel2(nu, z)/2 + >>> hn2(nu, z).rewrite(jn) + -(-1)**(nu + 1)*I*jn(-nu - 1, z) + jn(nu, z) + >>> hn2(nu, z).rewrite(yn) + (-1)**nu*yn(-nu - 1, z) - I*yn(nu, z) + + See Also + ======== + + hn1, jn, yn, hankel1, hankel2 + + References + ========== + + .. [1] https://dlmf.nist.gov/10.47 + + """ + + _hankel_kind_sign = -S.One + + @assume_integer_order + def _eval_rewrite_as_hankel2(self, nu, z, **kwargs): + return sqrt(pi/(2*z))*hankel2(nu, z) + + +def jn_zeros(n, k, method="sympy", dps=15): + """ + Zeros of the spherical Bessel function of the first kind. + + Explanation + =========== + + This returns an array of zeros of $jn$ up to the $k$-th zero. + + * method = "sympy": uses `mpmath.besseljzero + `_ + * method = "scipy": uses the + `SciPy's sph_jn `_ + and + `newton `_ + to find all + roots, which is faster than computing the zeros using a general + numerical solver, but it requires SciPy and only works with low + precision floating point numbers. (The function used with + method="sympy" is a recent addition to mpmath; before that a general + solver was used.) 
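+
+    The ``dps`` argument sets the decimal precision of the returned zeros
+    when ``method="sympy"`` is used.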
+ + Examples + ======== + + >>> from sympy import jn_zeros + >>> jn_zeros(2, 4, dps=5) + [5.7635, 9.095, 12.323, 15.515] + + See Also + ======== + + jn, yn, besselj, besselk, bessely + + Parameters + ========== + + n : integer + order of Bessel function + + k : integer + number of zeros to return + + + """ + from math import pi as math_pi + + if method == "sympy": + from mpmath import besseljzero + from mpmath.libmp.libmpf import dps_to_prec + prec = dps_to_prec(dps) + return [Expr._from_mpmath(besseljzero(S(n + 0.5)._to_mpmath(prec), + int(l)), prec) + for l in range(1, k + 1)] + elif method == "scipy": + from scipy.optimize import newton + try: + from scipy.special import spherical_jn + f = lambda x: spherical_jn(n, x) + except ImportError: + from scipy.special import sph_jn + f = lambda x: sph_jn(n, x)[0][-1] + else: + raise NotImplementedError("Unknown method.") + + def solver(f, x): + if method == "scipy": + root = newton(f, x) + else: + raise NotImplementedError("Unknown method.") + return root + + # we need to approximate the position of the first root: + root = n + math_pi + # determine the first root exactly: + root = solver(f, root) + roots = [root] + for i in range(k - 1): + # estimate the position of the next root using the last root + pi: + root = solver(f, root + math_pi) + roots.append(root) + return roots + + +class AiryBase(DefinedFunction): + """ + Abstract base class for Airy functions. + + This class is meant to reduce code duplication. + + """ + + def _eval_conjugate(self): + return self.func(self.args[0].conjugate()) + + def _eval_is_extended_real(self): + return self.args[0].is_extended_real + + def as_real_imag(self, deep=True, **hints): + z = self.args[0] + zc = z.conjugate() + f = self.func + u = (f(z)+f(zc))/2 + v = I*(f(zc)-f(z))/2 + return u, v + + def _eval_expand_complex(self, deep=True, **hints): + re_part, im_part = self.as_real_imag(deep=deep, **hints) + return re_part + im_part*I + + +class airyai(AiryBase): + r""" + The Airy function $\operatorname{Ai}$ of the first kind. + + Explanation + =========== + + The Airy function $\operatorname{Ai}(z)$ is defined to be the function + satisfying Airy's differential equation + + .. math:: + \frac{\mathrm{d}^2 w(z)}{\mathrm{d}z^2} - z w(z) = 0. + + Equivalently, for real $z$ + + .. math:: + \operatorname{Ai}(z) := \frac{1}{\pi} + \int_0^\infty \cos\left(\frac{t^3}{3} + z t\right) \mathrm{d}t. 
+ + Examples + ======== + + Create an Airy function object: + + >>> from sympy import airyai + >>> from sympy.abc import z + + >>> airyai(z) + airyai(z) + + Several special values are known: + + >>> airyai(0) + 3**(1/3)/(3*gamma(2/3)) + >>> from sympy import oo + >>> airyai(oo) + 0 + >>> airyai(-oo) + 0 + + The Airy function obeys the mirror symmetry: + + >>> from sympy import conjugate + >>> conjugate(airyai(z)) + airyai(conjugate(z)) + + Differentiation with respect to $z$ is supported: + + >>> from sympy import diff + >>> diff(airyai(z), z) + airyaiprime(z) + >>> diff(airyai(z), z, 2) + z*airyai(z) + + Series expansion is also supported: + + >>> from sympy import series + >>> series(airyai(z), z, 0, 3) + 3**(5/6)*gamma(1/3)/(6*pi) - 3**(1/6)*z*gamma(2/3)/(2*pi) + O(z**3) + + We can numerically evaluate the Airy function to arbitrary precision + on the whole complex plane: + + >>> airyai(-2).evalf(50) + 0.22740742820168557599192443603787379946077222541710 + + Rewrite $\operatorname{Ai}(z)$ in terms of hypergeometric functions: + + >>> from sympy import hyper + >>> airyai(z).rewrite(hyper) + -3**(2/3)*z*hyper((), (4/3,), z**3/9)/(3*gamma(1/3)) + 3**(1/3)*hyper((), (2/3,), z**3/9)/(3*gamma(2/3)) + + See Also + ======== + + airybi: Airy function of the second kind. + airyaiprime: Derivative of the Airy function of the first kind. + airybiprime: Derivative of the Airy function of the second kind. + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Airy_function + .. [2] https://dlmf.nist.gov/9 + .. [3] https://encyclopediaofmath.org/wiki/Airy_functions + .. [4] https://mathworld.wolfram.com/AiryFunctions.html + + """ + + nargs = 1 + unbranched = True + + @classmethod + def eval(cls, arg): + if arg.is_Number: + if arg is S.NaN: + return S.NaN + elif arg is S.Infinity: + return S.Zero + elif arg is S.NegativeInfinity: + return S.Zero + elif arg.is_zero: + return S.One / (3**Rational(2, 3) * gamma(Rational(2, 3))) + if arg.is_zero: + return S.One / (3**Rational(2, 3) * gamma(Rational(2, 3))) + + def fdiff(self, argindex=1): + if argindex == 1: + return airyaiprime(self.args[0]) + else: + raise ArgumentIndexError(self, argindex) + + @staticmethod + @cacheit + def taylor_term(n, x, *previous_terms): + if n < 0: + return S.Zero + else: + x = sympify(x) + if len(previous_terms) > 1: + p = previous_terms[-1] + return ((cbrt(3)*x)**(-n)*(cbrt(3)*x)**(n + 1)*sin(pi*(n*Rational(2, 3) + Rational(4, 3)))*factorial(n) * + gamma(n/3 + Rational(2, 3))/(sin(pi*(n*Rational(2, 3) + Rational(2, 3)))*factorial(n + 1)*gamma(n/3 + Rational(1, 3))) * p) + else: + return (S.One/(3**Rational(2, 3)*pi) * gamma((n+S.One)/S(3)) * sin(Rational(2, 3)*pi*(n+S.One)) / + factorial(n) * (cbrt(3)*x)**n) + + def _eval_rewrite_as_besselj(self, z, **kwargs): + ot = Rational(1, 3) + tt = Rational(2, 3) + a = Pow(-z, Rational(3, 2)) + if re(z).is_negative: + return ot*sqrt(-z) * (besselj(-ot, tt*a) + besselj(ot, tt*a)) + + def _eval_rewrite_as_besseli(self, z, **kwargs): + ot = Rational(1, 3) + tt = Rational(2, 3) + a = Pow(z, Rational(3, 2)) + if re(z).is_positive: + return ot*sqrt(z) * (besseli(-ot, tt*a) - besseli(ot, tt*a)) + else: + return ot*(Pow(a, ot)*besseli(-ot, tt*a) - z*Pow(a, -ot)*besseli(ot, tt*a)) + + def _eval_rewrite_as_hyper(self, z, **kwargs): + pf1 = S.One / (3**Rational(2, 3)*gamma(Rational(2, 3))) + pf2 = z / (root(3, 3)*gamma(Rational(1, 3))) + return pf1 * hyper([], [Rational(2, 3)], z**3/9) - pf2 * hyper([], [Rational(4, 3)], z**3/9) + + def _eval_expand_func(self, **hints): + arg = 
self.args[0] + symbs = arg.free_symbols + + if len(symbs) == 1: + z = symbs.pop() + c = Wild("c", exclude=[z]) + d = Wild("d", exclude=[z]) + m = Wild("m", exclude=[z]) + n = Wild("n", exclude=[z]) + M = arg.match(c*(d*z**n)**m) + if M is not None: + m = M[m] + # The transformation is given by 03.05.16.0001.01 + # https://functions.wolfram.com/Bessel-TypeFunctions/AiryAi/16/01/01/0001/ + if (3*m).is_integer: + c = M[c] + d = M[d] + n = M[n] + pf = (d * z**n)**m / (d**m * z**(m*n)) + newarg = c * d**m * z**(m*n) + return S.Half * ((pf + S.One)*airyai(newarg) - (pf - S.One)/sqrt(3)*airybi(newarg)) + + +class airybi(AiryBase): + r""" + The Airy function $\operatorname{Bi}$ of the second kind. + + Explanation + =========== + + The Airy function $\operatorname{Bi}(z)$ is defined to be the function + satisfying Airy's differential equation + + .. math:: + \frac{\mathrm{d}^2 w(z)}{\mathrm{d}z^2} - z w(z) = 0. + + Equivalently, for real $z$ + + .. math:: + \operatorname{Bi}(z) := \frac{1}{\pi} + \int_0^\infty + \exp\left(-\frac{t^3}{3} + z t\right) + + \sin\left(\frac{t^3}{3} + z t\right) \mathrm{d}t. + + Examples + ======== + + Create an Airy function object: + + >>> from sympy import airybi + >>> from sympy.abc import z + + >>> airybi(z) + airybi(z) + + Several special values are known: + + >>> airybi(0) + 3**(5/6)/(3*gamma(2/3)) + >>> from sympy import oo + >>> airybi(oo) + oo + >>> airybi(-oo) + 0 + + The Airy function obeys the mirror symmetry: + + >>> from sympy import conjugate + >>> conjugate(airybi(z)) + airybi(conjugate(z)) + + Differentiation with respect to $z$ is supported: + + >>> from sympy import diff + >>> diff(airybi(z), z) + airybiprime(z) + >>> diff(airybi(z), z, 2) + z*airybi(z) + + Series expansion is also supported: + + >>> from sympy import series + >>> series(airybi(z), z, 0, 3) + 3**(1/3)*gamma(1/3)/(2*pi) + 3**(2/3)*z*gamma(2/3)/(2*pi) + O(z**3) + + We can numerically evaluate the Airy function to arbitrary precision + on the whole complex plane: + + >>> airybi(-2).evalf(50) + -0.41230258795639848808323405461146104203453483447240 + + Rewrite $\operatorname{Bi}(z)$ in terms of hypergeometric functions: + + >>> from sympy import hyper + >>> airybi(z).rewrite(hyper) + 3**(1/6)*z*hyper((), (4/3,), z**3/9)/gamma(1/3) + 3**(5/6)*hyper((), (2/3,), z**3/9)/(3*gamma(2/3)) + + See Also + ======== + + airyai: Airy function of the first kind. + airyaiprime: Derivative of the Airy function of the first kind. + airybiprime: Derivative of the Airy function of the second kind. + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Airy_function + .. [2] https://dlmf.nist.gov/9 + .. [3] https://encyclopediaofmath.org/wiki/Airy_functions + .. 
[4] https://mathworld.wolfram.com/AiryFunctions.html + + """ + + nargs = 1 + unbranched = True + + @classmethod + def eval(cls, arg): + if arg.is_Number: + if arg is S.NaN: + return S.NaN + elif arg is S.Infinity: + return S.Infinity + elif arg is S.NegativeInfinity: + return S.Zero + elif arg.is_zero: + return S.One / (3**Rational(1, 6) * gamma(Rational(2, 3))) + + if arg.is_zero: + return S.One / (3**Rational(1, 6) * gamma(Rational(2, 3))) + + def fdiff(self, argindex=1): + if argindex == 1: + return airybiprime(self.args[0]) + else: + raise ArgumentIndexError(self, argindex) + + @staticmethod + @cacheit + def taylor_term(n, x, *previous_terms): + if n < 0: + return S.Zero + else: + x = sympify(x) + if len(previous_terms) > 1: + p = previous_terms[-1] + return (cbrt(3)*x * Abs(sin(Rational(2, 3)*pi*(n + S.One))) * factorial((n - S.One)/S(3)) / + ((n + S.One) * Abs(cos(Rational(2, 3)*pi*(n + S.Half))) * factorial((n - 2)/S(3))) * p) + else: + return (S.One/(root(3, 6)*pi) * gamma((n + S.One)/S(3)) * Abs(sin(Rational(2, 3)*pi*(n + S.One))) / + factorial(n) * (cbrt(3)*x)**n) + + def _eval_rewrite_as_besselj(self, z, **kwargs): + ot = Rational(1, 3) + tt = Rational(2, 3) + a = Pow(-z, Rational(3, 2)) + if re(z).is_negative: + return sqrt(-z/3) * (besselj(-ot, tt*a) - besselj(ot, tt*a)) + + def _eval_rewrite_as_besseli(self, z, **kwargs): + ot = Rational(1, 3) + tt = Rational(2, 3) + a = Pow(z, Rational(3, 2)) + if re(z).is_positive: + return sqrt(z)/sqrt(3) * (besseli(-ot, tt*a) + besseli(ot, tt*a)) + else: + b = Pow(a, ot) + c = Pow(a, -ot) + return sqrt(ot)*(b*besseli(-ot, tt*a) + z*c*besseli(ot, tt*a)) + + def _eval_rewrite_as_hyper(self, z, **kwargs): + pf1 = S.One / (root(3, 6)*gamma(Rational(2, 3))) + pf2 = z*root(3, 6) / gamma(Rational(1, 3)) + return pf1 * hyper([], [Rational(2, 3)], z**3/9) + pf2 * hyper([], [Rational(4, 3)], z**3/9) + + def _eval_expand_func(self, **hints): + arg = self.args[0] + symbs = arg.free_symbols + + if len(symbs) == 1: + z = symbs.pop() + c = Wild("c", exclude=[z]) + d = Wild("d", exclude=[z]) + m = Wild("m", exclude=[z]) + n = Wild("n", exclude=[z]) + M = arg.match(c*(d*z**n)**m) + if M is not None: + m = M[m] + # The transformation is given by 03.06.16.0001.01 + # https://functions.wolfram.com/Bessel-TypeFunctions/AiryBi/16/01/01/0001/ + if (3*m).is_integer: + c = M[c] + d = M[d] + n = M[n] + pf = (d * z**n)**m / (d**m * z**(m*n)) + newarg = c * d**m * z**(m*n) + return S.Half * (sqrt(3)*(S.One - pf)*airyai(newarg) + (S.One + pf)*airybi(newarg)) + + +class airyaiprime(AiryBase): + r""" + The derivative $\operatorname{Ai}^\prime$ of the Airy function of the first + kind. + + Explanation + =========== + + The Airy function $\operatorname{Ai}^\prime(z)$ is defined to be the + function + + .. math:: + \operatorname{Ai}^\prime(z) := \frac{\mathrm{d} \operatorname{Ai}(z)}{\mathrm{d} z}. 
+ + Examples + ======== + + Create an Airy function object: + + >>> from sympy import airyaiprime + >>> from sympy.abc import z + + >>> airyaiprime(z) + airyaiprime(z) + + Several special values are known: + + >>> airyaiprime(0) + -3**(2/3)/(3*gamma(1/3)) + >>> from sympy import oo + >>> airyaiprime(oo) + 0 + + The Airy function obeys the mirror symmetry: + + >>> from sympy import conjugate + >>> conjugate(airyaiprime(z)) + airyaiprime(conjugate(z)) + + Differentiation with respect to $z$ is supported: + + >>> from sympy import diff + >>> diff(airyaiprime(z), z) + z*airyai(z) + >>> diff(airyaiprime(z), z, 2) + z*airyaiprime(z) + airyai(z) + + Series expansion is also supported: + + >>> from sympy import series + >>> series(airyaiprime(z), z, 0, 3) + -3**(2/3)/(3*gamma(1/3)) + 3**(1/3)*z**2/(6*gamma(2/3)) + O(z**3) + + We can numerically evaluate the Airy function to arbitrary precision + on the whole complex plane: + + >>> airyaiprime(-2).evalf(50) + 0.61825902074169104140626429133247528291577794512415 + + Rewrite $\operatorname{Ai}^\prime(z)$ in terms of hypergeometric functions: + + >>> from sympy import hyper + >>> airyaiprime(z).rewrite(hyper) + 3**(1/3)*z**2*hyper((), (5/3,), z**3/9)/(6*gamma(2/3)) - 3**(2/3)*hyper((), (1/3,), z**3/9)/(3*gamma(1/3)) + + See Also + ======== + + airyai: Airy function of the first kind. + airybi: Airy function of the second kind. + airybiprime: Derivative of the Airy function of the second kind. + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Airy_function + .. [2] https://dlmf.nist.gov/9 + .. [3] https://encyclopediaofmath.org/wiki/Airy_functions + .. [4] https://mathworld.wolfram.com/AiryFunctions.html + + """ + + nargs = 1 + unbranched = True + + @classmethod + def eval(cls, arg): + if arg.is_Number: + if arg is S.NaN: + return S.NaN + elif arg is S.Infinity: + return S.Zero + + if arg.is_zero: + return S.NegativeOne / (3**Rational(1, 3) * gamma(Rational(1, 3))) + + def fdiff(self, argindex=1): + if argindex == 1: + return self.args[0]*airyai(self.args[0]) + else: + raise ArgumentIndexError(self, argindex) + + def _eval_evalf(self, prec): + z = self.args[0]._to_mpmath(prec) + with workprec(prec): + res = mp.airyai(z, derivative=1) + return Expr._from_mpmath(res, prec) + + def _eval_rewrite_as_besselj(self, z, **kwargs): + tt = Rational(2, 3) + a = Pow(-z, Rational(3, 2)) + if re(z).is_negative: + return z/3 * (besselj(-tt, tt*a) - besselj(tt, tt*a)) + + def _eval_rewrite_as_besseli(self, z, **kwargs): + ot = Rational(1, 3) + tt = Rational(2, 3) + a = tt * Pow(z, Rational(3, 2)) + if re(z).is_positive: + return z/3 * (besseli(tt, a) - besseli(-tt, a)) + else: + a = Pow(z, Rational(3, 2)) + b = Pow(a, tt) + c = Pow(a, -tt) + return ot * (z**2*c*besseli(tt, tt*a) - b*besseli(-ot, tt*a)) + + def _eval_rewrite_as_hyper(self, z, **kwargs): + pf1 = z**2 / (2*3**Rational(2, 3)*gamma(Rational(2, 3))) + pf2 = 1 / (root(3, 3)*gamma(Rational(1, 3))) + return pf1 * hyper([], [Rational(5, 3)], z**3/9) - pf2 * hyper([], [Rational(1, 3)], z**3/9) + + def _eval_expand_func(self, **hints): + arg = self.args[0] + symbs = arg.free_symbols + + if len(symbs) == 1: + z = symbs.pop() + c = Wild("c", exclude=[z]) + d = Wild("d", exclude=[z]) + m = Wild("m", exclude=[z]) + n = Wild("n", exclude=[z]) + M = arg.match(c*(d*z**n)**m) + if M is not None: + m = M[m] + # The transformation is in principle + # given by 03.07.16.0001.01 but note + # that there is an error in this formula. 
+ # https://functions.wolfram.com/Bessel-TypeFunctions/AiryAiPrime/16/01/01/0001/ + if (3*m).is_integer: + c = M[c] + d = M[d] + n = M[n] + pf = (d**m * z**(n*m)) / (d * z**n)**m + newarg = c * d**m * z**(n*m) + return S.Half * ((pf + S.One)*airyaiprime(newarg) + (pf - S.One)/sqrt(3)*airybiprime(newarg)) + + +class airybiprime(AiryBase): + r""" + The derivative $\operatorname{Bi}^\prime$ of the Airy function of the first + kind. + + Explanation + =========== + + The Airy function $\operatorname{Bi}^\prime(z)$ is defined to be the + function + + .. math:: + \operatorname{Bi}^\prime(z) := \frac{\mathrm{d} \operatorname{Bi}(z)}{\mathrm{d} z}. + + Examples + ======== + + Create an Airy function object: + + >>> from sympy import airybiprime + >>> from sympy.abc import z + + >>> airybiprime(z) + airybiprime(z) + + Several special values are known: + + >>> airybiprime(0) + 3**(1/6)/gamma(1/3) + >>> from sympy import oo + >>> airybiprime(oo) + oo + >>> airybiprime(-oo) + 0 + + The Airy function obeys the mirror symmetry: + + >>> from sympy import conjugate + >>> conjugate(airybiprime(z)) + airybiprime(conjugate(z)) + + Differentiation with respect to $z$ is supported: + + >>> from sympy import diff + >>> diff(airybiprime(z), z) + z*airybi(z) + >>> diff(airybiprime(z), z, 2) + z*airybiprime(z) + airybi(z) + + Series expansion is also supported: + + >>> from sympy import series + >>> series(airybiprime(z), z, 0, 3) + 3**(1/6)/gamma(1/3) + 3**(5/6)*z**2/(6*gamma(2/3)) + O(z**3) + + We can numerically evaluate the Airy function to arbitrary precision + on the whole complex plane: + + >>> airybiprime(-2).evalf(50) + 0.27879516692116952268509756941098324140300059345163 + + Rewrite $\operatorname{Bi}^\prime(z)$ in terms of hypergeometric functions: + + >>> from sympy import hyper + >>> airybiprime(z).rewrite(hyper) + 3**(5/6)*z**2*hyper((), (5/3,), z**3/9)/(6*gamma(2/3)) + 3**(1/6)*hyper((), (1/3,), z**3/9)/gamma(1/3) + + See Also + ======== + + airyai: Airy function of the first kind. + airybi: Airy function of the second kind. + airyaiprime: Derivative of the Airy function of the first kind. + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Airy_function + .. [2] https://dlmf.nist.gov/9 + .. [3] https://encyclopediaofmath.org/wiki/Airy_functions + .. 
[4] https://mathworld.wolfram.com/AiryFunctions.html + + """ + + nargs = 1 + unbranched = True + + @classmethod + def eval(cls, arg): + if arg.is_Number: + if arg is S.NaN: + return S.NaN + elif arg is S.Infinity: + return S.Infinity + elif arg is S.NegativeInfinity: + return S.Zero + elif arg.is_zero: + return 3**Rational(1, 6) / gamma(Rational(1, 3)) + + if arg.is_zero: + return 3**Rational(1, 6) / gamma(Rational(1, 3)) + + + def fdiff(self, argindex=1): + if argindex == 1: + return self.args[0]*airybi(self.args[0]) + else: + raise ArgumentIndexError(self, argindex) + + def _eval_evalf(self, prec): + z = self.args[0]._to_mpmath(prec) + with workprec(prec): + res = mp.airybi(z, derivative=1) + return Expr._from_mpmath(res, prec) + + def _eval_rewrite_as_besselj(self, z, **kwargs): + tt = Rational(2, 3) + a = tt * Pow(-z, Rational(3, 2)) + if re(z).is_negative: + return -z/sqrt(3) * (besselj(-tt, a) + besselj(tt, a)) + + def _eval_rewrite_as_besseli(self, z, **kwargs): + ot = Rational(1, 3) + tt = Rational(2, 3) + a = tt * Pow(z, Rational(3, 2)) + if re(z).is_positive: + return z/sqrt(3) * (besseli(-tt, a) + besseli(tt, a)) + else: + a = Pow(z, Rational(3, 2)) + b = Pow(a, tt) + c = Pow(a, -tt) + return sqrt(ot) * (b*besseli(-tt, tt*a) + z**2*c*besseli(tt, tt*a)) + + def _eval_rewrite_as_hyper(self, z, **kwargs): + pf1 = z**2 / (2*root(3, 6)*gamma(Rational(2, 3))) + pf2 = root(3, 6) / gamma(Rational(1, 3)) + return pf1 * hyper([], [Rational(5, 3)], z**3/9) + pf2 * hyper([], [Rational(1, 3)], z**3/9) + + def _eval_expand_func(self, **hints): + arg = self.args[0] + symbs = arg.free_symbols + + if len(symbs) == 1: + z = symbs.pop() + c = Wild("c", exclude=[z]) + d = Wild("d", exclude=[z]) + m = Wild("m", exclude=[z]) + n = Wild("n", exclude=[z]) + M = arg.match(c*(d*z**n)**m) + if M is not None: + m = M[m] + # The transformation is in principle + # given by 03.08.16.0001.01 but note + # that there is an error in this formula. + # https://functions.wolfram.com/Bessel-TypeFunctions/AiryBiPrime/16/01/01/0001/ + if (3*m).is_integer: + c = M[c] + d = M[d] + n = M[n] + pf = (d**m * z**(n*m)) / (d * z**n)**m + newarg = c * d**m * z**(n*m) + return S.Half * (sqrt(3)*(pf - S.One)*airyaiprime(newarg) + (pf + S.One)*airybiprime(newarg)) + + +class marcumq(DefinedFunction): + r""" + The Marcum Q-function. + + Explanation + =========== + + The Marcum Q-function is defined by the meromorphic continuation of + + .. math:: + Q_m(a, b) = a^{- m + 1} \int_{b}^{\infty} x^{m} e^{- \frac{a^{2}}{2} - \frac{x^{2}}{2}} I_{m - 1}\left(a x\right)\, dx + + Examples + ======== + + >>> from sympy import marcumq + >>> from sympy.abc import m, a, b + >>> marcumq(m, a, b) + marcumq(m, a, b) + + Special values: + + >>> marcumq(m, 0, b) + uppergamma(m, b**2/2)/gamma(m) + >>> marcumq(0, 0, 0) + 0 + >>> marcumq(0, a, 0) + 1 - exp(-a**2/2) + >>> marcumq(1, a, a) + 1/2 + exp(-a**2)*besseli(0, a**2)/2 + >>> marcumq(2, a, a) + 1/2 + exp(-a**2)*besseli(0, a**2)/2 + exp(-a**2)*besseli(1, a**2) + + Differentiation with respect to $a$ and $b$ is supported: + + >>> from sympy import diff + >>> diff(marcumq(m, a, b), a) + a*(-marcumq(m, a, b) + marcumq(m + 1, a, b)) + >>> diff(marcumq(m, a, b), b) + -a**(1 - m)*b**m*exp(-a**2/2 - b**2/2)*besseli(m - 1, a*b) + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Marcum_Q-function + .. 
[2] https://mathworld.wolfram.com/MarcumQ-Function.html + + """ + + @classmethod + def eval(cls, m, a, b): + if a is S.Zero: + if m is S.Zero and b is S.Zero: + return S.Zero + return uppergamma(m, b**2 * S.Half) / gamma(m) + + if m is S.Zero and b is S.Zero: + return 1 - 1 / exp(a**2 * S.Half) + + if a == b: + if m is S.One: + return (1 + exp(-a**2) * besseli(0, a**2))*S.Half + if m == 2: + return S.Half + S.Half * exp(-a**2) * besseli(0, a**2) + exp(-a**2) * besseli(1, a**2) + + if a.is_zero: + if m.is_zero and b.is_zero: + return S.Zero + return uppergamma(m, b**2*S.Half) / gamma(m) + + if m.is_zero and b.is_zero: + return 1 - 1 / exp(a**2*S.Half) + + def fdiff(self, argindex=2): + m, a, b = self.args + if argindex == 2: + return a * (-marcumq(m, a, b) + marcumq(1+m, a, b)) + elif argindex == 3: + return (-b**m / a**(m-1)) * exp(-(a**2 + b**2)/2) * besseli(m-1, a*b) + else: + raise ArgumentIndexError(self, argindex) + + def _eval_rewrite_as_Integral(self, m, a, b, **kwargs): + from sympy.integrals.integrals import Integral + x = kwargs.get('x', Dummy(uniquely_named_symbol('x').name)) + return a ** (1 - m) * \ + Integral(x**m * exp(-(x**2 + a**2)/2) * besseli(m-1, a*x), [x, b, S.Infinity]) + + def _eval_rewrite_as_Sum(self, m, a, b, **kwargs): + from sympy.concrete.summations import Sum + k = kwargs.get('k', Dummy('k')) + return exp(-(a**2 + b**2) / 2) * Sum((a/b)**k * besseli(k, a*b), [k, 1-m, S.Infinity]) + + def _eval_rewrite_as_besseli(self, m, a, b, **kwargs): + if a == b: + if m == 1: + return (1 + exp(-a**2) * besseli(0, a**2)) / 2 + if m.is_Integer and m >= 2: + s = sum(besseli(i, a**2) for i in range(1, m)) + return S.Half + exp(-a**2) * besseli(0, a**2) / 2 + exp(-a**2) * s + + def _eval_is_zero(self): + if all(arg.is_zero for arg in self.args): + return True + +class _besseli(DefinedFunction): + """ + Helper function to make the $\\mathrm{besseli}(nu, z)$ + function tractable for the Gruntz algorithm. + + """ + + def _eval_aseries(self, n, args0, x, logx): + from sympy.functions.combinatorial.factorials import RisingFactorial + from sympy.series.order import Order + point = args0[1] + + if point in [S.Infinity, S.NegativeInfinity]: + nu, z = self.args + l = [((RisingFactorial(Rational(2*nu - 1, 2), k)*RisingFactorial( + Rational(2*nu + 1, 2), k))/((2)**(k)*z**(Rational(2*k + 1, 2))*factorial(k))) for k in range(n)] + return sqrt(pi/(2))*(Add(*l)) + Order(1/z**(Rational(2*n + 1, 2)), x) + + return super()._eval_aseries(n, args0, x, logx) + + def _eval_rewrite_as_intractable(self, nu, z, **kwargs): + return exp(-z)*besseli(nu, z) + + def _eval_nseries(self, x, n, logx, cdir=0): + x0 = self.args[0].limit(x, 0) + if x0.is_zero: + f = self._eval_rewrite_as_intractable(*self.args) + return f._eval_nseries(x, n, logx) + return super()._eval_nseries(x, n, logx) + + +class _besselk(DefinedFunction): + """ + Helper function to make the $\\mathrm{besselk}(nu, z)$ + function tractable for the Gruntz algorithm. 
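+
+    Its value equals ``exp(z)*besselk(nu, z)``; ``_eval_rewrite_as_intractable``
+    converts it back to the ordinary ``besselk``.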
+ + """ + + def _eval_aseries(self, n, args0, x, logx): + from sympy.functions.combinatorial.factorials import RisingFactorial + from sympy.series.order import Order + point = args0[1] + + if point in [S.Infinity, S.NegativeInfinity]: + nu, z = self.args + l = [((RisingFactorial(Rational(2*nu - 1, 2), k)*RisingFactorial( + Rational(2*nu + 1, 2), k))/((-2)**(k)*z**(Rational(2*k + 1, 2))*factorial(k))) for k in range(n)] + return sqrt(pi/(2))*(Add(*l)) + Order(1/z**(Rational(2*n + 1, 2)), x) + + return super()._eval_aseries(n, args0, x, logx) + + def _eval_rewrite_as_intractable(self,nu, z, **kwargs): + return exp(z)*besselk(nu, z) + + def _eval_nseries(self, x, n, logx, cdir=0): + x0 = self.args[0].limit(x, 0) + if x0.is_zero: + f = self._eval_rewrite_as_intractable(*self.args) + return f._eval_nseries(x, n, logx) + return super()._eval_nseries(x, n, logx) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/special/beta_functions.py b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/special/beta_functions.py new file mode 100644 index 0000000000000000000000000000000000000000..337ae6c71fe5a38cc0fcd819e9d489ebbbd1946b --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/special/beta_functions.py @@ -0,0 +1,389 @@ +from sympy.core import S +from sympy.core.function import DefinedFunction, ArgumentIndexError +from sympy.core.symbol import Dummy, uniquely_named_symbol +from sympy.functions.special.gamma_functions import gamma, digamma +from sympy.functions.combinatorial.numbers import catalan +from sympy.functions.elementary.complexes import conjugate + +# See mpmath #569 and SymPy #20569 +def betainc_mpmath_fix(a, b, x1, x2, reg=0): + from mpmath import betainc, mpf + if x1 == x2: + return mpf(0) + else: + return betainc(a, b, x1, x2, reg) + +############################################################################### +############################ COMPLETE BETA FUNCTION ########################## +############################################################################### + +class beta(DefinedFunction): + r""" + The beta integral is called the Eulerian integral of the first kind by + Legendre: + + .. math:: + \mathrm{B}(x,y) \int^{1}_{0} t^{x-1} (1-t)^{y-1} \mathrm{d}t. + + Explanation + =========== + + The Beta function or Euler's first integral is closely associated + with the gamma function. The Beta function is often used in probability + theory and mathematical statistics. It satisfies properties like: + + .. math:: + \mathrm{B}(a,1) = \frac{1}{a} \\ + \mathrm{B}(a,b) = \mathrm{B}(b,a) \\ + \mathrm{B}(a,b) = \frac{\Gamma(a) \Gamma(b)}{\Gamma(a+b)} + + Therefore for integral values of $a$ and $b$: + + .. math:: + \mathrm{B} = \frac{(a-1)! (b-1)!}{(a+b-1)!} + + A special case of the Beta function when `x = y` is the + Central Beta function. It satisfies properties like: + + .. 
math:: + \mathrm{B}(x) = 2^{1 - 2x}\mathrm{B}(x, \frac{1}{2}) + \mathrm{B}(x) = 2^{1 - 2x} cos(\pi x) \mathrm{B}(\frac{1}{2} - x, x) + \mathrm{B}(x) = \int_{0}^{1} \frac{t^x}{(1 + t)^{2x}} dt + \mathrm{B}(x) = \frac{2}{x} \prod_{n = 1}^{\infty} \frac{n(n + 2x)}{(n + x)^2} + + Examples + ======== + + >>> from sympy import I, pi + >>> from sympy.abc import x, y + + The Beta function obeys the mirror symmetry: + + >>> from sympy import beta, conjugate + >>> conjugate(beta(x, y)) + beta(conjugate(x), conjugate(y)) + + Differentiation with respect to both $x$ and $y$ is supported: + + >>> from sympy import beta, diff + >>> diff(beta(x, y), x) + (polygamma(0, x) - polygamma(0, x + y))*beta(x, y) + + >>> diff(beta(x, y), y) + (polygamma(0, y) - polygamma(0, x + y))*beta(x, y) + + >>> diff(beta(x), x) + 2*(polygamma(0, x) - polygamma(0, 2*x))*beta(x, x) + + We can numerically evaluate the Beta function to + arbitrary precision for any complex numbers x and y: + + >>> from sympy import beta + >>> beta(pi).evalf(40) + 0.02671848900111377452242355235388489324562 + + >>> beta(1 + I).evalf(20) + -0.2112723729365330143 - 0.7655283165378005676*I + + See Also + ======== + + gamma: Gamma function. + uppergamma: Upper incomplete gamma function. + lowergamma: Lower incomplete gamma function. + polygamma: Polygamma function. + loggamma: Log Gamma function. + digamma: Digamma function. + trigamma: Trigamma function. + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Beta_function + .. [2] https://mathworld.wolfram.com/BetaFunction.html + .. [3] https://dlmf.nist.gov/5.12 + + """ + unbranched = True + + def fdiff(self, argindex): + x, y = self.args + if argindex == 1: + # Diff wrt x + return beta(x, y)*(digamma(x) - digamma(x + y)) + elif argindex == 2: + # Diff wrt y + return beta(x, y)*(digamma(y) - digamma(x + y)) + else: + raise ArgumentIndexError(self, argindex) + + @classmethod + def eval(cls, x, y=None): + if y is None: + return beta(x, x) + if x.is_Number and y.is_Number: + return beta(x, y, evaluate=False).doit() + + def doit(self, **hints): + x = xold = self.args[0] + # Deal with unevaluated single argument beta + single_argument = len(self.args) == 1 + y = yold = self.args[0] if single_argument else self.args[1] + if hints.get('deep', True): + x = x.doit(**hints) + y = y.doit(**hints) + if y.is_zero or x.is_zero: + return S.ComplexInfinity + if y is S.One: + return 1/x + if x is S.One: + return 1/y + if y == x + 1: + return 1/(x*y*catalan(x)) + s = x + y + if (s.is_integer and s.is_negative and x.is_integer is False and + y.is_integer is False): + return S.Zero + if x == xold and y == yold and not single_argument: + return self + return beta(x, y) + + def _eval_expand_func(self, **hints): + x, y = self.args + return gamma(x)*gamma(y) / gamma(x + y) + + def _eval_is_real(self): + return self.args[0].is_real and self.args[1].is_real + + def _eval_conjugate(self): + return self.func(self.args[0].conjugate(), self.args[1].conjugate()) + + def _eval_rewrite_as_gamma(self, x, y, piecewise=True, **kwargs): + return self._eval_expand_func(**kwargs) + + def _eval_rewrite_as_Integral(self, x, y, **kwargs): + from sympy.integrals.integrals import Integral + t = Dummy(uniquely_named_symbol('t', [x, y]).name) + return Integral(t**(x - 1)*(1 - t)**(y - 1), (t, 0, 1)) + +############################################################################### +########################## INCOMPLETE BETA FUNCTION ########################### 
+############################################################################### + +class betainc(DefinedFunction): + r""" + The Generalized Incomplete Beta function is defined as + + .. math:: + \mathrm{B}_{(x_1, x_2)}(a, b) = \int_{x_1}^{x_2} t^{a - 1} (1 - t)^{b - 1} dt + + The Incomplete Beta function is a special case + of the Generalized Incomplete Beta function : + + .. math:: \mathrm{B}_z (a, b) = \mathrm{B}_{(0, z)}(a, b) + + The Incomplete Beta function satisfies : + + .. math:: \mathrm{B}_z (a, b) = (-1)^a \mathrm{B}_{\frac{z}{z - 1}} (a, 1 - a - b) + + The Beta function is a special case of the Incomplete Beta function : + + .. math:: \mathrm{B}(a, b) = \mathrm{B}_{1}(a, b) + + Examples + ======== + + >>> from sympy import betainc, symbols, conjugate + >>> a, b, x, x1, x2 = symbols('a b x x1 x2') + + The Generalized Incomplete Beta function is given by: + + >>> betainc(a, b, x1, x2) + betainc(a, b, x1, x2) + + The Incomplete Beta function can be obtained as follows: + + >>> betainc(a, b, 0, x) + betainc(a, b, 0, x) + + The Incomplete Beta function obeys the mirror symmetry: + + >>> conjugate(betainc(a, b, x1, x2)) + betainc(conjugate(a), conjugate(b), conjugate(x1), conjugate(x2)) + + We can numerically evaluate the Incomplete Beta function to + arbitrary precision for any complex numbers a, b, x1 and x2: + + >>> from sympy import betainc, I + >>> betainc(2, 3, 4, 5).evalf(10) + 56.08333333 + >>> betainc(0.75, 1 - 4*I, 0, 2 + 3*I).evalf(25) + 0.2241657956955709603655887 + 0.3619619242700451992411724*I + + The Generalized Incomplete Beta function can be expressed + in terms of the Generalized Hypergeometric function. + + >>> from sympy import hyper + >>> betainc(a, b, x1, x2).rewrite(hyper) + (-x1**a*hyper((a, 1 - b), (a + 1,), x1) + x2**a*hyper((a, 1 - b), (a + 1,), x2))/a + + See Also + ======== + + beta: Beta function + hyper: Generalized Hypergeometric function + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Beta_function#Incomplete_beta_function + .. [2] https://dlmf.nist.gov/8.17 + .. [3] https://functions.wolfram.com/GammaBetaErf/Beta4/ + .. [4] https://functions.wolfram.com/GammaBetaErf/BetaRegularized4/02/ + + """ + nargs = 4 + unbranched = True + + def fdiff(self, argindex): + a, b, x1, x2 = self.args + if argindex == 3: + # Diff wrt x1 + return -(1 - x1)**(b - 1)*x1**(a - 1) + elif argindex == 4: + # Diff wrt x2 + return (1 - x2)**(b - 1)*x2**(a - 1) + else: + raise ArgumentIndexError(self, argindex) + + def _eval_mpmath(self): + return betainc_mpmath_fix, self.args + + def _eval_is_real(self): + if all(arg.is_real for arg in self.args): + return True + + def _eval_conjugate(self): + return self.func(*map(conjugate, self.args)) + + def _eval_rewrite_as_Integral(self, a, b, x1, x2, **kwargs): + from sympy.integrals.integrals import Integral + t = Dummy(uniquely_named_symbol('t', [a, b, x1, x2]).name) + return Integral(t**(a - 1)*(1 - t)**(b - 1), (t, x1, x2)) + + def _eval_rewrite_as_hyper(self, a, b, x1, x2, **kwargs): + from sympy.functions.special.hyper import hyper + return (x2**a * hyper((a, 1 - b), (a + 1,), x2) - x1**a * hyper((a, 1 - b), (a + 1,), x1)) / a + +############################################################################### +#################### REGULARIZED INCOMPLETE BETA FUNCTION ##################### +############################################################################### + +class betainc_regularized(DefinedFunction): + r""" + The Generalized Regularized Incomplete Beta function is given by + + .. 
math:: + \mathrm{I}_{(x_1, x_2)}(a, b) = \frac{\mathrm{B}_{(x_1, x_2)}(a, b)}{\mathrm{B}(a, b)} + + The Regularized Incomplete Beta function is a special case + of the Generalized Regularized Incomplete Beta function : + + .. math:: \mathrm{I}_z (a, b) = \mathrm{I}_{(0, z)}(a, b) + + The Regularized Incomplete Beta function is the cumulative distribution + function of the beta distribution. + + Examples + ======== + + >>> from sympy import betainc_regularized, symbols, conjugate + >>> a, b, x, x1, x2 = symbols('a b x x1 x2') + + The Generalized Regularized Incomplete Beta + function is given by: + + >>> betainc_regularized(a, b, x1, x2) + betainc_regularized(a, b, x1, x2) + + The Regularized Incomplete Beta function + can be obtained as follows: + + >>> betainc_regularized(a, b, 0, x) + betainc_regularized(a, b, 0, x) + + The Regularized Incomplete Beta function + obeys the mirror symmetry: + + >>> conjugate(betainc_regularized(a, b, x1, x2)) + betainc_regularized(conjugate(a), conjugate(b), conjugate(x1), conjugate(x2)) + + We can numerically evaluate the Regularized Incomplete Beta function + to arbitrary precision for any complex numbers a, b, x1 and x2: + + >>> from sympy import betainc_regularized, pi, E + >>> betainc_regularized(1, 2, 0, 0.25).evalf(10) + 0.4375000000 + >>> betainc_regularized(pi, E, 0, 1).evalf(5) + 1.00000 + + The Generalized Regularized Incomplete Beta function can be + expressed in terms of the Generalized Hypergeometric function. + + >>> from sympy import hyper + >>> betainc_regularized(a, b, x1, x2).rewrite(hyper) + (-x1**a*hyper((a, 1 - b), (a + 1,), x1) + x2**a*hyper((a, 1 - b), (a + 1,), x2))/(a*beta(a, b)) + + See Also + ======== + + beta: Beta function + hyper: Generalized Hypergeometric function + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Beta_function#Incomplete_beta_function + .. [2] https://dlmf.nist.gov/8.17 + .. [3] https://functions.wolfram.com/GammaBetaErf/Beta4/ + .. 
[4] https://functions.wolfram.com/GammaBetaErf/BetaRegularized4/02/ + + """ + nargs = 4 + unbranched = True + + def __new__(cls, a, b, x1, x2): + return super().__new__(cls, a, b, x1, x2) + + def _eval_mpmath(self): + return betainc_mpmath_fix, (*self.args, S(1)) + + def fdiff(self, argindex): + a, b, x1, x2 = self.args + if argindex == 3: + # Diff wrt x1 + return -(1 - x1)**(b - 1)*x1**(a - 1) / beta(a, b) + elif argindex == 4: + # Diff wrt x2 + return (1 - x2)**(b - 1)*x2**(a - 1) / beta(a, b) + else: + raise ArgumentIndexError(self, argindex) + + def _eval_is_real(self): + if all(arg.is_real for arg in self.args): + return True + + def _eval_conjugate(self): + return self.func(*map(conjugate, self.args)) + + def _eval_rewrite_as_Integral(self, a, b, x1, x2, **kwargs): + from sympy.integrals.integrals import Integral + t = Dummy(uniquely_named_symbol('t', [a, b, x1, x2]).name) + integrand = t**(a - 1)*(1 - t)**(b - 1) + expr = Integral(integrand, (t, x1, x2)) + return expr / Integral(integrand, (t, 0, 1)) + + def _eval_rewrite_as_hyper(self, a, b, x1, x2, **kwargs): + from sympy.functions.special.hyper import hyper + expr = (x2**a * hyper((a, 1 - b), (a + 1,), x2) - x1**a * hyper((a, 1 - b), (a + 1,), x1)) / a + return expr / beta(a, b) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/special/bsplines.py b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/special/bsplines.py new file mode 100644 index 0000000000000000000000000000000000000000..50c9141e841288aa02457c466fc6573f9a20d09f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/special/bsplines.py @@ -0,0 +1,348 @@ +from sympy.core import S, sympify +from sympy.core.symbol import (Dummy, symbols) +from sympy.functions import Piecewise, piecewise_fold +from sympy.logic.boolalg import And +from sympy.sets.sets import Interval + +from functools import lru_cache + + +def _ivl(cond, x): + """return the interval corresponding to the condition + + Conditions in spline's Piecewise give the range over + which an expression is valid like (lo <= x) & (x <= hi). + This function returns (lo, hi). 
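+
+    For example, ``_ivl((1 <= x) & (x <= 2), x)`` returns ``(1, 2)``.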
+ """ + if isinstance(cond, And) and len(cond.args) == 2: + a, b = cond.args + if a.lts == x: + a, b = b, a + return a.lts, b.gts + raise TypeError('unexpected cond type: %s' % cond) + + +def _add_splines(c, b1, d, b2, x): + """Construct c*b1 + d*b2.""" + + if S.Zero in (b1, c): + rv = piecewise_fold(d * b2) + elif S.Zero in (b2, d): + rv = piecewise_fold(c * b1) + else: + new_args = [] + # Just combining the Piecewise without any fancy optimization + p1 = piecewise_fold(c * b1) + p2 = piecewise_fold(d * b2) + + # Search all Piecewise arguments except (0, True) + p2args = list(p2.args[:-1]) + + # This merging algorithm assumes the conditions in + # p1 and p2 are sorted + for arg in p1.args[:-1]: + expr = arg.expr + cond = arg.cond + + lower = _ivl(cond, x)[0] + + # Check p2 for matching conditions that can be merged + for i, arg2 in enumerate(p2args): + expr2 = arg2.expr + cond2 = arg2.cond + + lower_2, upper_2 = _ivl(cond2, x) + if cond2 == cond: + # Conditions match, join expressions + expr += expr2 + # Remove matching element + del p2args[i] + # No need to check the rest + break + elif lower_2 < lower and upper_2 <= lower: + # Check if arg2 condition smaller than arg1, + # add to new_args by itself (no match expected + # in p1) + new_args.append(arg2) + del p2args[i] + break + + # Checked all, add expr and cond + new_args.append((expr, cond)) + + # Add remaining items from p2args + new_args.extend(p2args) + + # Add final (0, True) + new_args.append((0, True)) + + rv = Piecewise(*new_args, evaluate=False) + + return rv.expand() + + +@lru_cache(maxsize=128) +def bspline_basis(d, knots, n, x): + """ + The $n$-th B-spline at $x$ of degree $d$ with knots. + + Explanation + =========== + + B-Splines are piecewise polynomials of degree $d$. They are defined on a + set of knots, which is a sequence of integers or floats. + + Examples + ======== + + The 0th degree splines have a value of 1 on a single interval: + + >>> from sympy import bspline_basis + >>> from sympy.abc import x + >>> d = 0 + >>> knots = tuple(range(5)) + >>> bspline_basis(d, knots, 0, x) + Piecewise((1, (x >= 0) & (x <= 1)), (0, True)) + + For a given ``(d, knots)`` there are ``len(knots)-d-1`` B-splines + defined, that are indexed by ``n`` (starting at 0). + + Here is an example of a cubic B-spline: + + >>> bspline_basis(3, tuple(range(5)), 0, x) + Piecewise((x**3/6, (x >= 0) & (x <= 1)), + (-x**3/2 + 2*x**2 - 2*x + 2/3, + (x >= 1) & (x <= 2)), + (x**3/2 - 4*x**2 + 10*x - 22/3, + (x >= 2) & (x <= 3)), + (-x**3/6 + 2*x**2 - 8*x + 32/3, + (x >= 3) & (x <= 4)), + (0, True)) + + By repeating knot points, you can introduce discontinuities in the + B-splines and their derivatives: + + >>> d = 1 + >>> knots = (0, 0, 2, 3, 4) + >>> bspline_basis(d, knots, 0, x) + Piecewise((1 - x/2, (x >= 0) & (x <= 2)), (0, True)) + + It is quite time consuming to construct and evaluate B-splines. If + you need to evaluate a B-spline many times, it is best to lambdify them + first: + + >>> from sympy import lambdify + >>> d = 3 + >>> knots = tuple(range(10)) + >>> b0 = bspline_basis(d, knots, 0, x) + >>> f = lambdify(x, b0) + >>> y = f(0.5) + + Parameters + ========== + + d : integer + degree of bspline + + knots : list of integer values + list of knots points of bspline + + n : integer + $n$-th B-spline + + x : symbol + + See Also + ======== + + bspline_basis_set + + References + ========== + + .. 
[1] https://en.wikipedia.org/wiki/B-spline + + """ + # make sure x has no assumptions so conditions don't evaluate + xvar = x + x = Dummy() + + knots = tuple(sympify(k) for k in knots) + d = int(d) + n = int(n) + n_knots = len(knots) + n_intervals = n_knots - 1 + if n + d + 1 > n_intervals: + raise ValueError("n + d + 1 must not exceed len(knots) - 1") + if d == 0: + result = Piecewise( + (S.One, Interval(knots[n], knots[n + 1]).contains(x)), (0, True) + ) + elif d > 0: + denom = knots[n + d + 1] - knots[n + 1] + if denom != S.Zero: + B = (knots[n + d + 1] - x) / denom + b2 = bspline_basis(d - 1, knots, n + 1, x) + else: + b2 = B = S.Zero + + denom = knots[n + d] - knots[n] + if denom != S.Zero: + A = (x - knots[n]) / denom + b1 = bspline_basis(d - 1, knots, n, x) + else: + b1 = A = S.Zero + + result = _add_splines(A, b1, B, b2, x) + else: + raise ValueError("degree must be non-negative: %r" % n) + + # return result with user-given x + return result.xreplace({x: xvar}) + + +def bspline_basis_set(d, knots, x): + """ + Return the ``len(knots)-d-1`` B-splines at *x* of degree *d* + with *knots*. + + Explanation + =========== + + This function returns a list of piecewise polynomials that are the + ``len(knots)-d-1`` B-splines of degree *d* for the given knots. + This function calls ``bspline_basis(d, knots, n, x)`` for different + values of *n*. + + Examples + ======== + + >>> from sympy import bspline_basis_set + >>> from sympy.abc import x + >>> d = 2 + >>> knots = range(5) + >>> splines = bspline_basis_set(d, knots, x) + >>> splines + [Piecewise((x**2/2, (x >= 0) & (x <= 1)), + (-x**2 + 3*x - 3/2, (x >= 1) & (x <= 2)), + (x**2/2 - 3*x + 9/2, (x >= 2) & (x <= 3)), + (0, True)), + Piecewise((x**2/2 - x + 1/2, (x >= 1) & (x <= 2)), + (-x**2 + 5*x - 11/2, (x >= 2) & (x <= 3)), + (x**2/2 - 4*x + 8, (x >= 3) & (x <= 4)), + (0, True))] + + Parameters + ========== + + d : integer + degree of bspline + + knots : list of integers + list of knots points of bspline + + x : symbol + + See Also + ======== + + bspline_basis + + """ + n_splines = len(knots) - d - 1 + return [bspline_basis(d, tuple(knots), i, x) for i in range(n_splines)] + + +def interpolating_spline(d, x, X, Y): + """ + Return spline of degree *d*, passing through the given *X* + and *Y* values. + + Explanation + =========== + + This function returns a piecewise function such that each part is + a polynomial of degree not greater than *d*. The value of *d* + must be 1 or greater and the values of *X* must be strictly + increasing. 
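+    The number of *X* and *Y* values must be equal and at least ``d + 1``.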
+ + Examples + ======== + + >>> from sympy import interpolating_spline + >>> from sympy.abc import x + >>> interpolating_spline(1, x, [1, 2, 4, 7], [3, 6, 5, 7]) + Piecewise((3*x, (x >= 1) & (x <= 2)), + (7 - x/2, (x >= 2) & (x <= 4)), + (2*x/3 + 7/3, (x >= 4) & (x <= 7))) + >>> interpolating_spline(3, x, [-2, 0, 1, 3, 4], [4, 2, 1, 1, 3]) + Piecewise((7*x**3/117 + 7*x**2/117 - 131*x/117 + 2, (x >= -2) & (x <= 1)), + (10*x**3/117 - 2*x**2/117 - 122*x/117 + 77/39, (x >= 1) & (x <= 4))) + + Parameters + ========== + + d : integer + Degree of Bspline strictly greater than equal to one + + x : symbol + + X : list of strictly increasing real values + list of X coordinates through which the spline passes + + Y : list of real values + list of corresponding Y coordinates through which the spline passes + + See Also + ======== + + bspline_basis_set, interpolating_poly + + """ + from sympy.solvers.solveset import linsolve + from sympy.matrices.dense import Matrix + + # Input sanitization + d = sympify(d) + if not (d.is_Integer and d.is_positive): + raise ValueError("Spline degree must be a positive integer, not %s." % d) + if len(X) != len(Y): + raise ValueError("Number of X and Y coordinates must be the same.") + if len(X) < d + 1: + raise ValueError("Degree must be less than the number of control points.") + if not all(a < b for a, b in zip(X, X[1:])): + raise ValueError("The x-coordinates must be strictly increasing.") + X = [sympify(i) for i in X] + + # Evaluating knots value + if d.is_odd: + j = (d + 1) // 2 + interior_knots = X[j:-j] + else: + j = d // 2 + interior_knots = [ + (a + b)/2 for a, b in zip(X[j : -j - 1], X[j + 1 : -j]) + ] + + knots = [X[0]] * (d + 1) + list(interior_knots) + [X[-1]] * (d + 1) + + basis = bspline_basis_set(d, knots, x) + + A = [[b.subs(x, v) for b in basis] for v in X] + + coeff = linsolve((Matrix(A), Matrix(Y)), symbols("c0:{}".format(len(X)), cls=Dummy)) + coeff = list(coeff)[0] + intervals = {c for b in basis for (e, c) in b.args if c != True} + + # Sorting the intervals + # ival contains the end-points of each interval + intervals = sorted(intervals, key=lambda c: _ivl(c, x)) + + basis_dicts = [{c: e for (e, c) in b.args} for b in basis] + spline = [] + for i in intervals: + piece = sum( + [c * d.get(i, S.Zero) for (c, d) in zip(coeff, basis_dicts)], S.Zero + ) + spline.append((piece, i)) + return Piecewise(*spline) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/special/delta_functions.py b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/special/delta_functions.py new file mode 100644 index 0000000000000000000000000000000000000000..698d8c0c3ba5e68c2f084e83e7a9ec070ce83307 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/special/delta_functions.py @@ -0,0 +1,664 @@ +from sympy.core import S, diff +from sympy.core.function import DefinedFunction, ArgumentIndexError +from sympy.core.logic import fuzzy_not +from sympy.core.relational import Eq, Ne +from sympy.functions.elementary.complexes import im, sign +from sympy.functions.elementary.piecewise import Piecewise +from sympy.polys.polyerrors import PolynomialError +from sympy.polys.polyroots import roots +from sympy.utilities.misc import filldedent + + +############################################################################### +################################ DELTA FUNCTION ############################### +############################################################################### + + +class DiracDelta(DefinedFunction): + r""" + The 
DiracDelta function and its derivatives. + + Explanation + =========== + + DiracDelta is not an ordinary function. It can be rigorously defined either + as a distribution or as a measure. + + DiracDelta only makes sense in definite integrals, and in particular, + integrals of the form ``Integral(f(x)*DiracDelta(x - x0), (x, a, b))``, + where it equals ``f(x0)`` if ``a <= x0 <= b`` and ``0`` otherwise. Formally, + DiracDelta acts in some ways like a function that is ``0`` everywhere except + at ``0``, but in many ways it also does not. It can often be useful to treat + DiracDelta in formal ways, building up and manipulating expressions with + delta functions (which may eventually be integrated), but care must be taken + to not treat it as a real function. SymPy's ``oo`` is similar. It only + truly makes sense formally in certain contexts (such as integration limits), + but SymPy allows its use everywhere, and it tries to be consistent with + operations on it (like ``1/oo``), but it is easy to get into trouble and get + wrong results if ``oo`` is treated too much like a number. Similarly, if + DiracDelta is treated too much like a function, it is easy to get wrong or + nonsensical results. + + DiracDelta function has the following properties: + + 1) $\frac{d}{d x} \theta(x) = \delta(x)$ + 2) $\int_{-\infty}^\infty \delta(x - a)f(x)\, dx = f(a)$ and $\int_{a- + \epsilon}^{a+\epsilon} \delta(x - a)f(x)\, dx = f(a)$ + 3) $\delta(x) = 0$ for all $x \neq 0$ + 4) $\delta(g(x)) = \sum_i \frac{\delta(x - x_i)}{\|g'(x_i)\|}$ where $x_i$ + are the roots of $g$ + 5) $\delta(-x) = \delta(x)$ + + Derivatives of ``k``-th order of DiracDelta have the following properties: + + 6) $\delta(x, k) = 0$ for all $x \neq 0$ + 7) $\delta(-x, k) = -\delta(x, k)$ for odd $k$ + 8) $\delta(-x, k) = \delta(x, k)$ for even $k$ + + Examples + ======== + + >>> from sympy import DiracDelta, diff, pi + >>> from sympy.abc import x, y + + >>> DiracDelta(x) + DiracDelta(x) + >>> DiracDelta(1) + 0 + >>> DiracDelta(-1) + 0 + >>> DiracDelta(pi) + 0 + >>> DiracDelta(x - 4).subs(x, 4) + DiracDelta(0) + >>> diff(DiracDelta(x)) + DiracDelta(x, 1) + >>> diff(DiracDelta(x - 1), x, 2) + DiracDelta(x - 1, 2) + >>> diff(DiracDelta(x**2 - 1), x, 2) + 2*(2*x**2*DiracDelta(x**2 - 1, 2) + DiracDelta(x**2 - 1, 1)) + >>> DiracDelta(3*x).is_simple(x) + True + >>> DiracDelta(x**2).is_simple(x) + False + >>> DiracDelta((x**2 - 1)*y).expand(diracdelta=True, wrt=x) + DiracDelta(x - 1)/(2*Abs(y)) + DiracDelta(x + 1)/(2*Abs(y)) + + See Also + ======== + + Heaviside + sympy.simplify.simplify.simplify, is_simple + sympy.functions.special.tensor_functions.KroneckerDelta + + References + ========== + + .. [1] https://mathworld.wolfram.com/DeltaFunction.html + + """ + + is_real = True + + def fdiff(self, argindex=1): + """ + Returns the first derivative of a DiracDelta Function. + + Explanation + =========== + + The difference between ``diff()`` and ``fdiff()`` is: ``diff()`` is the + user-level function and ``fdiff()`` is an object method. ``fdiff()`` is + a convenience method available in the ``Function`` class. It returns + the derivative of the function without considering the chain rule. + ``diff(function, x)`` calls ``Function._eval_derivative`` which in turn + calls ``fdiff()`` internally to compute the derivative of the function. 
+ + Examples + ======== + + >>> from sympy import DiracDelta, diff + >>> from sympy.abc import x + + >>> DiracDelta(x).fdiff() + DiracDelta(x, 1) + + >>> DiracDelta(x, 1).fdiff() + DiracDelta(x, 2) + + >>> DiracDelta(x**2 - 1).fdiff() + DiracDelta(x**2 - 1, 1) + + >>> diff(DiracDelta(x, 1)).fdiff() + DiracDelta(x, 3) + + Parameters + ========== + + argindex : integer + degree of derivative + + """ + if argindex == 1: + #I didn't know if there is a better way to handle default arguments + k = 0 + if len(self.args) > 1: + k = self.args[1] + return self.func(self.args[0], k + 1) + else: + raise ArgumentIndexError(self, argindex) + + @classmethod + def eval(cls, arg, k=S.Zero): + """ + Returns a simplified form or a value of DiracDelta depending on the + argument passed by the DiracDelta object. + + Explanation + =========== + + The ``eval()`` method is automatically called when the ``DiracDelta`` + class is about to be instantiated and it returns either some simplified + instance or the unevaluated instance depending on the argument passed. + In other words, ``eval()`` method is not needed to be called explicitly, + it is being called and evaluated once the object is called. + + Examples + ======== + + >>> from sympy import DiracDelta, S + >>> from sympy.abc import x + + >>> DiracDelta(x) + DiracDelta(x) + + >>> DiracDelta(-x, 1) + -DiracDelta(x, 1) + + >>> DiracDelta(1) + 0 + + >>> DiracDelta(5, 1) + 0 + + >>> DiracDelta(0) + DiracDelta(0) + + >>> DiracDelta(-1) + 0 + + >>> DiracDelta(S.NaN) + nan + + >>> DiracDelta(x - 100).subs(x, 5) + 0 + + >>> DiracDelta(x - 100).subs(x, 100) + DiracDelta(0) + + Parameters + ========== + + k : integer + order of derivative + + arg : argument passed to DiracDelta + + """ + if not k.is_Integer or k.is_negative: + raise ValueError("Error: the second argument of DiracDelta must be \ + a non-negative integer, %s given instead." % (k,)) + if arg is S.NaN: + return S.NaN + if arg.is_nonzero: + return S.Zero + if fuzzy_not(im(arg).is_zero): + raise ValueError(filldedent(''' + Function defined only for Real Values. + Complex part: %s found in %s .''' % ( + repr(im(arg)), repr(arg)))) + c, nc = arg.args_cnc() + if c and c[0] is S.NegativeOne: + # keep this fast and simple instead of using + # could_extract_minus_sign + if k.is_odd: + return -cls(-arg, k) + elif k.is_even: + return cls(-arg, k) if k else cls(-arg) + elif k.is_zero: + return cls(arg, evaluate=False) + + def _eval_expand_diracdelta(self, **hints): + """ + Compute a simplified representation of the function using + property number 4. Pass ``wrt`` as a hint to expand the expression + with respect to a particular variable. + + Explanation + =========== + + ``wrt`` is: + + - a variable with respect to which a DiracDelta expression will + get expanded. 
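+
+        If ``wrt`` is not given and the expression has exactly one free
+        symbol, that symbol is used automatically; otherwise a ``TypeError``
+        is raised.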
+ + Examples + ======== + + >>> from sympy import DiracDelta + >>> from sympy.abc import x, y + + >>> DiracDelta(x*y).expand(diracdelta=True, wrt=x) + DiracDelta(x)/Abs(y) + >>> DiracDelta(x*y).expand(diracdelta=True, wrt=y) + DiracDelta(y)/Abs(x) + + >>> DiracDelta(x**2 + x - 2).expand(diracdelta=True, wrt=x) + DiracDelta(x - 1)/3 + DiracDelta(x + 2)/3 + + See Also + ======== + + is_simple, Diracdelta + + """ + wrt = hints.get('wrt', None) + if wrt is None: + free = self.free_symbols + if len(free) == 1: + wrt = free.pop() + else: + raise TypeError(filldedent(''' + When there is more than 1 free symbol or variable in the expression, + the 'wrt' keyword is required as a hint to expand when using the + DiracDelta hint.''')) + + if not self.args[0].has(wrt) or (len(self.args) > 1 and self.args[1] != 0 ): + return self + try: + argroots = roots(self.args[0], wrt) + result = 0 + valid = True + darg = abs(diff(self.args[0], wrt)) + for r, m in argroots.items(): + if r.is_real is not False and m == 1: + result += self.func(wrt - r)/darg.subs(wrt, r) + else: + # don't handle non-real and if m != 1 then + # a polynomial will have a zero in the derivative (darg) + # at r + valid = False + break + if valid: + return result + except PolynomialError: + pass + return self + + def is_simple(self, x): + """ + Tells whether the argument(args[0]) of DiracDelta is a linear + expression in *x*. + + Examples + ======== + + >>> from sympy import DiracDelta, cos + >>> from sympy.abc import x, y + + >>> DiracDelta(x*y).is_simple(x) + True + >>> DiracDelta(x*y).is_simple(y) + True + + >>> DiracDelta(x**2 + x - 2).is_simple(x) + False + + >>> DiracDelta(cos(x)).is_simple(x) + False + + Parameters + ========== + + x : can be a symbol + + See Also + ======== + + sympy.simplify.simplify.simplify, DiracDelta + + """ + p = self.args[0].as_poly(x) + if p: + return p.degree() == 1 + return False + + def _eval_rewrite_as_Piecewise(self, *args, **kwargs): + """ + Represents DiracDelta in a piecewise form. + + Examples + ======== + + >>> from sympy import DiracDelta, Piecewise, Symbol + >>> x = Symbol('x') + + >>> DiracDelta(x).rewrite(Piecewise) + Piecewise((DiracDelta(0), Eq(x, 0)), (0, True)) + + >>> DiracDelta(x - 5).rewrite(Piecewise) + Piecewise((DiracDelta(0), Eq(x, 5)), (0, True)) + + >>> DiracDelta(x**2 - 5).rewrite(Piecewise) + Piecewise((DiracDelta(0), Eq(x**2, 5)), (0, True)) + + >>> DiracDelta(x - 5, 4).rewrite(Piecewise) + DiracDelta(x - 5, 4) + + """ + if len(args) == 1: + return Piecewise((DiracDelta(0), Eq(args[0], 0)), (0, True)) + + def _eval_rewrite_as_SingularityFunction(self, *args, **kwargs): + """ + Returns the DiracDelta expression written in the form of Singularity + Functions. + + """ + from sympy.solvers import solve + from sympy.functions.special.singularity_functions import SingularityFunction + if self == DiracDelta(0): + return SingularityFunction(0, 0, -1) + if self == DiracDelta(0, 1): + return SingularityFunction(0, 0, -2) + free = self.free_symbols + if len(free) == 1: + x = (free.pop()) + if len(args) == 1: + return SingularityFunction(x, solve(args[0], x)[0], -1) + return SingularityFunction(x, solve(args[0], x)[0], -args[1] - 1) + else: + # I don't know how to handle the case for DiracDelta expressions + # having arguments with more than one variable. 
+ raise TypeError(filldedent(''' + rewrite(SingularityFunction) does not support + arguments with more that one variable.''')) + + +############################################################################### +############################## HEAVISIDE FUNCTION ############################# +############################################################################### + + +class Heaviside(DefinedFunction): + r""" + Heaviside step function. + + Explanation + =========== + + The Heaviside step function has the following properties: + + 1) $\frac{d}{d x} \theta(x) = \delta(x)$ + 2) $\theta(x) = \begin{cases} 0 & \text{for}\: x < 0 \\ \frac{1}{2} & + \text{for}\: x = 0 \\1 & \text{for}\: x > 0 \end{cases}$ + 3) $\frac{d}{d x} \max(x, 0) = \theta(x)$ + + Heaviside(x) is printed as $\theta(x)$ with the SymPy LaTeX printer. + + The value at 0 is set differently in different fields. SymPy uses 1/2, + which is a convention from electronics and signal processing, and is + consistent with solving improper integrals by Fourier transform and + convolution. + + To specify a different value of Heaviside at ``x=0``, a second argument + can be given. Using ``Heaviside(x, nan)`` gives an expression that will + evaluate to nan for x=0. + + .. versionchanged:: 1.9 ``Heaviside(0)`` now returns 1/2 (before: undefined) + + Examples + ======== + + >>> from sympy import Heaviside, nan + >>> from sympy.abc import x + >>> Heaviside(9) + 1 + >>> Heaviside(-9) + 0 + >>> Heaviside(0) + 1/2 + >>> Heaviside(0, nan) + nan + >>> (Heaviside(x) + 1).replace(Heaviside(x), Heaviside(x, 1)) + Heaviside(x, 1) + 1 + + See Also + ======== + + DiracDelta + + References + ========== + + .. [1] https://mathworld.wolfram.com/HeavisideStepFunction.html + .. [2] https://dlmf.nist.gov/1.16#iv + + """ + + is_real = True + + def fdiff(self, argindex=1): + """ + Returns the first derivative of a Heaviside Function. + + Examples + ======== + + >>> from sympy import Heaviside, diff + >>> from sympy.abc import x + + >>> Heaviside(x).fdiff() + DiracDelta(x) + + >>> Heaviside(x**2 - 1).fdiff() + DiracDelta(x**2 - 1) + + >>> diff(Heaviside(x)).fdiff() + DiracDelta(x, 1) + + Parameters + ========== + + argindex : integer + order of derivative + + """ + if argindex == 1: + return DiracDelta(self.args[0]) + else: + raise ArgumentIndexError(self, argindex) + + def __new__(cls, arg, H0=S.Half, **options): + if isinstance(H0, Heaviside) and len(H0.args) == 1: + H0 = S.Half + return super(cls, cls).__new__(cls, arg, H0, **options) + + @property + def pargs(self): + """Args without default S.Half""" + args = self.args + if args[1] is S.Half: + args = args[:1] + return args + + @classmethod + def eval(cls, arg, H0=S.Half): + """ + Returns a simplified form or a value of Heaviside depending on the + argument passed by the Heaviside object. + + Explanation + =========== + + The ``eval()`` method is automatically called when the ``Heaviside`` + class is about to be instantiated and it returns either some simplified + instance or the unevaluated instance depending on the argument passed. + In other words, ``eval()`` method is not needed to be called explicitly, + it is being called and evaluated once the object is called. 
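# --- Illustrative sketch (not part of this module): the optional second
# --- argument H0 only fixes the value at the origin; __new__ stores it and
# --- pargs hides the default 1/2 again.
from sympy import Heaviside, symbols

x = symbols('x')
h = Heaviside(x)        # stored internally as Heaviside(x, 1/2)
# h.args  == (x, 1/2)
# h.pargs == (x,)
# Heaviside(0) == 1/2, Heaviside(0, 1) == 1, Heaviside(0, 0) == 0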
+ + Examples + ======== + + >>> from sympy import Heaviside, S + >>> from sympy.abc import x + + >>> Heaviside(x) + Heaviside(x) + + >>> Heaviside(19) + 1 + + >>> Heaviside(0) + 1/2 + + >>> Heaviside(0, 1) + 1 + + >>> Heaviside(-5) + 0 + + >>> Heaviside(S.NaN) + nan + + >>> Heaviside(x - 100).subs(x, 5) + 0 + + >>> Heaviside(x - 100).subs(x, 105) + 1 + + Parameters + ========== + + arg : argument passed by Heaviside object + + H0 : value of Heaviside(0) + + """ + if arg.is_extended_negative: + return S.Zero + elif arg.is_extended_positive: + return S.One + elif arg.is_zero: + return H0 + elif arg is S.NaN: + return S.NaN + elif fuzzy_not(im(arg).is_zero): + raise ValueError("Function defined only for Real Values. Complex part: %s found in %s ." % (repr(im(arg)), repr(arg)) ) + + def _eval_rewrite_as_Piecewise(self, arg, H0=None, **kwargs): + """ + Represents Heaviside in a Piecewise form. + + Examples + ======== + + >>> from sympy import Heaviside, Piecewise, Symbol, nan + >>> x = Symbol('x') + + >>> Heaviside(x).rewrite(Piecewise) + Piecewise((0, x < 0), (1/2, Eq(x, 0)), (1, True)) + + >>> Heaviside(x,nan).rewrite(Piecewise) + Piecewise((0, x < 0), (nan, Eq(x, 0)), (1, True)) + + >>> Heaviside(x - 5).rewrite(Piecewise) + Piecewise((0, x < 5), (1/2, Eq(x, 5)), (1, True)) + + >>> Heaviside(x**2 - 1).rewrite(Piecewise) + Piecewise((0, x**2 < 1), (1/2, Eq(x**2, 1)), (1, True)) + + """ + if H0 == 0: + return Piecewise((0, arg <= 0), (1, True)) + if H0 == 1: + return Piecewise((0, arg < 0), (1, True)) + return Piecewise((0, arg < 0), (H0, Eq(arg, 0)), (1, True)) + + def _eval_rewrite_as_sign(self, arg, H0=S.Half, **kwargs): + """ + Represents the Heaviside function in the form of sign function. + + Explanation + =========== + + The value of Heaviside(0) must be 1/2 for rewriting as sign to be + strictly equivalent. For easier usage, we also allow this rewriting + when Heaviside(0) is undefined. + + Examples + ======== + + >>> from sympy import Heaviside, Symbol, sign, nan + >>> x = Symbol('x', real=True) + >>> y = Symbol('y') + + >>> Heaviside(x).rewrite(sign) + sign(x)/2 + 1/2 + + >>> Heaviside(x, 0).rewrite(sign) + Piecewise((sign(x)/2 + 1/2, Ne(x, 0)), (0, True)) + + >>> Heaviside(x, nan).rewrite(sign) + Piecewise((sign(x)/2 + 1/2, Ne(x, 0)), (nan, True)) + + >>> Heaviside(x - 2).rewrite(sign) + sign(x - 2)/2 + 1/2 + + >>> Heaviside(x**2 - 2*x + 1).rewrite(sign) + sign(x**2 - 2*x + 1)/2 + 1/2 + + >>> Heaviside(y).rewrite(sign) + Heaviside(y) + + >>> Heaviside(y**2 - 2*y + 1).rewrite(sign) + Heaviside(y**2 - 2*y + 1) + + See Also + ======== + + sign + + """ + if arg.is_extended_real: + pw1 = Piecewise( + ((sign(arg) + 1)/2, Ne(arg, 0)), + (Heaviside(0, H0=H0), True)) + pw2 = Piecewise( + ((sign(arg) + 1)/2, Eq(Heaviside(0, H0=H0), S.Half)), + (pw1, True)) + return pw2 + + def _eval_rewrite_as_SingularityFunction(self, args, H0=S.Half, **kwargs): + """ + Returns the Heaviside expression written in the form of Singularity + Functions. 
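# --- Illustrative sketch (not part of this module): for a real symbol the sign
# --- rewrite above collapses to a plain expression only because the default
# --- Heaviside(0) = 1/2 agrees with (sign(0) + 1)/2; other H0 values keep a
# --- Piecewise guard at the origin.
from sympy import Heaviside, Piecewise, sign, symbols

x = symbols('x', real=True)
as_sign = Heaviside(x).rewrite(sign)          # sign(x)/2 + 1/2
as_pw   = Heaviside(x, 0).rewrite(Piecewise)  # Piecewise((0, x <= 0), (1, True))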
+ + """ + from sympy.solvers import solve + from sympy.functions.special.singularity_functions import SingularityFunction + if self == Heaviside(0): + return SingularityFunction(0, 0, 0) + free = self.free_symbols + if len(free) == 1: + x = (free.pop()) + return SingularityFunction(x, solve(args, x)[0], 0) + # TODO + # ((x - 5)**3*Heaviside(x - 5)).rewrite(SingularityFunction) should output + # SingularityFunction(x, 5, 0) instead of (x - 5)**3*SingularityFunction(x, 5, 0) + else: + # I don't know how to handle the case for Heaviside expressions + # having arguments with more than one variable. + raise TypeError(filldedent(''' + rewrite(SingularityFunction) does not + support arguments with more that one variable.''')) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/special/elliptic_integrals.py b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/special/elliptic_integrals.py new file mode 100644 index 0000000000000000000000000000000000000000..a94e343c0106891db44668ae05c33e84ecd05d0b --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/special/elliptic_integrals.py @@ -0,0 +1,445 @@ +""" Elliptic Integrals. """ + +from sympy.core import S, pi, I, Rational +from sympy.core.function import DefinedFunction, ArgumentIndexError +from sympy.core.symbol import Dummy,uniquely_named_symbol +from sympy.functions.elementary.complexes import sign +from sympy.functions.elementary.hyperbolic import atanh +from sympy.functions.elementary.miscellaneous import sqrt +from sympy.functions.elementary.trigonometric import sin, tan +from sympy.functions.special.gamma_functions import gamma +from sympy.functions.special.hyper import hyper, meijerg + +class elliptic_k(DefinedFunction): + r""" + The complete elliptic integral of the first kind, defined by + + .. math:: K(m) = F\left(\tfrac{\pi}{2}\middle| m\right) + + where $F\left(z\middle| m\right)$ is the Legendre incomplete + elliptic integral of the first kind. + + Explanation + =========== + + The function $K(m)$ is a single-valued function on the complex + plane with branch cut along the interval $(1, \infty)$. + + Note that our notation defines the incomplete elliptic integral + in terms of the parameter $m$ instead of the elliptic modulus + (eccentricity) $k$. + In this case, the parameter $m$ is defined as $m=k^2$. + + Examples + ======== + + >>> from sympy import elliptic_k, I + >>> from sympy.abc import m + >>> elliptic_k(0) + pi/2 + >>> elliptic_k(1.0 + I) + 1.50923695405127 + 0.625146415202697*I + >>> elliptic_k(m).series(n=3) + pi/2 + pi*m/8 + 9*pi*m**2/128 + O(m**3) + + See Also + ======== + + elliptic_f + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Elliptic_integrals + .. 
[2] https://functions.wolfram.com/EllipticIntegrals/EllipticK + + """ + + @classmethod + def eval(cls, m): + if m.is_zero: + return pi*S.Half + elif m is S.Half: + return 8*pi**Rational(3, 2)/gamma(Rational(-1, 4))**2 + elif m is S.One: + return S.ComplexInfinity + elif m is S.NegativeOne: + return gamma(Rational(1, 4))**2/(4*sqrt(2*pi)) + elif m in (S.Infinity, S.NegativeInfinity, I*S.Infinity, + I*S.NegativeInfinity, S.ComplexInfinity): + return S.Zero + + def fdiff(self, argindex=1): + m = self.args[0] + return (elliptic_e(m) - (1 - m)*elliptic_k(m))/(2*m*(1 - m)) + + def _eval_conjugate(self): + m = self.args[0] + if (m.is_real and (m - 1).is_positive) is False: + return self.func(m.conjugate()) + + def _eval_nseries(self, x, n, logx, cdir=0): + from sympy.simplify import hyperexpand + return hyperexpand(self.rewrite(hyper)._eval_nseries(x, n=n, logx=logx)) + + def _eval_rewrite_as_hyper(self, m, **kwargs): + return pi*S.Half*hyper((S.Half, S.Half), (S.One,), m) + + def _eval_rewrite_as_meijerg(self, m, **kwargs): + return meijerg(((S.Half, S.Half), []), ((S.Zero,), (S.Zero,)), -m)/2 + + def _eval_is_zero(self): + m = self.args[0] + if m.is_infinite: + return True + + def _eval_rewrite_as_Integral(self, *args, **kwargs): + from sympy.integrals.integrals import Integral + t = Dummy(uniquely_named_symbol('t', args).name) + m = self.args[0] + return Integral(1/sqrt(1 - m*sin(t)**2), (t, 0, pi/2)) + + +class elliptic_f(DefinedFunction): + r""" + The Legendre incomplete elliptic integral of the first + kind, defined by + + .. math:: F\left(z\middle| m\right) = + \int_0^z \frac{dt}{\sqrt{1 - m \sin^2 t}} + + Explanation + =========== + + This function reduces to a complete elliptic integral of + the first kind, $K(m)$, when $z = \pi/2$. + + Note that our notation defines the incomplete elliptic integral + in terms of the parameter $m$ instead of the elliptic modulus + (eccentricity) $k$. + In this case, the parameter $m$ is defined as $m=k^2$. + + Examples + ======== + + >>> from sympy import elliptic_f, I + >>> from sympy.abc import z, m + >>> elliptic_f(z, m).series(z) + z + z**5*(3*m**2/40 - m/30) + m*z**3/6 + O(z**6) + >>> elliptic_f(3.0 + I/2, 1.0 + I) + 2.909449841483 + 1.74720545502474*I + + See Also + ======== + + elliptic_k + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Elliptic_integrals + .. 
[2] https://functions.wolfram.com/EllipticIntegrals/EllipticF + + """ + + @classmethod + def eval(cls, z, m): + if z.is_zero: + return S.Zero + if m.is_zero: + return z + k = 2*z/pi + if k.is_integer: + return k*elliptic_k(m) + elif m in (S.Infinity, S.NegativeInfinity): + return S.Zero + elif z.could_extract_minus_sign(): + return -elliptic_f(-z, m) + + def fdiff(self, argindex=1): + z, m = self.args + fm = sqrt(1 - m*sin(z)**2) + if argindex == 1: + return 1/fm + elif argindex == 2: + return (elliptic_e(z, m)/(2*m*(1 - m)) - elliptic_f(z, m)/(2*m) - + sin(2*z)/(4*(1 - m)*fm)) + raise ArgumentIndexError(self, argindex) + + def _eval_conjugate(self): + z, m = self.args + if (m.is_real and (m - 1).is_positive) is False: + return self.func(z.conjugate(), m.conjugate()) + + def _eval_rewrite_as_Integral(self, *args, **kwargs): + from sympy.integrals.integrals import Integral + t = Dummy(uniquely_named_symbol('t', args).name) + z, m = self.args[0], self.args[1] + return Integral(1/(sqrt(1 - m*sin(t)**2)), (t, 0, z)) + + def _eval_is_zero(self): + z, m = self.args + if z.is_zero: + return True + if m.is_extended_real and m.is_infinite: + return True + + +class elliptic_e(DefinedFunction): + r""" + Called with two arguments $z$ and $m$, evaluates the + incomplete elliptic integral of the second kind, defined by + + .. math:: E\left(z\middle| m\right) = \int_0^z \sqrt{1 - m \sin^2 t} dt + + Called with a single argument $m$, evaluates the Legendre complete + elliptic integral of the second kind + + .. math:: E(m) = E\left(\tfrac{\pi}{2}\middle| m\right) + + Explanation + =========== + + The function $E(m)$ is a single-valued function on the complex + plane with branch cut along the interval $(1, \infty)$. + + Note that our notation defines the incomplete elliptic integral + in terms of the parameter $m$ instead of the elliptic modulus + (eccentricity) $k$. + In this case, the parameter $m$ is defined as $m=k^2$. + + Examples + ======== + + >>> from sympy import elliptic_e, I + >>> from sympy.abc import z, m + >>> elliptic_e(z, m).series(z) + z + z**5*(-m**2/40 + m/30) - m*z**3/6 + O(z**6) + >>> elliptic_e(m).series(n=4) + pi/2 - pi*m/8 - 3*pi*m**2/128 - 5*pi*m**3/512 + O(m**4) + >>> elliptic_e(1 + I, 2 - I/2).n() + 1.55203744279187 + 0.290764986058437*I + >>> elliptic_e(0) + pi/2 + >>> elliptic_e(2.0 - I) + 0.991052601328069 + 0.81879421395609*I + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Elliptic_integrals + .. [2] https://functions.wolfram.com/EllipticIntegrals/EllipticE2 + .. 
[3] https://functions.wolfram.com/EllipticIntegrals/EllipticE + + """ + + @classmethod + def eval(cls, m, z=None): + if z is not None: + z, m = m, z + k = 2*z/pi + if m.is_zero: + return z + if z.is_zero: + return S.Zero + elif k.is_integer: + return k*elliptic_e(m) + elif m in (S.Infinity, S.NegativeInfinity): + return S.ComplexInfinity + elif z.could_extract_minus_sign(): + return -elliptic_e(-z, m) + else: + if m.is_zero: + return pi/2 + elif m is S.One: + return S.One + elif m is S.Infinity: + return I*S.Infinity + elif m is S.NegativeInfinity: + return S.Infinity + elif m is S.ComplexInfinity: + return S.ComplexInfinity + + def fdiff(self, argindex=1): + if len(self.args) == 2: + z, m = self.args + if argindex == 1: + return sqrt(1 - m*sin(z)**2) + elif argindex == 2: + return (elliptic_e(z, m) - elliptic_f(z, m))/(2*m) + else: + m = self.args[0] + if argindex == 1: + return (elliptic_e(m) - elliptic_k(m))/(2*m) + raise ArgumentIndexError(self, argindex) + + def _eval_conjugate(self): + if len(self.args) == 2: + z, m = self.args + if (m.is_real and (m - 1).is_positive) is False: + return self.func(z.conjugate(), m.conjugate()) + else: + m = self.args[0] + if (m.is_real and (m - 1).is_positive) is False: + return self.func(m.conjugate()) + + def _eval_nseries(self, x, n, logx, cdir=0): + from sympy.simplify import hyperexpand + if len(self.args) == 1: + return hyperexpand(self.rewrite(hyper)._eval_nseries(x, n=n, logx=logx)) + return super()._eval_nseries(x, n=n, logx=logx) + + def _eval_rewrite_as_hyper(self, *args, **kwargs): + if len(args) == 1: + m = args[0] + return (pi/2)*hyper((Rational(-1, 2), S.Half), (S.One,), m) + + def _eval_rewrite_as_meijerg(self, *args, **kwargs): + if len(args) == 1: + m = args[0] + return -meijerg(((S.Half, Rational(3, 2)), []), \ + ((S.Zero,), (S.Zero,)), -m)/4 + + def _eval_rewrite_as_Integral(self, *args, **kwargs): + from sympy.integrals.integrals import Integral + z, m = (pi/2, self.args[0]) if len(self.args) == 1 else self.args + t = Dummy(uniquely_named_symbol('t', args).name) + return Integral(sqrt(1 - m*sin(t)**2), (t, 0, z)) + + +class elliptic_pi(DefinedFunction): + r""" + Called with three arguments $n$, $z$ and $m$, evaluates the + Legendre incomplete elliptic integral of the third kind, defined by + + .. math:: \Pi\left(n; z\middle| m\right) = \int_0^z \frac{dt} + {\left(1 - n \sin^2 t\right) \sqrt{1 - m \sin^2 t}} + + Called with two arguments $n$ and $m$, evaluates the complete + elliptic integral of the third kind: + + .. math:: \Pi\left(n\middle| m\right) = + \Pi\left(n; \tfrac{\pi}{2}\middle| m\right) + + Explanation + =========== + + Note that our notation defines the incomplete elliptic integral + in terms of the parameter $m$ instead of the elliptic modulus + (eccentricity) $k$. + In this case, the parameter $m$ is defined as $m=k^2$. + + Examples + ======== + + >>> from sympy import elliptic_pi, I + >>> from sympy.abc import z, n, m + >>> elliptic_pi(n, z, m).series(z, n=4) + z + z**3*(m/6 + n/3) + O(z**4) + >>> elliptic_pi(0.5 + I, 1.0 - I, 1.2) + 2.50232379629182 - 0.760939574180767*I + >>> elliptic_pi(0, 0) + pi/2 + >>> elliptic_pi(1.0 - I/3, 2.0 + I) + 3.29136443417283 + 0.32555634906645*I + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Elliptic_integrals + .. [2] https://functions.wolfram.com/EllipticIntegrals/EllipticPi3 + .. 
[3] https://functions.wolfram.com/EllipticIntegrals/EllipticPi + + """ + + @classmethod + def eval(cls, n, m, z=None): + if z is not None: + z, m = m, z + if n.is_zero: + return elliptic_f(z, m) + elif n is S.One: + return (elliptic_f(z, m) + + (sqrt(1 - m*sin(z)**2)*tan(z) - + elliptic_e(z, m))/(1 - m)) + k = 2*z/pi + if k.is_integer: + return k*elliptic_pi(n, m) + elif m.is_zero: + return atanh(sqrt(n - 1)*tan(z))/sqrt(n - 1) + elif n == m: + return (elliptic_f(z, n) - elliptic_pi(1, z, n) + + tan(z)/sqrt(1 - n*sin(z)**2)) + elif n in (S.Infinity, S.NegativeInfinity): + return S.Zero + elif m in (S.Infinity, S.NegativeInfinity): + return S.Zero + elif z.could_extract_minus_sign(): + return -elliptic_pi(n, -z, m) + if n.is_zero: + return elliptic_f(z, m) + if m.is_extended_real and m.is_infinite or \ + n.is_extended_real and n.is_infinite: + return S.Zero + else: + if n.is_zero: + return elliptic_k(m) + elif n is S.One: + return S.ComplexInfinity + elif m.is_zero: + return pi/(2*sqrt(1 - n)) + elif m == S.One: + return S.NegativeInfinity/sign(n - 1) + elif n == m: + return elliptic_e(n)/(1 - n) + elif n in (S.Infinity, S.NegativeInfinity): + return S.Zero + elif m in (S.Infinity, S.NegativeInfinity): + return S.Zero + if n.is_zero: + return elliptic_k(m) + if m.is_extended_real and m.is_infinite or \ + n.is_extended_real and n.is_infinite: + return S.Zero + + def _eval_conjugate(self): + if len(self.args) == 3: + n, z, m = self.args + if (n.is_real and (n - 1).is_positive) is False and \ + (m.is_real and (m - 1).is_positive) is False: + return self.func(n.conjugate(), z.conjugate(), m.conjugate()) + else: + n, m = self.args + return self.func(n.conjugate(), m.conjugate()) + + def fdiff(self, argindex=1): + if len(self.args) == 3: + n, z, m = self.args + fm, fn = sqrt(1 - m*sin(z)**2), 1 - n*sin(z)**2 + if argindex == 1: + return (elliptic_e(z, m) + (m - n)*elliptic_f(z, m)/n + + (n**2 - m)*elliptic_pi(n, z, m)/n - + n*fm*sin(2*z)/(2*fn))/(2*(m - n)*(n - 1)) + elif argindex == 2: + return 1/(fm*fn) + elif argindex == 3: + return (elliptic_e(z, m)/(m - 1) + + elliptic_pi(n, z, m) - + m*sin(2*z)/(2*(m - 1)*fm))/(2*(n - m)) + else: + n, m = self.args + if argindex == 1: + return (elliptic_e(m) + (m - n)*elliptic_k(m)/n + + (n**2 - m)*elliptic_pi(n, m)/n)/(2*(m - n)*(n - 1)) + elif argindex == 2: + return (elliptic_e(m)/(m - 1) + elliptic_pi(n, m))/(2*(n - m)) + raise ArgumentIndexError(self, argindex) + + def _eval_rewrite_as_Integral(self, *args, **kwargs): + from sympy.integrals.integrals import Integral + if len(self.args) == 2: + n, m, z = self.args[0], self.args[1], pi/2 + else: + n, z, m = self.args + t = Dummy(uniquely_named_symbol('t', args).name) + return Integral(1/((1 - n*sin(t)**2)*sqrt(1 - m*sin(t)**2)), (t, 0, z)) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/special/error_functions.py b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/special/error_functions.py new file mode 100644 index 0000000000000000000000000000000000000000..a778127d8b80583a892285fadbb8230f72de39f9 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/special/error_functions.py @@ -0,0 +1,2801 @@ +""" This module contains various functions that are special cases + of incomplete gamma functions. It should probably be renamed. 
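# --- Illustrative sketch (not part of either module): the eval() rules of the
# --- elliptic integrals defined above reduce the incomplete forms at z = pi/2
# --- to the complete ones, and the documented series are easy to reproduce.
from sympy import elliptic_e, elliptic_f, elliptic_k, pi, symbols

m = symbols('m')
complete_f = elliptic_f(pi/2, m)            # elliptic_k(m)
complete_e = elliptic_e(pi/2, m)            # elliptic_e(m)
k_series   = elliptic_k(m).series(m, 0, 3)  # pi/2 + pi*m/8 + 9*pi*m**2/128 + O(m**3)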
""" + +from sympy.core import EulerGamma # Must be imported from core, not core.numbers +from sympy.core.add import Add +from sympy.core.cache import cacheit +from sympy.core.function import DefinedFunction, ArgumentIndexError, expand_mul +from sympy.core.logic import fuzzy_or +from sympy.core.numbers import I, pi, Rational, Integer +from sympy.core.relational import is_eq +from sympy.core.power import Pow +from sympy.core.singleton import S +from sympy.core.symbol import Dummy, uniquely_named_symbol +from sympy.core.sympify import sympify +from sympy.functions.combinatorial.factorials import factorial, factorial2, RisingFactorial +from sympy.functions.elementary.complexes import polar_lift, re, unpolarify +from sympy.functions.elementary.integers import ceiling, floor +from sympy.functions.elementary.miscellaneous import sqrt, root +from sympy.functions.elementary.exponential import exp, log, exp_polar +from sympy.functions.elementary.hyperbolic import cosh, sinh +from sympy.functions.elementary.trigonometric import cos, sin, sinc +from sympy.functions.special.hyper import hyper, meijerg + +# TODO series expansions +# TODO see the "Note:" in Ei + +# Helper function +def real_to_real_as_real_imag(self, deep=True, **hints): + if self.args[0].is_extended_real: + if deep: + hints['complex'] = False + return (self.expand(deep, **hints), S.Zero) + else: + return (self, S.Zero) + if deep: + x, y = self.args[0].expand(deep, **hints).as_real_imag() + else: + x, y = self.args[0].as_real_imag() + re = (self.func(x + I*y) + self.func(x - I*y))/2 + im = (self.func(x + I*y) - self.func(x - I*y))/(2*I) + return (re, im) + + +############################################################################### +################################ ERROR FUNCTION ############################### +############################################################################### + + +class erf(DefinedFunction): + r""" + The Gauss error function. + + Explanation + =========== + + This function is defined as: + + .. math :: + \mathrm{erf}(x) = \frac{2}{\sqrt{\pi}} \int_0^x e^{-t^2} \mathrm{d}t. + + Examples + ======== + + >>> from sympy import I, oo, erf + >>> from sympy.abc import z + + Several special values are known: + + >>> erf(0) + 0 + >>> erf(oo) + 1 + >>> erf(-oo) + -1 + >>> erf(I*oo) + oo*I + >>> erf(-I*oo) + -oo*I + + In general one can pull out factors of -1 and $I$ from the argument: + + >>> erf(-z) + -erf(z) + + The error function obeys the mirror symmetry: + + >>> from sympy import conjugate + >>> conjugate(erf(z)) + erf(conjugate(z)) + + Differentiation with respect to $z$ is supported: + + >>> from sympy import diff + >>> diff(erf(z), z) + 2*exp(-z**2)/sqrt(pi) + + We can numerically evaluate the error function to arbitrary precision + on the whole complex plane: + + >>> erf(4).evalf(30) + 0.999999984582742099719981147840 + + >>> erf(-4*I).evalf(30) + -1296959.73071763923152794095062*I + + See Also + ======== + + erfc: Complementary error function. + erfi: Imaginary error function. + erf2: Two-argument error function. + erfinv: Inverse error function. + erfcinv: Inverse Complementary error function. + erf2inv: Inverse two-argument error function. + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Error_function + .. [2] https://dlmf.nist.gov/7 + .. [3] https://mathworld.wolfram.com/Erf.html + .. 
[4] https://functions.wolfram.com/GammaBetaErf/Erf + + """ + + unbranched = True + + def fdiff(self, argindex=1): + if argindex == 1: + return 2*exp(-self.args[0]**2)/sqrt(pi) + else: + raise ArgumentIndexError(self, argindex) + + + def inverse(self, argindex=1): + """ + Returns the inverse of this function. + + """ + return erfinv + + @classmethod + def eval(cls, arg): + if arg.is_Number: + if arg is S.NaN: + return S.NaN + elif arg is S.Infinity: + return S.One + elif arg is S.NegativeInfinity: + return S.NegativeOne + elif arg.is_zero: + return S.Zero + + if isinstance(arg, erfinv): + return arg.args[0] + + if isinstance(arg, erfcinv): + return S.One - arg.args[0] + + if arg.is_zero: + return S.Zero + + # Only happens with unevaluated erf2inv + if isinstance(arg, erf2inv) and arg.args[0].is_zero: + return arg.args[1] + + # Try to pull out factors of I + t = arg.extract_multiplicatively(I) + if t in (S.Infinity, S.NegativeInfinity): + return arg + + # Try to pull out factors of -1 + if arg.could_extract_minus_sign(): + return -cls(-arg) + + @staticmethod + @cacheit + def taylor_term(n, x, *previous_terms): + if n < 0 or n % 2 == 0: + return S.Zero + else: + x = sympify(x) + k = floor((n - 1)/S(2)) + if len(previous_terms) > 2: + return -previous_terms[-2] * x**2 * (n - 2)/(n*k) + else: + return 2*S.NegativeOne**k * x**n/(n*factorial(k)*sqrt(pi)) + + def _eval_conjugate(self): + return self.func(self.args[0].conjugate()) + + def _eval_is_real(self): + if self.args[0].is_extended_real is True: + return True + # There are cases where erf(z) becomes a real number + # even if z is a complex number + + def _eval_is_imaginary(self): + if self.args[0].is_imaginary is True: + return True + + def _eval_is_finite(self): + z = self.args[0] + return fuzzy_or([z.is_finite, z.is_extended_real]) + + def _eval_is_zero(self): + if self.args[0].is_extended_real is True: + return self.args[0].is_zero + + def _eval_is_positive(self): + if self.args[0].is_extended_real is True: + return self.args[0].is_extended_positive + + def _eval_is_negative(self): + if self.args[0].is_extended_real is True: + return self.args[0].is_extended_negative + + def _eval_rewrite_as_uppergamma(self, z, **kwargs): + from sympy.functions.special.gamma_functions import uppergamma + return sqrt(z**2)/z*(S.One - uppergamma(S.Half, z**2)/sqrt(pi)) + + def _eval_rewrite_as_fresnels(self, z, **kwargs): + arg = (S.One - I)*z/sqrt(pi) + return (S.One + I)*(fresnelc(arg) - I*fresnels(arg)) + + def _eval_rewrite_as_fresnelc(self, z, **kwargs): + arg = (S.One - I)*z/sqrt(pi) + return (S.One + I)*(fresnelc(arg) - I*fresnels(arg)) + + def _eval_rewrite_as_meijerg(self, z, **kwargs): + return z/sqrt(pi)*meijerg([S.Half], [], [0], [Rational(-1, 2)], z**2) + + def _eval_rewrite_as_hyper(self, z, **kwargs): + return 2*z/sqrt(pi)*hyper([S.Half], [3*S.Half], -z**2) + + def _eval_rewrite_as_expint(self, z, **kwargs): + return sqrt(z**2)/z - z*expint(S.Half, z**2)/sqrt(pi) + + def _eval_rewrite_as_tractable(self, z, limitvar=None, **kwargs): + from sympy.series.limits import limit + if limitvar: + lim = limit(z, limitvar, S.Infinity) + if lim is S.NegativeInfinity: + return S.NegativeOne + _erfs(-z)*exp(-z**2) + return S.One - _erfs(z)*exp(-z**2) + + def _eval_rewrite_as_erfc(self, z, **kwargs): + return S.One - erfc(z) + + def _eval_rewrite_as_erfi(self, z, **kwargs): + return -I*erfi(I*z) + + def _eval_as_leading_term(self, x, logx, cdir): + arg = self.args[0].as_leading_term(x, logx=logx, cdir=cdir) + arg0 = arg.subs(x, 0) + + if arg0 is 
S.ComplexInfinity: + arg0 = arg.limit(x, 0, dir='-' if cdir == -1 else '+') + if x in arg.free_symbols and arg0.is_zero: + return 2*arg/sqrt(pi) + else: + return self.func(arg0) + + def _eval_aseries(self, n, args0, x, logx): + from sympy.series.order import Order + point = args0[0] + + if point in [S.Infinity, S.NegativeInfinity]: + z = self.args[0] + + try: + _, ex = z.leadterm(x) + except (ValueError, NotImplementedError): + return self + + ex = -ex # as x->1/x for aseries + if ex.is_positive: + newn = ceiling(n/ex) + s = [S.NegativeOne**k * factorial2(2*k - 1) / (z**(2*k + 1) * 2**k) + for k in range(newn)] + [Order(1/z**newn, x)] + return S.One - (exp(-z**2)/sqrt(pi)) * Add(*s) + + return super(erf, self)._eval_aseries(n, args0, x, logx) + + as_real_imag = real_to_real_as_real_imag + + +class erfc(DefinedFunction): + r""" + Complementary Error Function. + + Explanation + =========== + + The function is defined as: + + .. math :: + \mathrm{erfc}(x) = \frac{2}{\sqrt{\pi}} \int_x^\infty e^{-t^2} \mathrm{d}t + + Examples + ======== + + >>> from sympy import I, oo, erfc + >>> from sympy.abc import z + + Several special values are known: + + >>> erfc(0) + 1 + >>> erfc(oo) + 0 + >>> erfc(-oo) + 2 + >>> erfc(I*oo) + -oo*I + >>> erfc(-I*oo) + oo*I + + The error function obeys the mirror symmetry: + + >>> from sympy import conjugate + >>> conjugate(erfc(z)) + erfc(conjugate(z)) + + Differentiation with respect to $z$ is supported: + + >>> from sympy import diff + >>> diff(erfc(z), z) + -2*exp(-z**2)/sqrt(pi) + + It also follows + + >>> erfc(-z) + 2 - erfc(z) + + We can numerically evaluate the complementary error function to arbitrary + precision on the whole complex plane: + + >>> erfc(4).evalf(30) + 0.0000000154172579002800188521596734869 + + >>> erfc(4*I).evalf(30) + 1.0 - 1296959.73071763923152794095062*I + + See Also + ======== + + erf: Gaussian error function. + erfi: Imaginary error function. + erf2: Two-argument error function. + erfinv: Inverse error function. + erfcinv: Inverse Complementary error function. + erf2inv: Inverse two-argument error function. + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Error_function + .. [2] https://dlmf.nist.gov/7 + .. [3] https://mathworld.wolfram.com/Erfc.html + .. [4] https://functions.wolfram.com/GammaBetaErf/Erfc + + """ + + unbranched = True + + def fdiff(self, argindex=1): + if argindex == 1: + return -2*exp(-self.args[0]**2)/sqrt(pi) + else: + raise ArgumentIndexError(self, argindex) + + def inverse(self, argindex=1): + """ + Returns the inverse of this function. 
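# --- Illustrative sketch (not part of this module): the rewrite rules above tie
# --- erf, erfc and erfi together, so erf(z) + erfc(z) collapses to 1.
from sympy import erf, erfc, erfi, simplify, symbols

z = symbols('z')
r_erfc = erf(z).rewrite(erfc)                      # 1 - erfc(z)
r_erfi = erf(z).rewrite(erfi)                      # -I*erfi(I*z)
one    = simplify(erf(z) + erfc(z).rewrite(erf))   # 1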
+ + """ + return erfcinv + + @classmethod + def eval(cls, arg): + if arg.is_Number: + if arg is S.NaN: + return S.NaN + elif arg is S.Infinity: + return S.Zero + elif arg.is_zero: + return S.One + + if isinstance(arg, erfinv): + return S.One - arg.args[0] + + if isinstance(arg, erfcinv): + return arg.args[0] + + if arg.is_zero: + return S.One + + # Try to pull out factors of I + t = arg.extract_multiplicatively(I) + if t in (S.Infinity, S.NegativeInfinity): + return -arg + + # Try to pull out factors of -1 + if arg.could_extract_minus_sign(): + return 2 - cls(-arg) + + @staticmethod + @cacheit + def taylor_term(n, x, *previous_terms): + if n == 0: + return S.One + elif n < 0 or n % 2 == 0: + return S.Zero + else: + x = sympify(x) + k = floor((n - 1)/S(2)) + if len(previous_terms) > 2: + return -previous_terms[-2] * x**2 * (n - 2)/(n*k) + else: + return -2*S.NegativeOne**k * x**n/(n*factorial(k)*sqrt(pi)) + + def _eval_conjugate(self): + return self.func(self.args[0].conjugate()) + + def _eval_is_real(self): + if self.args[0].is_extended_real is True: + return True + if self.args[0].is_imaginary is True: + return False + + def _eval_rewrite_as_tractable(self, z, limitvar=None, **kwargs): + return self.rewrite(erf).rewrite("tractable", deep=True, limitvar=limitvar) + + def _eval_rewrite_as_erf(self, z, **kwargs): + return S.One - erf(z) + + def _eval_rewrite_as_erfi(self, z, **kwargs): + return S.One + I*erfi(I*z) + + def _eval_rewrite_as_fresnels(self, z, **kwargs): + arg = (S.One - I)*z/sqrt(pi) + return S.One - (S.One + I)*(fresnelc(arg) - I*fresnels(arg)) + + def _eval_rewrite_as_fresnelc(self, z, **kwargs): + arg = (S.One-I)*z/sqrt(pi) + return S.One - (S.One + I)*(fresnelc(arg) - I*fresnels(arg)) + + def _eval_rewrite_as_meijerg(self, z, **kwargs): + return S.One - z/sqrt(pi)*meijerg([S.Half], [], [0], [Rational(-1, 2)], z**2) + + def _eval_rewrite_as_hyper(self, z, **kwargs): + return S.One - 2*z/sqrt(pi)*hyper([S.Half], [3*S.Half], -z**2) + + def _eval_rewrite_as_uppergamma(self, z, **kwargs): + from sympy.functions.special.gamma_functions import uppergamma + return S.One - sqrt(z**2)/z*(S.One - uppergamma(S.Half, z**2)/sqrt(pi)) + + def _eval_rewrite_as_expint(self, z, **kwargs): + return S.One - sqrt(z**2)/z + z*expint(S.Half, z**2)/sqrt(pi) + + def _eval_expand_func(self, **hints): + return self.rewrite(erf) + + def _eval_as_leading_term(self, x, logx, cdir): + arg = self.args[0].as_leading_term(x, logx=logx, cdir=cdir) + arg0 = arg.subs(x, 0) + + if arg0 is S.ComplexInfinity: + arg0 = arg.limit(x, 0, dir='-' if cdir == -1 else '+') + if arg0.is_zero: + return S.One + else: + return self.func(arg0) + + as_real_imag = real_to_real_as_real_imag + + def _eval_aseries(self, n, args0, x, logx): + return S.One - erf(*self.args)._eval_aseries(n, args0, x, logx) + + +class erfi(DefinedFunction): + r""" + Imaginary error function. + + Explanation + =========== + + The function erfi is defined as: + + .. 
math :: + \mathrm{erfi}(x) = \frac{2}{\sqrt{\pi}} \int_0^x e^{t^2} \mathrm{d}t + + Examples + ======== + + >>> from sympy import I, oo, erfi + >>> from sympy.abc import z + + Several special values are known: + + >>> erfi(0) + 0 + >>> erfi(oo) + oo + >>> erfi(-oo) + -oo + >>> erfi(I*oo) + I + >>> erfi(-I*oo) + -I + + In general one can pull out factors of -1 and $I$ from the argument: + + >>> erfi(-z) + -erfi(z) + + >>> from sympy import conjugate + >>> conjugate(erfi(z)) + erfi(conjugate(z)) + + Differentiation with respect to $z$ is supported: + + >>> from sympy import diff + >>> diff(erfi(z), z) + 2*exp(z**2)/sqrt(pi) + + We can numerically evaluate the imaginary error function to arbitrary + precision on the whole complex plane: + + >>> erfi(2).evalf(30) + 18.5648024145755525987042919132 + + >>> erfi(-2*I).evalf(30) + -0.995322265018952734162069256367*I + + See Also + ======== + + erf: Gaussian error function. + erfc: Complementary error function. + erf2: Two-argument error function. + erfinv: Inverse error function. + erfcinv: Inverse Complementary error function. + erf2inv: Inverse two-argument error function. + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Error_function + .. [2] https://mathworld.wolfram.com/Erfi.html + .. [3] https://functions.wolfram.com/GammaBetaErf/Erfi + + """ + + unbranched = True + + def fdiff(self, argindex=1): + if argindex == 1: + return 2*exp(self.args[0]**2)/sqrt(pi) + else: + raise ArgumentIndexError(self, argindex) + + @classmethod + def eval(cls, z): + if z.is_Number: + if z is S.NaN: + return S.NaN + elif z.is_zero: + return S.Zero + elif z is S.Infinity: + return S.Infinity + + if z.is_zero: + return S.Zero + + # Try to pull out factors of -1 + if z.could_extract_minus_sign(): + return -cls(-z) + + # Try to pull out factors of I + nz = z.extract_multiplicatively(I) + if nz is not None: + if nz is S.Infinity: + return I + if isinstance(nz, erfinv): + return I*nz.args[0] + if isinstance(nz, erfcinv): + return I*(S.One - nz.args[0]) + # Only happens with unevaluated erf2inv + if isinstance(nz, erf2inv) and nz.args[0].is_zero: + return I*nz.args[1] + + @staticmethod + @cacheit + def taylor_term(n, x, *previous_terms): + if n < 0 or n % 2 == 0: + return S.Zero + else: + x = sympify(x) + k = floor((n - 1)/S(2)) + if len(previous_terms) > 2: + return previous_terms[-2] * x**2 * (n - 2)/(n*k) + else: + return 2 * x**n/(n*factorial(k)*sqrt(pi)) + + def _eval_conjugate(self): + return self.func(self.args[0].conjugate()) + + def _eval_is_extended_real(self): + return self.args[0].is_extended_real + + def _eval_is_zero(self): + return self.args[0].is_zero + + def _eval_rewrite_as_tractable(self, z, limitvar=None, **kwargs): + return self.rewrite(erf).rewrite("tractable", deep=True, limitvar=limitvar) + + def _eval_rewrite_as_erf(self, z, **kwargs): + return -I*erf(I*z) + + def _eval_rewrite_as_erfc(self, z, **kwargs): + return I*erfc(I*z) - I + + def _eval_rewrite_as_fresnels(self, z, **kwargs): + arg = (S.One + I)*z/sqrt(pi) + return (S.One - I)*(fresnelc(arg) - I*fresnels(arg)) + + def _eval_rewrite_as_fresnelc(self, z, **kwargs): + arg = (S.One + I)*z/sqrt(pi) + return (S.One - I)*(fresnelc(arg) - I*fresnels(arg)) + + def _eval_rewrite_as_meijerg(self, z, **kwargs): + return z/sqrt(pi)*meijerg([S.Half], [], [0], [Rational(-1, 2)], -z**2) + + def _eval_rewrite_as_hyper(self, z, **kwargs): + return 2*z/sqrt(pi)*hyper([S.Half], [3*S.Half], z**2) + + def _eval_rewrite_as_uppergamma(self, z, **kwargs): + from 
sympy.functions.special.gamma_functions import uppergamma + return sqrt(-z**2)/z*(uppergamma(S.Half, -z**2)/sqrt(pi) - S.One) + + def _eval_rewrite_as_expint(self, z, **kwargs): + return sqrt(-z**2)/z - z*expint(S.Half, -z**2)/sqrt(pi) + + def _eval_expand_func(self, **hints): + return self.rewrite(erf) + + as_real_imag = real_to_real_as_real_imag + + def _eval_as_leading_term(self, x, logx, cdir): + arg = self.args[0].as_leading_term(x, logx=logx, cdir=cdir) + arg0 = arg.subs(x, 0) + + if x in arg.free_symbols and arg0.is_zero: + return 2*arg/sqrt(pi) + elif arg0.is_finite: + return self.func(arg0) + return self.func(arg) + + def _eval_aseries(self, n, args0, x, logx): + from sympy.series.order import Order + point = args0[0] + + if point is S.Infinity: + z = self.args[0] + s = [factorial2(2*k - 1) / (2**k * z**(2*k + 1)) + for k in range(n)] + [Order(1/z**n, x)] + return -I + (exp(z**2)/sqrt(pi)) * Add(*s) + + return super(erfi, self)._eval_aseries(n, args0, x, logx) + + +class erf2(DefinedFunction): + r""" + Two-argument error function. + + Explanation + =========== + + This function is defined as: + + .. math :: + \mathrm{erf2}(x, y) = \frac{2}{\sqrt{\pi}} \int_x^y e^{-t^2} \mathrm{d}t + + Examples + ======== + + >>> from sympy import oo, erf2 + >>> from sympy.abc import x, y + + Several special values are known: + + >>> erf2(0, 0) + 0 + >>> erf2(x, x) + 0 + >>> erf2(x, oo) + 1 - erf(x) + >>> erf2(x, -oo) + -erf(x) - 1 + >>> erf2(oo, y) + erf(y) - 1 + >>> erf2(-oo, y) + erf(y) + 1 + + In general one can pull out factors of -1: + + >>> erf2(-x, -y) + -erf2(x, y) + + The error function obeys the mirror symmetry: + + >>> from sympy import conjugate + >>> conjugate(erf2(x, y)) + erf2(conjugate(x), conjugate(y)) + + Differentiation with respect to $x$, $y$ is supported: + + >>> from sympy import diff + >>> diff(erf2(x, y), x) + -2*exp(-x**2)/sqrt(pi) + >>> diff(erf2(x, y), y) + 2*exp(-y**2)/sqrt(pi) + + See Also + ======== + + erf: Gaussian error function. + erfc: Complementary error function. + erfi: Imaginary error function. + erfinv: Inverse error function. + erfcinv: Inverse Complementary error function. + erf2inv: Inverse two-argument error function. + + References + ========== + + .. 
[1] https://functions.wolfram.com/GammaBetaErf/Erf2/ + + """ + + + def fdiff(self, argindex): + x, y = self.args + if argindex == 1: + return -2*exp(-x**2)/sqrt(pi) + elif argindex == 2: + return 2*exp(-y**2)/sqrt(pi) + else: + raise ArgumentIndexError(self, argindex) + + @classmethod + def eval(cls, x, y): + chk = (S.Infinity, S.NegativeInfinity, S.Zero) + if x is S.NaN or y is S.NaN: + return S.NaN + elif x == y: + return S.Zero + elif x in chk or y in chk: + return erf(y) - erf(x) + + if isinstance(y, erf2inv) and y.args[0] == x: + return y.args[1] + + if x.is_zero or y.is_zero or x.is_extended_real and x.is_infinite or \ + y.is_extended_real and y.is_infinite: + return erf(y) - erf(x) + + #Try to pull out -1 factor + sign_x = x.could_extract_minus_sign() + sign_y = y.could_extract_minus_sign() + if (sign_x and sign_y): + return -cls(-x, -y) + elif (sign_x or sign_y): + return erf(y)-erf(x) + + def _eval_conjugate(self): + return self.func(self.args[0].conjugate(), self.args[1].conjugate()) + + def _eval_is_extended_real(self): + return self.args[0].is_extended_real and self.args[1].is_extended_real + + def _eval_rewrite_as_erf(self, x, y, **kwargs): + return erf(y) - erf(x) + + def _eval_rewrite_as_erfc(self, x, y, **kwargs): + return erfc(x) - erfc(y) + + def _eval_rewrite_as_erfi(self, x, y, **kwargs): + return I*(erfi(I*x)-erfi(I*y)) + + def _eval_rewrite_as_fresnels(self, x, y, **kwargs): + return erf(y).rewrite(fresnels) - erf(x).rewrite(fresnels) + + def _eval_rewrite_as_fresnelc(self, x, y, **kwargs): + return erf(y).rewrite(fresnelc) - erf(x).rewrite(fresnelc) + + def _eval_rewrite_as_meijerg(self, x, y, **kwargs): + return erf(y).rewrite(meijerg) - erf(x).rewrite(meijerg) + + def _eval_rewrite_as_hyper(self, x, y, **kwargs): + return erf(y).rewrite(hyper) - erf(x).rewrite(hyper) + + def _eval_rewrite_as_uppergamma(self, x, y, **kwargs): + from sympy.functions.special.gamma_functions import uppergamma + return (sqrt(y**2)/y*(S.One - uppergamma(S.Half, y**2)/sqrt(pi)) - + sqrt(x**2)/x*(S.One - uppergamma(S.Half, x**2)/sqrt(pi))) + + def _eval_rewrite_as_expint(self, x, y, **kwargs): + return erf(y).rewrite(expint) - erf(x).rewrite(expint) + + def _eval_expand_func(self, **hints): + return self.rewrite(erf) + + def _eval_is_zero(self): + return is_eq(*self.args) + +class erfinv(DefinedFunction): + r""" + Inverse Error Function. The erfinv function is defined as: + + .. math :: + \mathrm{erf}(x) = y \quad \Rightarrow \quad \mathrm{erfinv}(y) = x + + Examples + ======== + + >>> from sympy import erfinv + >>> from sympy.abc import x + + Several special values are known: + + >>> erfinv(0) + 0 + >>> erfinv(1) + oo + + Differentiation with respect to $x$ is supported: + + >>> from sympy import diff + >>> diff(erfinv(x), x) + sqrt(pi)*exp(erfinv(x)**2)/2 + + We can numerically evaluate the inverse error function to arbitrary + precision on [-1, 1]: + + >>> erfinv(0.2).evalf(30) + 0.179143454621291692285822705344 + + See Also + ======== + + erf: Gaussian error function. + erfc: Complementary error function. + erfi: Imaginary error function. + erf2: Two-argument error function. + erfcinv: Inverse Complementary error function. + erf2inv: Inverse two-argument error function. + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Error_function#Inverse_functions + .. 
[2] https://functions.wolfram.com/GammaBetaErf/InverseErf/ + + """ + + + def fdiff(self, argindex =1): + if argindex == 1: + return sqrt(pi)*exp(self.func(self.args[0])**2)*S.Half + else : + raise ArgumentIndexError(self, argindex) + + def inverse(self, argindex=1): + """ + Returns the inverse of this function. + + """ + return erf + + @classmethod + def eval(cls, z): + if z is S.NaN: + return S.NaN + elif z is S.NegativeOne: + return S.NegativeInfinity + elif z.is_zero: + return S.Zero + elif z is S.One: + return S.Infinity + + if isinstance(z, erf) and z.args[0].is_extended_real: + return z.args[0] + + if z.is_zero: + return S.Zero + + # Try to pull out factors of -1 + nz = z.extract_multiplicatively(-1) + if nz is not None and (isinstance(nz, erf) and (nz.args[0]).is_extended_real): + return -nz.args[0] + + def _eval_rewrite_as_erfcinv(self, z, **kwargs): + return erfcinv(1-z) + + def _eval_is_zero(self): + return self.args[0].is_zero + + +class erfcinv (DefinedFunction): + r""" + Inverse Complementary Error Function. The erfcinv function is defined as: + + .. math :: + \mathrm{erfc}(x) = y \quad \Rightarrow \quad \mathrm{erfcinv}(y) = x + + Examples + ======== + + >>> from sympy import erfcinv + >>> from sympy.abc import x + + Several special values are known: + + >>> erfcinv(1) + 0 + >>> erfcinv(0) + oo + + Differentiation with respect to $x$ is supported: + + >>> from sympy import diff + >>> diff(erfcinv(x), x) + -sqrt(pi)*exp(erfcinv(x)**2)/2 + + See Also + ======== + + erf: Gaussian error function. + erfc: Complementary error function. + erfi: Imaginary error function. + erf2: Two-argument error function. + erfinv: Inverse error function. + erf2inv: Inverse two-argument error function. + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Error_function#Inverse_functions + .. [2] https://functions.wolfram.com/GammaBetaErf/InverseErfc/ + + """ + + + def fdiff(self, argindex =1): + if argindex == 1: + return -sqrt(pi)*exp(self.func(self.args[0])**2)*S.Half + else: + raise ArgumentIndexError(self, argindex) + + def inverse(self, argindex=1): + """ + Returns the inverse of this function. + + """ + return erfc + + @classmethod + def eval(cls, z): + if z is S.NaN: + return S.NaN + elif z.is_zero: + return S.Infinity + elif z is S.One: + return S.Zero + elif z == 2: + return S.NegativeInfinity + + if z.is_zero: + return S.Infinity + + def _eval_rewrite_as_erfinv(self, z, **kwargs): + return erfinv(1-z) + + def _eval_is_zero(self): + return (self.args[0] - 1).is_zero + + def _eval_is_infinite(self): + z = self.args[0] + return fuzzy_or([z.is_zero, is_eq(z, Integer(2))]) + + +class erf2inv(DefinedFunction): + r""" + Two-argument Inverse error function. The erf2inv function is defined as: + + .. math :: + \mathrm{erf2}(x, w) = y \quad \Rightarrow \quad \mathrm{erf2inv}(x, y) = w + + Examples + ======== + + >>> from sympy import erf2inv, oo + >>> from sympy.abc import x, y + + Several special values are known: + + >>> erf2inv(0, 0) + 0 + >>> erf2inv(1, 0) + 1 + >>> erf2inv(0, 1) + oo + >>> erf2inv(0, y) + erfinv(y) + >>> erf2inv(oo, y) + erfcinv(-y) + + Differentiation with respect to $x$ and $y$ is supported: + + >>> from sympy import diff + >>> diff(erf2inv(x, y), x) + exp(-x**2 + erf2inv(x, y)**2) + >>> diff(erf2inv(x, y), y) + sqrt(pi)*exp(erf2inv(x, y)**2)/2 + + See Also + ======== + + erf: Gaussian error function. + erfc: Complementary error function. + erfi: Imaginary error function. + erf2: Two-argument error function. + erfinv: Inverse error function. 
+ erfcinv: Inverse complementary error function. + + References + ========== + + .. [1] https://functions.wolfram.com/GammaBetaErf/InverseErf2/ + + """ + + + def fdiff(self, argindex): + x, y = self.args + if argindex == 1: + return exp(self.func(x,y)**2-x**2) + elif argindex == 2: + return sqrt(pi)*S.Half*exp(self.func(x,y)**2) + else: + raise ArgumentIndexError(self, argindex) + + @classmethod + def eval(cls, x, y): + if x is S.NaN or y is S.NaN: + return S.NaN + elif x.is_zero and y.is_zero: + return S.Zero + elif x.is_zero and y is S.One: + return S.Infinity + elif x is S.One and y.is_zero: + return S.One + elif x.is_zero: + return erfinv(y) + elif x is S.Infinity: + return erfcinv(-y) + elif y.is_zero: + return x + elif y is S.Infinity: + return erfinv(x) + + if x.is_zero: + if y.is_zero: + return S.Zero + else: + return erfinv(y) + if y.is_zero: + return x + + def _eval_is_zero(self): + x, y = self.args + if x.is_zero and y.is_zero: + return True + +############################################################################### +#################### EXPONENTIAL INTEGRALS #################################### +############################################################################### + +class Ei(DefinedFunction): + r""" + The classical exponential integral. + + Explanation + =========== + + For use in SymPy, this function is defined as + + .. math:: \operatorname{Ei}(x) = \sum_{n=1}^\infty \frac{x^n}{n\, n!} + + \log(x) + \gamma, + + where $\gamma$ is the Euler-Mascheroni constant. + + If $x$ is a polar number, this defines an analytic function on the + Riemann surface of the logarithm. Otherwise this defines an analytic + function in the cut plane $\mathbb{C} \setminus (-\infty, 0]$. + + **Background** + + The name exponential integral comes from the following statement: + + .. math:: \operatorname{Ei}(x) = \int_{-\infty}^x \frac{e^t}{t} \mathrm{d}t + + If the integral is interpreted as a Cauchy principal value, this statement + holds for $x > 0$ and $\operatorname{Ei}(x)$ as defined above. + + Examples + ======== + + >>> from sympy import Ei, polar_lift, exp_polar, I, pi + >>> from sympy.abc import x + + >>> Ei(-1) + Ei(-1) + + This yields a real value: + + >>> Ei(-1).n(chop=True) + -0.219383934395520 + + On the other hand the analytic continuation is not real: + + >>> Ei(polar_lift(-1)).n(chop=True) + -0.21938393439552 + 3.14159265358979*I + + The exponential integral has a logarithmic branch point at the origin: + + >>> Ei(x*exp_polar(2*I*pi)) + Ei(x) + 2*I*pi + + Differentiation is supported: + + >>> Ei(x).diff(x) + exp(x)/x + + The exponential integral is related to many other special functions. + For example: + + >>> from sympy import expint, Shi + >>> Ei(x).rewrite(expint) + -expint(1, x*exp_polar(I*pi)) - I*pi + >>> Ei(x).rewrite(Shi) + Chi(x) + Shi(x) + + See Also + ======== + + expint: Generalised exponential integral. + E1: Special case of the generalised exponential integral. + li: Logarithmic integral. + Li: Offset logarithmic integral. + Si: Sine integral. + Ci: Cosine integral. + Shi: Hyperbolic sine integral. + Chi: Hyperbolic cosine integral. + uppergamma: Upper incomplete gamma function. + + References + ========== + + .. [1] https://dlmf.nist.gov/6.6 + .. [2] https://en.wikipedia.org/wiki/Exponential_integral + .. 
[3] Abramowitz & Stegun, section 5: https://web.archive.org/web/20201128173312/http://people.math.sfu.ca/~cbm/aands/page_228.htm + + """ + + + @classmethod + def eval(cls, z): + if z.is_zero: + return S.NegativeInfinity + elif z is S.Infinity: + return S.Infinity + elif z is S.NegativeInfinity: + return S.Zero + + if z.is_zero: + return S.NegativeInfinity + + nz, n = z.extract_branch_factor() + if n: + return Ei(nz) + 2*I*pi*n + + def fdiff(self, argindex=1): + arg = unpolarify(self.args[0]) + if argindex == 1: + return exp(arg)/arg + else: + raise ArgumentIndexError(self, argindex) + + def _eval_evalf(self, prec): + if (self.args[0]/polar_lift(-1)).is_positive: + return super()._eval_evalf(prec) + (I*pi)._eval_evalf(prec) + return super()._eval_evalf(prec) + + def _eval_rewrite_as_uppergamma(self, z, **kwargs): + from sympy.functions.special.gamma_functions import uppergamma + # XXX this does not currently work usefully because uppergamma + # immediately turns into expint + return -uppergamma(0, polar_lift(-1)*z) - I*pi + + def _eval_rewrite_as_expint(self, z, **kwargs): + return -expint(1, polar_lift(-1)*z) - I*pi + + def _eval_rewrite_as_li(self, z, **kwargs): + if isinstance(z, log): + return li(z.args[0]) + # TODO: + # Actually it only holds that: + # Ei(z) = li(exp(z)) + # for -pi < imag(z) <= pi + return li(exp(z)) + + def _eval_rewrite_as_Si(self, z, **kwargs): + if z.is_negative: + return Shi(z) + Chi(z) - I*pi + else: + return Shi(z) + Chi(z) + _eval_rewrite_as_Ci = _eval_rewrite_as_Si + _eval_rewrite_as_Chi = _eval_rewrite_as_Si + _eval_rewrite_as_Shi = _eval_rewrite_as_Si + + def _eval_rewrite_as_tractable(self, z, limitvar=None, **kwargs): + return exp(z) * _eis(z) + + def _eval_rewrite_as_Integral(self, z, **kwargs): + from sympy.integrals.integrals import Integral + t = Dummy(uniquely_named_symbol('t', [z]).name) + return Integral(S.Exp1**t/t, (t, S.NegativeInfinity, z)) + + def _eval_as_leading_term(self, x, logx, cdir): + from sympy import re + x0 = self.args[0].limit(x, 0) + arg = self.args[0].as_leading_term(x, cdir=cdir) + cdir = arg.dir(x, cdir) + if x0.is_zero: + c, e = arg.as_coeff_exponent(x) + logx = log(x) if logx is None else logx + return log(c) + e*logx + EulerGamma - ( + I*pi if re(cdir).is_negative else S.Zero) + return super()._eval_as_leading_term(x, logx=logx, cdir=cdir) + + def _eval_nseries(self, x, n, logx, cdir=0): + x0 = self.args[0].limit(x, 0) + if x0.is_zero: + f = self._eval_rewrite_as_Si(*self.args) + return f._eval_nseries(x, n, logx) + return super()._eval_nseries(x, n, logx) + + def _eval_aseries(self, n, args0, x, logx): + from sympy.series.order import Order + point = args0[0] + + if point in (S.Infinity, S.NegativeInfinity): + z = self.args[0] + s = [factorial(k) / (z)**k for k in range(n)] + \ + [Order(1/z**n, x)] + return (exp(z)/z) * Add(*s) + + return super(Ei, self)._eval_aseries(n, args0, x, logx) + + +class expint(DefinedFunction): + r""" + Generalized exponential integral. + + Explanation + =========== + + This function is defined as + + .. math:: \operatorname{E}_\nu(z) = z^{\nu - 1} \Gamma(1 - \nu, z), + + where $\Gamma(1 - \nu, z)$ is the upper incomplete gamma function + (``uppergamma``). + + Hence for $z$ with positive real part we have + + .. math:: \operatorname{E}_\nu(z) + = \int_1^\infty \frac{e^{-zt}}{t^\nu} \mathrm{d}t, + + which explains the name. + + The representation as an incomplete gamma function provides an analytic + continuation for $\operatorname{E}_\nu(z)$. 
If $\nu$ is a + non-positive integer, the exponential integral is thus an unbranched + function of $z$, otherwise there is a branch point at the origin. + Refer to the incomplete gamma function documentation for details of the + branching behavior. + + Examples + ======== + + >>> from sympy import expint, S + >>> from sympy.abc import nu, z + + Differentiation is supported. Differentiation with respect to $z$ further + explains the name: for integral orders, the exponential integral is an + iterated integral of the exponential function. + + >>> expint(nu, z).diff(z) + -expint(nu - 1, z) + + Differentiation with respect to $\nu$ has no classical expression: + + >>> expint(nu, z).diff(nu) + -z**(nu - 1)*meijerg(((), (1, 1)), ((0, 0, 1 - nu), ()), z) + + At non-postive integer orders, the exponential integral reduces to the + exponential function: + + >>> expint(0, z) + exp(-z)/z + >>> expint(-1, z) + exp(-z)/z + exp(-z)/z**2 + + At half-integers it reduces to error functions: + + >>> expint(S(1)/2, z) + sqrt(pi)*erfc(sqrt(z))/sqrt(z) + + At positive integer orders it can be rewritten in terms of exponentials + and ``expint(1, z)``. Use ``expand_func()`` to do this: + + >>> from sympy import expand_func + >>> expand_func(expint(5, z)) + z**4*expint(1, z)/24 + (-z**3 + z**2 - 2*z + 6)*exp(-z)/24 + + The generalised exponential integral is essentially equivalent to the + incomplete gamma function: + + >>> from sympy import uppergamma + >>> expint(nu, z).rewrite(uppergamma) + z**(nu - 1)*uppergamma(1 - nu, z) + + As such it is branched at the origin: + + >>> from sympy import exp_polar, pi, I + >>> expint(4, z*exp_polar(2*pi*I)) + I*pi*z**3/3 + expint(4, z) + >>> expint(nu, z*exp_polar(2*pi*I)) + z**(nu - 1)*(exp(2*I*pi*nu) - 1)*gamma(1 - nu) + expint(nu, z) + + See Also + ======== + + Ei: Another related function called exponential integral. + E1: The classical case, returns expint(1, z). + li: Logarithmic integral. + Li: Offset logarithmic integral. + Si: Sine integral. + Ci: Cosine integral. + Shi: Hyperbolic sine integral. + Chi: Hyperbolic cosine integral. + uppergamma + + References + ========== + + .. [1] https://dlmf.nist.gov/8.19 + .. [2] https://functions.wolfram.com/GammaBetaErf/ExpIntegralE/ + .. [3] https://en.wikipedia.org/wiki/Exponential_integral + + """ + + + @classmethod + def eval(cls, nu, z): + from sympy.functions.special.gamma_functions import (gamma, uppergamma) + nu2 = unpolarify(nu) + if nu != nu2: + return expint(nu2, z) + if nu.is_Integer and nu <= 0 or (not nu.is_Integer and (2*nu).is_Integer): + return unpolarify(expand_mul(z**(nu - 1)*uppergamma(1 - nu, z))) + + # Extract branching information. This can be deduced from what is + # explained in lowergamma.eval(). 
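# --- Illustrative sketch (not part of this module): the integer and
# --- half-integer reductions promised in the docstring come from the
# --- uppergamma branch of eval() above.
from sympy import S, expint, symbols

z = symbols('z')
order_zero = expint(0, z)         # exp(-z)/z
order_half = expint(S(1)/2, z)    # sqrt(pi)*erfc(sqrt(z))/sqrt(z)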
+ z, n = z.extract_branch_factor() + if n is S.Zero: + return + if nu.is_integer: + if not nu > 0: + return + return expint(nu, z) \ + - 2*pi*I*n*S.NegativeOne**(nu - 1)/factorial(nu - 1)*unpolarify(z)**(nu - 1) + else: + return (exp(2*I*pi*nu*n) - 1)*z**(nu - 1)*gamma(1 - nu) + expint(nu, z) + + def fdiff(self, argindex): + nu, z = self.args + if argindex == 1: + return -z**(nu - 1)*meijerg([], [1, 1], [0, 0, 1 - nu], [], z) + elif argindex == 2: + return -expint(nu - 1, z) + else: + raise ArgumentIndexError(self, argindex) + + def _eval_rewrite_as_uppergamma(self, nu, z, **kwargs): + from sympy.functions.special.gamma_functions import uppergamma + return z**(nu - 1)*uppergamma(1 - nu, z) + + def _eval_rewrite_as_Ei(self, nu, z, **kwargs): + if nu == 1: + return -Ei(z*exp_polar(-I*pi)) - I*pi + elif nu.is_Integer and nu > 1: + # DLMF, 8.19.7 + x = -unpolarify(z) + return x**(nu - 1)/factorial(nu - 1)*E1(z).rewrite(Ei) + \ + exp(x)/factorial(nu - 1) * \ + Add(*[factorial(nu - k - 2)*x**k for k in range(nu - 1)]) + else: + return self + + def _eval_expand_func(self, **hints): + return self.rewrite(Ei).rewrite(expint, **hints) + + def _eval_rewrite_as_Si(self, nu, z, **kwargs): + if nu != 1: + return self + return Shi(z) - Chi(z) + _eval_rewrite_as_Ci = _eval_rewrite_as_Si + _eval_rewrite_as_Chi = _eval_rewrite_as_Si + _eval_rewrite_as_Shi = _eval_rewrite_as_Si + + def _eval_nseries(self, x, n, logx, cdir=0): + if not self.args[0].has(x): + nu = self.args[0] + if nu == 1: + f = self._eval_rewrite_as_Si(*self.args) + return f._eval_nseries(x, n, logx) + elif nu.is_Integer and nu > 1: + f = self._eval_rewrite_as_Ei(*self.args) + return f._eval_nseries(x, n, logx) + return super()._eval_nseries(x, n, logx) + + def _eval_aseries(self, n, args0, x, logx): + from sympy.series.order import Order + point = args0[1] + nu = self.args[0] + + if point is S.Infinity: + z = self.args[1] + s = [S.NegativeOne**k * RisingFactorial(nu, k) / z**k for k in range(n)] + [Order(1/z**n, x)] + return (exp(-z)/z) * Add(*s) + + return super(expint, self)._eval_aseries(n, args0, x, logx) + + def _eval_rewrite_as_Integral(self, *args, **kwargs): + from sympy.integrals.integrals import Integral + n, x = self.args + t = Dummy(uniquely_named_symbol('t', args).name) + return Integral(t**-n * exp(-t*x), (t, 1, S.Infinity)) + + +def E1(z): + """ + Classical case of the generalized exponential integral. + + Explanation + =========== + + This is equivalent to ``expint(1, z)``. + + Examples + ======== + + >>> from sympy import E1 + >>> E1(0) + expint(1, 0) + + >>> E1(5) + expint(1, 5) + + See Also + ======== + + Ei: Exponential integral. + expint: Generalised exponential integral. + li: Logarithmic integral. + Li: Offset logarithmic integral. + Si: Sine integral. + Ci: Cosine integral. + Shi: Hyperbolic sine integral. + Chi: Hyperbolic cosine integral. + + """ + return expint(1, z) + + +class li(DefinedFunction): + r""" + The classical logarithmic integral. + + Explanation + =========== + + For use in SymPy, this function is defined as + + .. math:: \operatorname{li}(x) = \int_0^x \frac{1}{\log(t)} \mathrm{d}t \,. 
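# --- Illustrative sketch (not part of this module): the derivative and the Ei
# --- rewrite follow directly from the integral definition above, and the
# --- offset integral Li (defined later in this file) differs from li by li(2).
from sympy import Ei, Li, diff, li, symbols

z = symbols('z')
d_li   = diff(li(z), z)        # 1/log(z)
via_ei = li(z).rewrite(Ei)     # Ei(log(z))
offset = Li(z).rewrite(li)     # li(z) - li(2)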
+ + Examples + ======== + + >>> from sympy import I, oo, li + >>> from sympy.abc import z + + Several special values are known: + + >>> li(0) + 0 + >>> li(1) + -oo + >>> li(oo) + oo + + Differentiation with respect to $z$ is supported: + + >>> from sympy import diff + >>> diff(li(z), z) + 1/log(z) + + Defining the ``li`` function via an integral: + >>> from sympy import integrate + >>> integrate(li(z)) + z*li(z) - Ei(2*log(z)) + + >>> integrate(li(z),z) + z*li(z) - Ei(2*log(z)) + + + The logarithmic integral can also be defined in terms of ``Ei``: + + >>> from sympy import Ei + >>> li(z).rewrite(Ei) + Ei(log(z)) + >>> diff(li(z).rewrite(Ei), z) + 1/log(z) + + We can numerically evaluate the logarithmic integral to arbitrary precision + on the whole complex plane (except the singular points): + + >>> li(2).evalf(30) + 1.04516378011749278484458888919 + + >>> li(2*I).evalf(30) + 1.0652795784357498247001125598 + 3.08346052231061726610939702133*I + + We can even compute Soldner's constant by the help of mpmath: + + >>> from mpmath import findroot + >>> findroot(li, 2) + 1.45136923488338 + + Further transformations include rewriting ``li`` in terms of + the trigonometric integrals ``Si``, ``Ci``, ``Shi`` and ``Chi``: + + >>> from sympy import Si, Ci, Shi, Chi + >>> li(z).rewrite(Si) + -log(I*log(z)) - log(1/log(z))/2 + log(log(z))/2 + Ci(I*log(z)) + Shi(log(z)) + >>> li(z).rewrite(Ci) + -log(I*log(z)) - log(1/log(z))/2 + log(log(z))/2 + Ci(I*log(z)) + Shi(log(z)) + >>> li(z).rewrite(Shi) + -log(1/log(z))/2 + log(log(z))/2 + Chi(log(z)) - Shi(log(z)) + >>> li(z).rewrite(Chi) + -log(1/log(z))/2 + log(log(z))/2 + Chi(log(z)) - Shi(log(z)) + + See Also + ======== + + Li: Offset logarithmic integral. + Ei: Exponential integral. + expint: Generalised exponential integral. + E1: Special case of the generalised exponential integral. + Si: Sine integral. + Ci: Cosine integral. + Shi: Hyperbolic sine integral. + Chi: Hyperbolic cosine integral. + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Logarithmic_integral + .. [2] https://mathworld.wolfram.com/LogarithmicIntegral.html + .. [3] https://dlmf.nist.gov/6 + .. 
[4] https://mathworld.wolfram.com/SoldnersConstant.html + + """ + + + @classmethod + def eval(cls, z): + if z.is_zero: + return S.Zero + elif z is S.One: + return S.NegativeInfinity + elif z is S.Infinity: + return S.Infinity + if z.is_zero: + return S.Zero + + def fdiff(self, argindex=1): + arg = self.args[0] + if argindex == 1: + return S.One / log(arg) + else: + raise ArgumentIndexError(self, argindex) + + def _eval_conjugate(self): + z = self.args[0] + # Exclude values on the branch cut (-oo, 0) + if not z.is_extended_negative: + return self.func(z.conjugate()) + + def _eval_rewrite_as_Li(self, z, **kwargs): + return Li(z) + li(2) + + def _eval_rewrite_as_Ei(self, z, **kwargs): + return Ei(log(z)) + + def _eval_rewrite_as_uppergamma(self, z, **kwargs): + from sympy.functions.special.gamma_functions import uppergamma + return (-uppergamma(0, -log(z)) + + S.Half*(log(log(z)) - log(S.One/log(z))) - log(-log(z))) + + def _eval_rewrite_as_Si(self, z, **kwargs): + return (Ci(I*log(z)) - I*Si(I*log(z)) - + S.Half*(log(S.One/log(z)) - log(log(z))) - log(I*log(z))) + + _eval_rewrite_as_Ci = _eval_rewrite_as_Si + + def _eval_rewrite_as_Shi(self, z, **kwargs): + return (Chi(log(z)) - Shi(log(z)) - S.Half*(log(S.One/log(z)) - log(log(z)))) + + _eval_rewrite_as_Chi = _eval_rewrite_as_Shi + + def _eval_rewrite_as_hyper(self, z, **kwargs): + return (log(z)*hyper((1, 1), (2, 2), log(z)) + + S.Half*(log(log(z)) - log(S.One/log(z))) + EulerGamma) + + def _eval_rewrite_as_meijerg(self, z, **kwargs): + return (-log(-log(z)) - S.Half*(log(S.One/log(z)) - log(log(z))) + - meijerg(((), (1,)), ((0, 0), ()), -log(z))) + + def _eval_rewrite_as_tractable(self, z, limitvar=None, **kwargs): + return z * _eis(log(z)) + + def _eval_nseries(self, x, n, logx, cdir=0): + z = self.args[0] + s = [(log(z))**k / (factorial(k) * k) for k in range(1, n)] + return EulerGamma + log(log(z)) + Add(*s) + + def _eval_is_zero(self): + z = self.args[0] + if z.is_zero: + return True + +class Li(DefinedFunction): + r""" + The offset logarithmic integral. + + Explanation + =========== + + For use in SymPy, this function is defined as + + .. math:: \operatorname{Li}(x) = \operatorname{li}(x) - \operatorname{li}(2) + + Examples + ======== + + >>> from sympy import Li + >>> from sympy.abc import z + + The following special value is known: + + >>> Li(2) + 0 + + Differentiation with respect to $z$ is supported: + + >>> from sympy import diff + >>> diff(Li(z), z) + 1/log(z) + + The shifted logarithmic integral can be written in terms of $li(z)$: + + >>> from sympy import li + >>> Li(z).rewrite(li) + li(z) - li(2) + + We can numerically evaluate the logarithmic integral to arbitrary precision + on the whole complex plane (except the singular points): + + >>> Li(2).evalf(30) + 0 + + >>> Li(4).evalf(30) + 1.92242131492155809316615998938 + + See Also + ======== + + li: Logarithmic integral. + Ei: Exponential integral. + expint: Generalised exponential integral. + E1: Special case of the generalised exponential integral. + Si: Sine integral. + Ci: Cosine integral. + Shi: Hyperbolic sine integral. + Chi: Hyperbolic cosine integral. + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Logarithmic_integral + .. [2] https://mathworld.wolfram.com/LogarithmicIntegral.html + .. 
[3] https://dlmf.nist.gov/6 + + """ + + + @classmethod + def eval(cls, z): + if z is S.Infinity: + return S.Infinity + elif z == S(2): + return S.Zero + + def fdiff(self, argindex=1): + arg = self.args[0] + if argindex == 1: + return S.One / log(arg) + else: + raise ArgumentIndexError(self, argindex) + + def _eval_evalf(self, prec): + return self.rewrite(li).evalf(prec) + + def _eval_rewrite_as_li(self, z, **kwargs): + return li(z) - li(2) + + def _eval_rewrite_as_tractable(self, z, limitvar=None, **kwargs): + return self.rewrite(li).rewrite("tractable", deep=True) + + def _eval_nseries(self, x, n, logx, cdir=0): + f = self._eval_rewrite_as_li(*self.args) + return f._eval_nseries(x, n, logx) + +############################################################################### +#################### TRIGONOMETRIC INTEGRALS ################################## +############################################################################### + +class TrigonometricIntegral(DefinedFunction): + """ Base class for trigonometric integrals. """ + + + @classmethod + def eval(cls, z): + if z is S.Zero: + return cls._atzero + elif z is S.Infinity: + return cls._atinf() + elif z is S.NegativeInfinity: + return cls._atneginf() + + if z.is_zero: + return cls._atzero + + nz = z.extract_multiplicatively(polar_lift(I)) + if nz is None and cls._trigfunc(0) == 0: + nz = z.extract_multiplicatively(I) + if nz is not None: + return cls._Ifactor(nz, 1) + nz = z.extract_multiplicatively(polar_lift(-I)) + if nz is not None: + return cls._Ifactor(nz, -1) + + nz = z.extract_multiplicatively(polar_lift(-1)) + if nz is None and cls._trigfunc(0) == 0: + nz = z.extract_multiplicatively(-1) + if nz is not None: + return cls._minusfactor(nz) + + nz, n = z.extract_branch_factor() + if n == 0 and nz == z: + return + return 2*pi*I*n*cls._trigfunc(0) + cls(nz) + + def fdiff(self, argindex=1): + arg = unpolarify(self.args[0]) + if argindex == 1: + return self._trigfunc(arg)/arg + else: + raise ArgumentIndexError(self, argindex) + + def _eval_rewrite_as_Ei(self, z, **kwargs): + return self._eval_rewrite_as_expint(z).rewrite(Ei) + + def _eval_rewrite_as_uppergamma(self, z, **kwargs): + from sympy.functions.special.gamma_functions import uppergamma + return self._eval_rewrite_as_expint(z).rewrite(uppergamma) + + def _eval_nseries(self, x, n, logx, cdir=0): + # NOTE this is fairly inefficient + if self.args[0].subs(x, 0) != 0: + return super()._eval_nseries(x, n, logx) + baseseries = self._trigfunc(x)._eval_nseries(x, n, logx) + if self._trigfunc(0) != 0: + baseseries -= 1 + baseseries = baseseries.replace(Pow, lambda t, n: t**n/n, simultaneous=False) + if self._trigfunc(0) != 0: + baseseries += EulerGamma + log(x) + return baseseries.subs(x, self.args[0])._eval_nseries(x, n, logx) + + +class Si(TrigonometricIntegral): + r""" + Sine integral. + + Explanation + =========== + + This function is defined by + + .. math:: \operatorname{Si}(z) = \int_0^z \frac{\sin{t}}{t} \mathrm{d}t. + + It is an entire function. 
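+
+    Indeed, integrating $\sin{t}/t$ termwise gives the everywhere-convergent
+    series
+
+    .. math:: \operatorname{Si}(z)
+        = \sum_{k=0}^{\infty} \frac{(-1)^k z^{2k+1}}{(2k+1)\,(2k+1)!}.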
+ + Examples + ======== + + >>> from sympy import Si + >>> from sympy.abc import z + + The sine integral is an antiderivative of $sin(z)/z$: + + >>> Si(z).diff(z) + sin(z)/z + + It is unbranched: + + >>> from sympy import exp_polar, I, pi + >>> Si(z*exp_polar(2*I*pi)) + Si(z) + + Sine integral behaves much like ordinary sine under multiplication by ``I``: + + >>> Si(I*z) + I*Shi(z) + >>> Si(-z) + -Si(z) + + It can also be expressed in terms of exponential integrals, but beware + that the latter is branched: + + >>> from sympy import expint + >>> Si(z).rewrite(expint) + -I*(-expint(1, z*exp_polar(-I*pi/2))/2 + + expint(1, z*exp_polar(I*pi/2))/2) + pi/2 + + It can be rewritten in the form of sinc function (by definition): + + >>> from sympy import sinc + >>> Si(z).rewrite(sinc) + Integral(sinc(_t), (_t, 0, z)) + + See Also + ======== + + Ci: Cosine integral. + Shi: Hyperbolic sine integral. + Chi: Hyperbolic cosine integral. + Ei: Exponential integral. + expint: Generalised exponential integral. + sinc: unnormalized sinc function + E1: Special case of the generalised exponential integral. + li: Logarithmic integral. + Li: Offset logarithmic integral. + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Trigonometric_integral + + """ + + _trigfunc = sin + _atzero = S.Zero + + @classmethod + def _atinf(cls): + return pi*S.Half + + @classmethod + def _atneginf(cls): + return -pi*S.Half + + @classmethod + def _minusfactor(cls, z): + return -Si(z) + + @classmethod + def _Ifactor(cls, z, sign): + return I*Shi(z)*sign + + def _eval_rewrite_as_expint(self, z, **kwargs): + # XXX should we polarify z? + return pi/2 + (E1(polar_lift(I)*z) - E1(polar_lift(-I)*z))/2/I + + def _eval_rewrite_as_Integral(self, z, **kwargs): + from sympy.integrals.integrals import Integral + t = Dummy(uniquely_named_symbol('t', [z]).name) + return Integral(sinc(t), (t, 0, z)) + + _eval_rewrite_as_sinc = _eval_rewrite_as_Integral + + def _eval_as_leading_term(self, x, logx, cdir): + arg = self.args[0].as_leading_term(x, logx=logx, cdir=cdir) + arg0 = arg.subs(x, 0) + + if arg0 is S.NaN: + arg0 = arg.limit(x, 0, dir='-' if re(cdir).is_negative else '+') + if arg0.is_zero: + return arg + elif not arg0.is_infinite: + return self.func(arg0) + else: + return self + + def _eval_aseries(self, n, args0, x, logx): + from sympy.series.order import Order + point = args0[0] + + # Expansion at oo + if point is S.Infinity: + z = self.args[0] + p = [S.NegativeOne**k * factorial(2*k) / z**(2*k + 1) + for k in range(n//2 + 1)] + [Order(1/z**n, x)] + q = [S.NegativeOne**k * factorial(2*k + 1) / z**(2*(k + 1)) + for k in range(n//2)] + [Order(1/z**n, x)] + return pi/2 - cos(z)*Add(*p) - sin(z)*Add(*q) + + # All other points are not handled + return super(Si, self)._eval_aseries(n, args0, x, logx) + + def _eval_is_zero(self): + z = self.args[0] + if z.is_zero: + return True + + +class Ci(TrigonometricIntegral): + r""" + Cosine integral. + + Explanation + =========== + + This function is defined for positive $x$ by + + .. math:: \operatorname{Ci}(x) = \gamma + \log{x} + + \int_0^x \frac{\cos{t} - 1}{t} \mathrm{d}t + = -\int_x^\infty \frac{\cos{t}}{t} \mathrm{d}t, + + where $\gamma$ is the Euler-Mascheroni constant. + + We have + + .. math:: \operatorname{Ci}(z) = + -\frac{\operatorname{E}_1\left(e^{i\pi/2} z\right) + + \operatorname{E}_1\left(e^{-i \pi/2} z\right)}{2} + + which holds for all polar $z$ and thus provides an analytic + continuation to the Riemann surface of the logarithm. 
+ + The formula also holds as stated + for $z \in \mathbb{C}$ with $\Re(z) > 0$. + By lifting to the principal branch, we obtain an analytic function on the + cut complex plane. + + Examples + ======== + + >>> from sympy import Ci + >>> from sympy.abc import z + + The cosine integral is a primitive of $\cos(z)/z$: + + >>> Ci(z).diff(z) + cos(z)/z + + It has a logarithmic branch point at the origin: + + >>> from sympy import exp_polar, I, pi + >>> Ci(z*exp_polar(2*I*pi)) + Ci(z) + 2*I*pi + + The cosine integral behaves somewhat like ordinary $\cos$ under + multiplication by $i$: + + >>> from sympy import polar_lift + >>> Ci(polar_lift(I)*z) + Chi(z) + I*pi/2 + >>> Ci(polar_lift(-1)*z) + Ci(z) + I*pi + + It can also be expressed in terms of exponential integrals: + + >>> from sympy import expint + >>> Ci(z).rewrite(expint) + -expint(1, z*exp_polar(-I*pi/2))/2 - expint(1, z*exp_polar(I*pi/2))/2 + + See Also + ======== + + Si: Sine integral. + Shi: Hyperbolic sine integral. + Chi: Hyperbolic cosine integral. + Ei: Exponential integral. + expint: Generalised exponential integral. + E1: Special case of the generalised exponential integral. + li: Logarithmic integral. + Li: Offset logarithmic integral. + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Trigonometric_integral + + """ + + _trigfunc = cos + _atzero = S.ComplexInfinity + + @classmethod + def _atinf(cls): + return S.Zero + + @classmethod + def _atneginf(cls): + return I*pi + + @classmethod + def _minusfactor(cls, z): + return Ci(z) + I*pi + + @classmethod + def _Ifactor(cls, z, sign): + return Chi(z) + I*pi/2*sign + + def _eval_rewrite_as_expint(self, z, **kwargs): + return -(E1(polar_lift(I)*z) + E1(polar_lift(-I)*z))/2 + + def _eval_rewrite_as_Integral(self, z, **kwargs): + from sympy.integrals.integrals import Integral + t = Dummy(uniquely_named_symbol('t', [z]).name) + return S.EulerGamma + log(z) - Integral((1-cos(t))/t, (t, 0, z)) + + def _eval_as_leading_term(self, x, logx, cdir): + arg = self.args[0].as_leading_term(x, logx=logx, cdir=cdir) + arg0 = arg.subs(x, 0) + + if arg0 is S.NaN: + arg0 = arg.limit(x, 0, dir='-' if re(cdir).is_negative else '+') + if arg0.is_zero: + c, e = arg.as_coeff_exponent(x) + logx = log(x) if logx is None else logx + return log(c) + e*logx + EulerGamma + elif arg0.is_finite: + return self.func(arg0) + else: + return self + + def _eval_aseries(self, n, args0, x, logx): + from sympy.series.order import Order + point = args0[0] + + if point in (S.Infinity, S.NegativeInfinity): + z = self.args[0] + p = [S.NegativeOne**k * factorial(2*k) / z**(2*k + 1) + for k in range(n//2 + 1)] + [Order(1/z**n, x)] + q = [S.NegativeOne**k * factorial(2*k + 1) / z**(2*(k + 1)) + for k in range(n//2)] + [Order(1/z**n, x)] + result = sin(z)*(Add(*p)) - cos(z)*(Add(*q)) + + if point is S.NegativeInfinity: + result += I*pi + return result + + return super(Ci, self)._eval_aseries(n, args0, x, logx) + +class Shi(TrigonometricIntegral): + r""" + Sinh integral. + + Explanation + =========== + + This function is defined by + + .. math:: \operatorname{Shi}(z) = \int_0^z \frac{\sinh{t}}{t} \mathrm{d}t. + + It is an entire function. 
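+
+    It is related to the sine integral by
+    $\operatorname{Shi}(z) = -i \operatorname{Si}(i z)$, consistent with the
+    ``Shi(I*z)`` example below.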
+ + Examples + ======== + + >>> from sympy import Shi + >>> from sympy.abc import z + + The Sinh integral is a primitive of $\sinh(z)/z$: + + >>> Shi(z).diff(z) + sinh(z)/z + + It is unbranched: + + >>> from sympy import exp_polar, I, pi + >>> Shi(z*exp_polar(2*I*pi)) + Shi(z) + + The $\sinh$ integral behaves much like ordinary $\sinh$ under + multiplication by $i$: + + >>> Shi(I*z) + I*Si(z) + >>> Shi(-z) + -Shi(z) + + It can also be expressed in terms of exponential integrals, but beware + that the latter is branched: + + >>> from sympy import expint + >>> Shi(z).rewrite(expint) + expint(1, z)/2 - expint(1, z*exp_polar(I*pi))/2 - I*pi/2 + + See Also + ======== + + Si: Sine integral. + Ci: Cosine integral. + Chi: Hyperbolic cosine integral. + Ei: Exponential integral. + expint: Generalised exponential integral. + E1: Special case of the generalised exponential integral. + li: Logarithmic integral. + Li: Offset logarithmic integral. + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Trigonometric_integral + + """ + + _trigfunc = sinh + _atzero = S.Zero + + @classmethod + def _atinf(cls): + return S.Infinity + + @classmethod + def _atneginf(cls): + return S.NegativeInfinity + + @classmethod + def _minusfactor(cls, z): + return -Shi(z) + + @classmethod + def _Ifactor(cls, z, sign): + return I*Si(z)*sign + + def _eval_rewrite_as_expint(self, z, **kwargs): + # XXX should we polarify z? + return (E1(z) - E1(exp_polar(I*pi)*z))/2 - I*pi/2 + + def _eval_is_zero(self): + z = self.args[0] + if z.is_zero: + return True + + def _eval_as_leading_term(self, x, logx, cdir): + arg = self.args[0].as_leading_term(x) + arg0 = arg.subs(x, 0) + + if arg0 is S.NaN: + arg0 = arg.limit(x, 0, dir='-' if re(cdir).is_negative else '+') + if arg0.is_zero: + return arg + elif not arg0.is_infinite: + return self.func(arg0) + else: + return self + + +class Chi(TrigonometricIntegral): + r""" + Cosh integral. + + Explanation + =========== + + This function is defined for positive $x$ by + + .. math:: \operatorname{Chi}(x) = \gamma + \log{x} + + \int_0^x \frac{\cosh{t} - 1}{t} \mathrm{d}t, + + where $\gamma$ is the Euler-Mascheroni constant. + + We have + + .. math:: \operatorname{Chi}(z) = \operatorname{Ci}\left(e^{i \pi/2}z\right) + - i\frac{\pi}{2}, + + which holds for all polar $z$ and thus provides an analytic + continuation to the Riemann surface of the logarithm. + By lifting to the principal branch we obtain an analytic function on the + cut complex plane. + + Examples + ======== + + >>> from sympy import Chi + >>> from sympy.abc import z + + The $\cosh$ integral is a primitive of $\cosh(z)/z$: + + >>> Chi(z).diff(z) + cosh(z)/z + + It has a logarithmic branch point at the origin: + + >>> from sympy import exp_polar, I, pi + >>> Chi(z*exp_polar(2*I*pi)) + Chi(z) + 2*I*pi + + The $\cosh$ integral behaves somewhat like ordinary $\cosh$ under + multiplication by $i$: + + >>> from sympy import polar_lift + >>> Chi(polar_lift(I)*z) + Ci(z) + I*pi/2 + >>> Chi(polar_lift(-1)*z) + Chi(z) + I*pi + + It can also be expressed in terms of exponential integrals: + + >>> from sympy import expint + >>> Chi(z).rewrite(expint) + -expint(1, z)/2 - expint(1, z*exp_polar(I*pi))/2 - I*pi/2 + + See Also + ======== + + Si: Sine integral. + Ci: Cosine integral. + Shi: Hyperbolic sine integral. + Ei: Exponential integral. + expint: Generalised exponential integral. + E1: Special case of the generalised exponential integral. + li: Logarithmic integral. + Li: Offset logarithmic integral. + + References + ========== + + .. 
[1] https://en.wikipedia.org/wiki/Trigonometric_integral + + """ + + _trigfunc = cosh + _atzero = S.ComplexInfinity + + @classmethod + def _atinf(cls): + return S.Infinity + + @classmethod + def _atneginf(cls): + return S.Infinity + + @classmethod + def _minusfactor(cls, z): + return Chi(z) + I*pi + + @classmethod + def _Ifactor(cls, z, sign): + return Ci(z) + I*pi/2*sign + + def _eval_rewrite_as_expint(self, z, **kwargs): + return -I*pi/2 - (E1(z) + E1(exp_polar(I*pi)*z))/2 + + def _eval_as_leading_term(self, x, logx, cdir): + arg = self.args[0].as_leading_term(x, logx=logx, cdir=cdir) + arg0 = arg.subs(x, 0) + + if arg0 is S.NaN: + arg0 = arg.limit(x, 0, dir='-' if re(cdir).is_negative else '+') + if arg0.is_zero: + c, e = arg.as_coeff_exponent(x) + logx = log(x) if logx is None else logx + return log(c) + e*logx + EulerGamma + elif arg0.is_finite: + return self.func(arg0) + else: + return self + + +############################################################################### +#################### FRESNEL INTEGRALS ######################################## +############################################################################### + +class FresnelIntegral(DefinedFunction): + """ Base class for the Fresnel integrals.""" + + unbranched = True + + @classmethod + def eval(cls, z): + # Values at positive infinities signs + # if any were extracted automatically + if z is S.Infinity: + return S.Half + + # Value at zero + if z.is_zero: + return S.Zero + + # Try to pull out factors of -1 and I + prefact = S.One + newarg = z + changed = False + + nz = newarg.extract_multiplicatively(-1) + if nz is not None: + prefact = -prefact + newarg = nz + changed = True + + nz = newarg.extract_multiplicatively(I) + if nz is not None: + prefact = cls._sign*I*prefact + newarg = nz + changed = True + + if changed: + return prefact*cls(newarg) + + def fdiff(self, argindex=1): + if argindex == 1: + return self._trigfunc(S.Half*pi*self.args[0]**2) + else: + raise ArgumentIndexError(self, argindex) + + def _eval_is_extended_real(self): + return self.args[0].is_extended_real + + _eval_is_finite = _eval_is_extended_real + + def _eval_is_zero(self): + return self.args[0].is_zero + + def _eval_conjugate(self): + return self.func(self.args[0].conjugate()) + + as_real_imag = real_to_real_as_real_imag + + +class fresnels(FresnelIntegral): + r""" + Fresnel integral S. + + Explanation + =========== + + This function is defined by + + .. math:: \operatorname{S}(z) = \int_0^z \sin{\frac{\pi}{2} t^2} \mathrm{d}t. + + It is an entire function. 
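+
+    Termwise integration of $\sin{\frac{\pi}{2} t^2}$ gives the
+    everywhere-convergent series implemented by ``taylor_term`` below,
+
+    .. math:: \operatorname{S}(z)
+        = \sum_{n=0}^{\infty} \frac{(-1)^n}{(2n+1)!\,(4n+3)}
+          \left(\frac{\pi}{2}\right)^{2n+1} z^{4n+3}.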
+ + Examples + ======== + + >>> from sympy import I, oo, fresnels + >>> from sympy.abc import z + + Several special values are known: + + >>> fresnels(0) + 0 + >>> fresnels(oo) + 1/2 + >>> fresnels(-oo) + -1/2 + >>> fresnels(I*oo) + -I/2 + >>> fresnels(-I*oo) + I/2 + + In general one can pull out factors of -1 and $i$ from the argument: + + >>> fresnels(-z) + -fresnels(z) + >>> fresnels(I*z) + -I*fresnels(z) + + The Fresnel S integral obeys the mirror symmetry + $\overline{S(z)} = S(\bar{z})$: + + >>> from sympy import conjugate + >>> conjugate(fresnels(z)) + fresnels(conjugate(z)) + + Differentiation with respect to $z$ is supported: + + >>> from sympy import diff + >>> diff(fresnels(z), z) + sin(pi*z**2/2) + + Defining the Fresnel functions via an integral: + + >>> from sympy import integrate, pi, sin, expand_func + >>> integrate(sin(pi*z**2/2), z) + 3*fresnels(z)*gamma(3/4)/(4*gamma(7/4)) + >>> expand_func(integrate(sin(pi*z**2/2), z)) + fresnels(z) + + We can numerically evaluate the Fresnel integral to arbitrary precision + on the whole complex plane: + + >>> fresnels(2).evalf(30) + 0.343415678363698242195300815958 + + >>> fresnels(-2*I).evalf(30) + 0.343415678363698242195300815958*I + + See Also + ======== + + fresnelc: Fresnel cosine integral. + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Fresnel_integral + .. [2] https://dlmf.nist.gov/7 + .. [3] https://mathworld.wolfram.com/FresnelIntegrals.html + .. [4] https://functions.wolfram.com/GammaBetaErf/FresnelS + .. [5] The converging factors for the fresnel integrals + by John W. Wrench Jr. and Vicki Alley + + """ + _trigfunc = sin + _sign = -S.One + + @staticmethod + @cacheit + def taylor_term(n, x, *previous_terms): + if n < 0: + return S.Zero + else: + x = sympify(x) + if len(previous_terms) > 1: + p = previous_terms[-1] + return (-pi**2*x**4*(4*n - 1)/(8*n*(2*n + 1)*(4*n + 3))) * p + else: + return x**3 * (-x**4)**n * (S(2)**(-2*n - 1)*pi**(2*n + 1)) / ((4*n + 3)*factorial(2*n + 1)) + + def _eval_rewrite_as_erf(self, z, **kwargs): + return (S.One + I)/4 * (erf((S.One + I)/2*sqrt(pi)*z) - I*erf((S.One - I)/2*sqrt(pi)*z)) + + def _eval_rewrite_as_hyper(self, z, **kwargs): + return pi*z**3/6 * hyper([Rational(3, 4)], [Rational(3, 2), Rational(7, 4)], -pi**2*z**4/16) + + def _eval_rewrite_as_meijerg(self, z, **kwargs): + return (pi*z**Rational(9, 4) / (sqrt(2)*(z**2)**Rational(3, 4)*(-z)**Rational(3, 4)) + * meijerg([], [1], [Rational(3, 4)], [Rational(1, 4), 0], -pi**2*z**4/16)) + + def _eval_rewrite_as_Integral(self, z, **kwargs): + from sympy.integrals.integrals import Integral + t = Dummy(uniquely_named_symbol('t', [z]).name) + return Integral(sin(pi*t**2/2), (t, 0, z)) + + def _eval_as_leading_term(self, x, logx, cdir): + from sympy.series.order import Order + arg = self.args[0].as_leading_term(x, logx=logx, cdir=cdir) + arg0 = arg.subs(x, 0) + + if arg0 is S.ComplexInfinity: + arg0 = arg.limit(x, 0, dir='-' if re(cdir).is_negative else '+') + if arg0.is_zero: + return pi*arg**3/6 + elif arg0 in [S.Infinity, S.NegativeInfinity]: + s = 1 if arg0 is S.Infinity else -1 + return s*S.Half + Order(x, x) + else: + return self.func(arg0) + + def _eval_aseries(self, n, args0, x, logx): + from sympy.series.order import Order + point = args0[0] + + # Expansion at oo and -oo + if point in [S.Infinity, -S.Infinity]: + z = self.args[0] + + # expansion of S(x) = S1(x*sqrt(pi/2)), see reference[5] page 1-8 + # as only real infinities are dealt with, sin and cos are O(1) + p = [S.NegativeOne**k * factorial(4*k + 1) / + 
(2**(2*k + 2) * z**(4*k + 3) * 2**(2*k)*factorial(2*k)) + for k in range(0, n) if 4*k + 3 < n] + q = [1/(2*z)] + [S.NegativeOne**k * factorial(4*k - 1) / + (2**(2*k + 1) * z**(4*k + 1) * 2**(2*k - 1)*factorial(2*k - 1)) + for k in range(1, n) if 4*k + 1 < n] + + p = [-sqrt(2/pi)*t for t in p] + q = [-sqrt(2/pi)*t for t in q] + s = 1 if point is S.Infinity else -1 + # The expansion at oo is 1/2 + some odd powers of z + # To get the expansion at -oo, replace z by -z and flip the sign + # The result -1/2 + the same odd powers of z as before. + return s*S.Half + (sin(z**2)*Add(*p) + cos(z**2)*Add(*q) + ).subs(x, sqrt(2/pi)*x) + Order(1/z**n, x) + + # All other points are not handled + return super()._eval_aseries(n, args0, x, logx) + + +class fresnelc(FresnelIntegral): + r""" + Fresnel integral C. + + Explanation + =========== + + This function is defined by + + .. math:: \operatorname{C}(z) = \int_0^z \cos{\frac{\pi}{2} t^2} \mathrm{d}t. + + It is an entire function. + + Examples + ======== + + >>> from sympy import I, oo, fresnelc + >>> from sympy.abc import z + + Several special values are known: + + >>> fresnelc(0) + 0 + >>> fresnelc(oo) + 1/2 + >>> fresnelc(-oo) + -1/2 + >>> fresnelc(I*oo) + I/2 + >>> fresnelc(-I*oo) + -I/2 + + In general one can pull out factors of -1 and $i$ from the argument: + + >>> fresnelc(-z) + -fresnelc(z) + >>> fresnelc(I*z) + I*fresnelc(z) + + The Fresnel C integral obeys the mirror symmetry + $\overline{C(z)} = C(\bar{z})$: + + >>> from sympy import conjugate + >>> conjugate(fresnelc(z)) + fresnelc(conjugate(z)) + + Differentiation with respect to $z$ is supported: + + >>> from sympy import diff + >>> diff(fresnelc(z), z) + cos(pi*z**2/2) + + Defining the Fresnel functions via an integral: + + >>> from sympy import integrate, pi, cos, expand_func + >>> integrate(cos(pi*z**2/2), z) + fresnelc(z)*gamma(1/4)/(4*gamma(5/4)) + >>> expand_func(integrate(cos(pi*z**2/2), z)) + fresnelc(z) + + We can numerically evaluate the Fresnel integral to arbitrary precision + on the whole complex plane: + + >>> fresnelc(2).evalf(30) + 0.488253406075340754500223503357 + + >>> fresnelc(-2*I).evalf(30) + -0.488253406075340754500223503357*I + + See Also + ======== + + fresnels: Fresnel sine integral. + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Fresnel_integral + .. [2] https://dlmf.nist.gov/7 + .. [3] https://mathworld.wolfram.com/FresnelIntegrals.html + .. [4] https://functions.wolfram.com/GammaBetaErf/FresnelC + .. [5] The converging factors for the fresnel integrals + by John W. Wrench Jr. 
and Vicki Alley + + """ + _trigfunc = cos + _sign = S.One + + @staticmethod + @cacheit + def taylor_term(n, x, *previous_terms): + if n < 0: + return S.Zero + else: + x = sympify(x) + if len(previous_terms) > 1: + p = previous_terms[-1] + return (-pi**2*x**4*(4*n - 3)/(8*n*(2*n - 1)*(4*n + 1))) * p + else: + return x * (-x**4)**n * (S(2)**(-2*n)*pi**(2*n)) / ((4*n + 1)*factorial(2*n)) + + def _eval_rewrite_as_erf(self, z, **kwargs): + return (S.One - I)/4 * (erf((S.One + I)/2*sqrt(pi)*z) + I*erf((S.One - I)/2*sqrt(pi)*z)) + + def _eval_rewrite_as_hyper(self, z, **kwargs): + return z * hyper([Rational(1, 4)], [S.Half, Rational(5, 4)], -pi**2*z**4/16) + + def _eval_rewrite_as_meijerg(self, z, **kwargs): + return (pi*z**Rational(3, 4) / (sqrt(2)*root(z**2, 4)*root(-z, 4)) + * meijerg([], [1], [Rational(1, 4)], [Rational(3, 4), 0], -pi**2*z**4/16)) + + def _eval_rewrite_as_Integral(self, z, **kwargs): + from sympy.integrals.integrals import Integral + t = Dummy(uniquely_named_symbol('t', [z]).name) + return Integral(cos(pi*t**2/2), (t, 0, z)) + + def _eval_as_leading_term(self, x, logx, cdir): + from sympy.series.order import Order + arg = self.args[0].as_leading_term(x, logx=logx, cdir=cdir) + arg0 = arg.subs(x, 0) + + if arg0 is S.ComplexInfinity: + arg0 = arg.limit(x, 0, dir='-' if re(cdir).is_negative else '+') + if arg0.is_zero: + return arg + elif arg0 in [S.Infinity, S.NegativeInfinity]: + s = 1 if arg0 is S.Infinity else -1 + return s*S.Half + Order(x, x) + else: + return self.func(arg0) + + def _eval_aseries(self, n, args0, x, logx): + from sympy.series.order import Order + point = args0[0] + + # Expansion at oo + if point in [S.Infinity, -S.Infinity]: + z = self.args[0] + + # expansion of C(x) = C1(x*sqrt(pi/2)), see reference[5] page 1-8 + # as only real infinities are dealt with, sin and cos are O(1) + p = [S.NegativeOne**k * factorial(4*k + 1) / + (2**(2*k + 2) * z**(4*k + 3) * 2**(2*k)*factorial(2*k)) + for k in range(n) if 4*k + 3 < n] + q = [1/(2*z)] + [S.NegativeOne**k * factorial(4*k - 1) / + (2**(2*k + 1) * z**(4*k + 1) * 2**(2*k - 1)*factorial(2*k - 1)) + for k in range(1, n) if 4*k + 1 < n] + + p = [-sqrt(2/pi)*t for t in p] + q = [ sqrt(2/pi)*t for t in q] + s = 1 if point is S.Infinity else -1 + # The expansion at oo is 1/2 + some odd powers of z + # To get the expansion at -oo, replace z by -z and flip the sign + # The result -1/2 + the same odd powers of z as before. + return s*S.Half + (cos(z**2)*Add(*p) + sin(z**2)*Add(*q) + ).subs(x, sqrt(2/pi)*x) + Order(1/z**n, x) + + # All other points are not handled + return super()._eval_aseries(n, args0, x, logx) + + +############################################################################### +#################### HELPER FUNCTIONS ######################################### +############################################################################### + + +class _erfs(DefinedFunction): + """ + Helper function to make the $\\mathrm{erf}(z)$ function + tractable for the Gruntz algorithm. 
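+
+    Concretely, ``_erfs(z)`` stands for $(1 - \operatorname{erf}(z))\,e^{z^2}$
+    (see ``_eval_rewrite_as_intractable`` below), so that ``erf(z)`` can be
+    written as ``1 - exp(-z**2)*_erfs(z)`` with its exponentially small tail
+    factored out explicitly, which is the form the limit algorithm can handle.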
+ + """ + @classmethod + def eval(cls, arg): + if arg.is_zero: + return S.One + + def _eval_aseries(self, n, args0, x, logx): + from sympy.series.order import Order + point = args0[0] + + # Expansion at oo + if point is S.Infinity: + z = self.args[0] + l = [1/sqrt(pi) * factorial(2*k)*(-S( + 4))**(-k)/factorial(k) * (1/z)**(2*k + 1) for k in range(n)] + o = Order(1/z**(2*n + 1), x) + # It is very inefficient to first add the order and then do the nseries + return (Add(*l))._eval_nseries(x, n, logx) + o + + # Expansion at I*oo + t = point.extract_multiplicatively(I) + if t is S.Infinity: + z = self.args[0] + # TODO: is the series really correct? + l = [1/sqrt(pi) * factorial(2*k)*(-S( + 4))**(-k)/factorial(k) * (1/z)**(2*k + 1) for k in range(n)] + o = Order(1/z**(2*n + 1), x) + # It is very inefficient to first add the order and then do the nseries + return (Add(*l))._eval_nseries(x, n, logx) + o + + # All other points are not handled + return super()._eval_aseries(n, args0, x, logx) + + def fdiff(self, argindex=1): + if argindex == 1: + z = self.args[0] + return -2/sqrt(pi) + 2*z*_erfs(z) + else: + raise ArgumentIndexError(self, argindex) + + def _eval_rewrite_as_intractable(self, z, **kwargs): + return (S.One - erf(z))*exp(z**2) + + +class _eis(DefinedFunction): + """ + Helper function to make the $\\mathrm{Ei}(z)$ and $\\mathrm{li}(z)$ + functions tractable for the Gruntz algorithm. + + """ + + + def _eval_aseries(self, n, args0, x, logx): + from sympy.series.order import Order + if args0[0] not in (S.Infinity, S.NegativeInfinity): + return super()._eval_aseries(n, args0, x, logx) + + z = self.args[0] + l = [factorial(k) * (1/z)**(k + 1) for k in range(n)] + o = Order(1/z**(n + 1), x) + # It is very inefficient to first add the order and then do the nseries + return (Add(*l))._eval_nseries(x, n, logx) + o + + + def fdiff(self, argindex=1): + if argindex == 1: + z = self.args[0] + return S.One / z - _eis(z) + else: + raise ArgumentIndexError(self, argindex) + + def _eval_rewrite_as_intractable(self, z, **kwargs): + return exp(-z)*Ei(z) + + def _eval_as_leading_term(self, x, logx, cdir): + x0 = self.args[0].limit(x, 0) + if x0.is_zero: + f = self._eval_rewrite_as_intractable(*self.args) + return f._eval_as_leading_term(x, logx=logx, cdir=cdir) + return super()._eval_as_leading_term(x, logx=logx, cdir=cdir) + + def _eval_nseries(self, x, n, logx, cdir=0): + x0 = self.args[0].limit(x, 0) + if x0.is_zero: + f = self._eval_rewrite_as_intractable(*self.args) + return f._eval_nseries(x, n, logx) + return super()._eval_nseries(x, n, logx) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/special/gamma_functions.py b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/special/gamma_functions.py new file mode 100644 index 0000000000000000000000000000000000000000..73a5a1585154c603e9c510b3c61144039e0e5502 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/special/gamma_functions.py @@ -0,0 +1,1344 @@ +from math import prod + +from sympy.core import Add, S, Dummy, expand_func +from sympy.core.expr import Expr +from sympy.core.function import DefinedFunction, ArgumentIndexError, PoleError +from sympy.core.logic import fuzzy_and, fuzzy_not +from sympy.core.numbers import Rational, pi, oo, I +from sympy.core.power import Pow +from sympy.functions.special.zeta_functions import zeta +from sympy.functions.special.error_functions import erf, erfc, Ei +from sympy.functions.elementary.complexes import re, unpolarify +from 
sympy.functions.elementary.exponential import exp, log +from sympy.functions.elementary.integers import ceiling, floor +from sympy.functions.elementary.miscellaneous import sqrt +from sympy.functions.elementary.trigonometric import sin, cos, cot +from sympy.functions.combinatorial.numbers import bernoulli, harmonic +from sympy.functions.combinatorial.factorials import factorial, rf, RisingFactorial +from sympy.utilities.misc import as_int + +from mpmath import mp, workprec +from mpmath.libmp.libmpf import prec_to_dps + +def intlike(n): + try: + as_int(n, strict=False) + return True + except ValueError: + return False + +############################################################################### +############################ COMPLETE GAMMA FUNCTION ########################## +############################################################################### + +class gamma(DefinedFunction): + r""" + The gamma function + + .. math:: + \Gamma(x) := \int^{\infty}_{0} t^{x-1} e^{-t} \mathrm{d}t. + + Explanation + =========== + + The ``gamma`` function implements the function which passes through the + values of the factorial function (i.e., $\Gamma(n) = (n - 1)!$ when n is + an integer). More generally, $\Gamma(z)$ is defined in the whole complex + plane except at the negative integers where there are simple poles. + + Examples + ======== + + >>> from sympy import S, I, pi, gamma + >>> from sympy.abc import x + + Several special values are known: + + >>> gamma(1) + 1 + >>> gamma(4) + 6 + >>> gamma(S(3)/2) + sqrt(pi)/2 + + The ``gamma`` function obeys the mirror symmetry: + + >>> from sympy import conjugate + >>> conjugate(gamma(x)) + gamma(conjugate(x)) + + Differentiation with respect to $x$ is supported: + + >>> from sympy import diff + >>> diff(gamma(x), x) + gamma(x)*polygamma(0, x) + + Series expansion is also supported: + + >>> from sympy import series + >>> series(gamma(x), x, 0, 3) + 1/x - EulerGamma + x*(EulerGamma**2/2 + pi**2/12) + x**2*(-EulerGamma*pi**2/12 - zeta(3)/3 - EulerGamma**3/6) + O(x**3) + + We can numerically evaluate the ``gamma`` function to arbitrary precision + on the whole complex plane: + + >>> gamma(pi).evalf(40) + 2.288037795340032417959588909060233922890 + >>> gamma(1+I).evalf(20) + 0.49801566811835604271 - 0.15494982830181068512*I + + See Also + ======== + + lowergamma: Lower incomplete gamma function. + uppergamma: Upper incomplete gamma function. + polygamma: Polygamma function. + loggamma: Log Gamma function. + digamma: Digamma function. + trigamma: Trigamma function. + sympy.functions.special.beta_functions.beta: Euler Beta function. + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Gamma_function + .. [2] https://dlmf.nist.gov/5 + .. [3] https://mathworld.wolfram.com/GammaFunction.html + .. 
[4] https://functions.wolfram.com/GammaBetaErf/Gamma/ + + """ + + unbranched = True + _singularities = (S.ComplexInfinity,) + + def fdiff(self, argindex=1): + if argindex == 1: + return self.func(self.args[0])*polygamma(0, self.args[0]) + else: + raise ArgumentIndexError(self, argindex) + + @classmethod + def eval(cls, arg): + if arg.is_Number: + if arg is S.NaN: + return S.NaN + elif arg is oo: + return oo + elif intlike(arg): + if arg.is_positive: + return factorial(arg - 1) + else: + return S.ComplexInfinity + elif arg.is_Rational: + if arg.q == 2: + n = abs(arg.p) // arg.q + + if arg.is_positive: + k, coeff = n, S.One + else: + n = k = n + 1 + + if n & 1 == 0: + coeff = S.One + else: + coeff = S.NegativeOne + + coeff *= prod(range(3, 2*k, 2)) + + if arg.is_positive: + return coeff*sqrt(pi) / 2**n + else: + return 2**n*sqrt(pi) / coeff + + def _eval_expand_func(self, **hints): + arg = self.args[0] + if arg.is_Rational: + if abs(arg.p) > arg.q: + x = Dummy('x') + n = arg.p // arg.q + p = arg.p - n*arg.q + return self.func(x + n)._eval_expand_func().subs(x, Rational(p, arg.q)) + + if arg.is_Add: + coeff, tail = arg.as_coeff_add() + if coeff and coeff.q != 1: + intpart = floor(coeff) + tail = (coeff - intpart,) + tail + coeff = intpart + tail = arg._new_rawargs(*tail, reeval=False) + return self.func(tail)*RisingFactorial(tail, coeff) + + return self.func(*self.args) + + def _eval_conjugate(self): + return self.func(self.args[0].conjugate()) + + def _eval_is_real(self): + x = self.args[0] + if x.is_nonpositive and x.is_integer: + return False + if intlike(x) and x <= 0: + return False + if x.is_positive or x.is_noninteger: + return True + + def _eval_is_positive(self): + x = self.args[0] + if x.is_positive: + return True + elif x.is_noninteger: + return floor(x).is_even + + def _eval_rewrite_as_tractable(self, z, limitvar=None, **kwargs): + return exp(loggamma(z)) + + def _eval_rewrite_as_factorial(self, z, **kwargs): + return factorial(z - 1) + + def _eval_nseries(self, x, n, logx, cdir=0): + x0 = self.args[0].limit(x, 0) + if not (x0.is_Integer and x0 <= 0): + return super()._eval_nseries(x, n, logx) + t = self.args[0] - x0 + return (self.func(t + 1)/rf(self.args[0], -x0 + 1))._eval_nseries(x, n, logx) + + def _eval_as_leading_term(self, x, logx, cdir): + arg = self.args[0] + x0 = arg.subs(x, 0) + + if x0.is_integer and x0.is_nonpositive: + n = -x0 + res = S.NegativeOne**n/self.func(n + 1) + return res/(arg + n).as_leading_term(x) + elif not x0.is_infinite: + return self.func(x0) + raise PoleError() + + +############################################################################### +################## LOWER and UPPER INCOMPLETE GAMMA FUNCTIONS ################# +############################################################################### + +class lowergamma(DefinedFunction): + r""" + The lower incomplete gamma function. + + Explanation + =========== + + It can be defined as the meromorphic continuation of + + .. math:: + \gamma(s, x) := \int_0^x t^{s-1} e^{-t} \mathrm{d}t = \Gamma(s) - \Gamma(s, x). + + This can be shown to be the same as + + .. math:: + \gamma(s, x) = \frac{x^s}{s} {}_1F_1\left({s \atop s+1} \middle| -x\right), + + where ${}_1F_1$ is the (confluent) hypergeometric function. 
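+
+    Integration by parts gives the recurrence
+
+    .. math:: \gamma(s + 1, x) = s\,\gamma(s, x) - x^{s} e^{-x},
+
+    which underlies the closed forms returned for small integer and
+    half-integer $s$ by ``eval`` below.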
+ + Examples + ======== + + >>> from sympy import lowergamma, S + >>> from sympy.abc import s, x + >>> lowergamma(s, x) + lowergamma(s, x) + >>> lowergamma(3, x) + -2*(x**2/2 + x + 1)*exp(-x) + 2 + >>> lowergamma(-S(1)/2, x) + -2*sqrt(pi)*erf(sqrt(x)) - 2*exp(-x)/sqrt(x) + + See Also + ======== + + gamma: Gamma function. + uppergamma: Upper incomplete gamma function. + polygamma: Polygamma function. + loggamma: Log Gamma function. + digamma: Digamma function. + trigamma: Trigamma function. + sympy.functions.special.beta_functions.beta: Euler Beta function. + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Incomplete_gamma_function#Lower_incomplete_gamma_function + .. [2] Abramowitz, Milton; Stegun, Irene A., eds. (1965), Chapter 6, + Section 5, Handbook of Mathematical Functions with Formulas, Graphs, + and Mathematical Tables + .. [3] https://dlmf.nist.gov/8 + .. [4] https://functions.wolfram.com/GammaBetaErf/Gamma2/ + .. [5] https://functions.wolfram.com/GammaBetaErf/Gamma3/ + + """ + + + def fdiff(self, argindex=2): + from sympy.functions.special.hyper import meijerg + if argindex == 2: + a, z = self.args + return exp(-unpolarify(z))*z**(a - 1) + elif argindex == 1: + a, z = self.args + return gamma(a)*digamma(a) - log(z)*uppergamma(a, z) \ + - meijerg([], [1, 1], [0, 0, a], [], z) + + else: + raise ArgumentIndexError(self, argindex) + + @classmethod + def eval(cls, a, x): + # For lack of a better place, we use this one to extract branching + # information. The following can be + # found in the literature (c/f references given above), albeit scattered: + # 1) For fixed x != 0, lowergamma(s, x) is an entire function of s + # 2) For fixed positive integers s, lowergamma(s, x) is an entire + # function of x. + # 3) For fixed non-positive integers s, + # lowergamma(s, exp(I*2*pi*n)*x) = + # 2*pi*I*n*(-1)**(-s)/factorial(-s) + lowergamma(s, x) + # (this follows from lowergamma(s, x).diff(x) = x**(s-1)*exp(-x)). + # 4) For fixed non-integral s, + # lowergamma(s, x) = x**s*gamma(s)*lowergamma_unbranched(s, x), + # where lowergamma_unbranched(s, x) is an entire function (in fact + # of both s and x), i.e. + # lowergamma(s, exp(2*I*pi*n)*x) = exp(2*pi*I*n*a)*lowergamma(a, x) + if x is S.Zero: + return S.Zero + nx, n = x.extract_branch_factor() + if a.is_integer and a.is_positive: + nx = unpolarify(x) + if nx != x: + return lowergamma(a, nx) + elif a.is_integer and a.is_nonpositive: + if n != 0: + return 2*pi*I*n*S.NegativeOne**(-a)/factorial(-a) + lowergamma(a, nx) + elif n != 0: + return exp(2*pi*I*n*a)*lowergamma(a, nx) + + # Special values. 
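+        # The closed forms below reproduce the docstring examples, e.g.
+        #   a = 3:      factorial(2) - exp(-x)*factorial(2)*(1 + x + x**2/2)
+        #                 = -2*(x**2/2 + x + 1)*exp(-x) + 2
+        #   a = S.Half: sqrt(pi)*erf(sqrt(x))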
+ if a.is_Number: + if a is S.One: + return S.One - exp(-x) + elif a is S.Half: + return sqrt(pi)*erf(sqrt(x)) + elif a.is_Integer or (2*a).is_Integer: + b = a - 1 + if b.is_positive: + if a.is_integer: + return factorial(b) - exp(-x) * factorial(b) * Add(*[x ** k / factorial(k) for k in range(a)]) + else: + return gamma(a)*(lowergamma(S.Half, x)/sqrt(pi) - exp(-x)*Add(*[x**(k - S.Half)/gamma(S.Half + k) for k in range(1, a + S.Half)])) + + if not a.is_Integer: + return S.NegativeOne**(S.Half - a)*pi*erf(sqrt(x))/gamma(1 - a) + exp(-x)*Add(*[x**(k + a - 1)*gamma(a)/gamma(a + k) for k in range(1, Rational(3, 2) - a)]) + + if x.is_zero: + return S.Zero + + def _eval_evalf(self, prec): + if all(x.is_number for x in self.args): + a = self.args[0]._to_mpmath(prec) + z = self.args[1]._to_mpmath(prec) + with workprec(prec): + res = mp.gammainc(a, 0, z) + return Expr._from_mpmath(res, prec) + else: + return self + + def _eval_conjugate(self): + x = self.args[1] + if x not in (S.Zero, S.NegativeInfinity): + return self.func(self.args[0].conjugate(), x.conjugate()) + + def _eval_is_meromorphic(self, x, a): + # By https://en.wikipedia.org/wiki/Incomplete_gamma_function#Holomorphic_extension, + # lowergamma(s, z) = z**s*gamma(s)*gammastar(s, z), + # where gammastar(s, z) is holomorphic for all s and z. + # Hence the singularities of lowergamma are z = 0 (branch + # point) and nonpositive integer values of s (poles of gamma(s)). + s, z = self.args + args_merom = fuzzy_and([z._eval_is_meromorphic(x, a), + s._eval_is_meromorphic(x, a)]) + if not args_merom: + return args_merom + z0 = z.subs(x, a) + if s.is_integer: + return fuzzy_and([s.is_positive, z0.is_finite]) + s0 = s.subs(x, a) + return fuzzy_and([s0.is_finite, z0.is_finite, fuzzy_not(z0.is_zero)]) + + def _eval_aseries(self, n, args0, x, logx): + from sympy.series.order import O + s, z = self.args + if args0[0] is oo and not z.has(x): + coeff = z**s*exp(-z) + sum_expr = sum(z**k/rf(s, k + 1) for k in range(n - 1)) + o = O(z**s*s**(-n)) + return coeff*sum_expr + o + return super()._eval_aseries(n, args0, x, logx) + + def _eval_rewrite_as_uppergamma(self, s, x, **kwargs): + return gamma(s) - uppergamma(s, x) + + def _eval_rewrite_as_expint(self, s, x, **kwargs): + from sympy.functions.special.error_functions import expint + if s.is_integer and s.is_nonpositive: + return self + return self.rewrite(uppergamma).rewrite(expint) + + def _eval_is_zero(self): + x = self.args[1] + if x.is_zero: + return True + + +class uppergamma(DefinedFunction): + r""" + The upper incomplete gamma function. + + Explanation + =========== + + It can be defined as the meromorphic continuation of + + .. math:: + \Gamma(s, x) := \int_x^\infty t^{s-1} e^{-t} \mathrm{d}t = \Gamma(s) - \gamma(s, x). + + where $\gamma(s, x)$ is the lower incomplete gamma function, + :class:`lowergamma`. This can be shown to be the same as + + .. math:: + \Gamma(s, x) = \Gamma(s) - \frac{x^s}{s} {}_1F_1\left({s \atop s+1} \middle| -x\right), + + where ${}_1F_1$ is the (confluent) hypergeometric function. + + The upper incomplete gamma function is also essentially equivalent to the + generalized exponential integral: + + .. math:: + \operatorname{E}_{n}(x) = \int_{1}^{\infty}{\frac{e^{-xt}}{t^n} \, dt} = x^{n-1}\Gamma(1-n,x). 
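+
+    Solving this identity for $\Gamma(s, x)$ (take $n = 1 - s$) gives
+    $\Gamma(s, x) = x^{s} \operatorname{E}_{1-s}(x)$, which is the form
+    returned by ``_eval_rewrite_as_expint`` below.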
+ + Examples + ======== + + >>> from sympy import uppergamma, S + >>> from sympy.abc import s, x + >>> uppergamma(s, x) + uppergamma(s, x) + >>> uppergamma(3, x) + 2*(x**2/2 + x + 1)*exp(-x) + >>> uppergamma(-S(1)/2, x) + -2*sqrt(pi)*erfc(sqrt(x)) + 2*exp(-x)/sqrt(x) + >>> uppergamma(-2, x) + expint(3, x)/x**2 + + See Also + ======== + + gamma: Gamma function. + lowergamma: Lower incomplete gamma function. + polygamma: Polygamma function. + loggamma: Log Gamma function. + digamma: Digamma function. + trigamma: Trigamma function. + sympy.functions.special.beta_functions.beta: Euler Beta function. + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Incomplete_gamma_function#Upper_incomplete_gamma_function + .. [2] Abramowitz, Milton; Stegun, Irene A., eds. (1965), Chapter 6, + Section 5, Handbook of Mathematical Functions with Formulas, Graphs, + and Mathematical Tables + .. [3] https://dlmf.nist.gov/8 + .. [4] https://functions.wolfram.com/GammaBetaErf/Gamma2/ + .. [5] https://functions.wolfram.com/GammaBetaErf/Gamma3/ + .. [6] https://en.wikipedia.org/wiki/Exponential_integral#Relation_with_other_functions + + """ + + + def fdiff(self, argindex=2): + from sympy.functions.special.hyper import meijerg + if argindex == 2: + a, z = self.args + return -exp(-unpolarify(z))*z**(a - 1) + elif argindex == 1: + a, z = self.args + return uppergamma(a, z)*log(z) + meijerg([], [1, 1], [0, 0, a], [], z) + else: + raise ArgumentIndexError(self, argindex) + + def _eval_evalf(self, prec): + if all(x.is_number for x in self.args): + a = self.args[0]._to_mpmath(prec) + z = self.args[1]._to_mpmath(prec) + with workprec(prec): + res = mp.gammainc(a, z, mp.inf) + return Expr._from_mpmath(res, prec) + return self + + @classmethod + def eval(cls, a, z): + from sympy.functions.special.error_functions import expint + if z.is_Number: + if z is S.NaN: + return S.NaN + elif z is oo: + return S.Zero + elif z.is_zero: + if re(a).is_positive: + return gamma(a) + + # We extract branching information here. C/f lowergamma. + nx, n = z.extract_branch_factor() + if a.is_integer and a.is_positive: + nx = unpolarify(z) + if z != nx: + return uppergamma(a, nx) + elif a.is_integer and a.is_nonpositive: + if n != 0: + return -2*pi*I*n*S.NegativeOne**(-a)/factorial(-a) + uppergamma(a, nx) + elif n != 0: + return gamma(a)*(1 - exp(2*pi*I*n*a)) + exp(2*pi*I*n*a)*uppergamma(a, nx) + + # Special values. 
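+        # The closed forms below match the docstring examples, e.g.
+        #   a = 3:      exp(-z)*factorial(2)*(1 + z + z**2/2)
+        #                 = 2*(z**2/2 + z + 1)*exp(-z)
+        #   a = -2:     expint(3, z)/z**2  (via the b.is_Integer branch)
+        #   a = S.Half: sqrt(pi)*erfc(sqrt(z))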
+ if a.is_Number: + if a is S.Zero and z.is_positive: + return -Ei(-z) + elif a is S.One: + return exp(-z) + elif a is S.Half: + return sqrt(pi)*erfc(sqrt(z)) + elif a.is_Integer or (2*a).is_Integer: + b = a - 1 + if b.is_positive: + if a.is_integer: + return exp(-z) * factorial(b) * Add(*[z**k / factorial(k) + for k in range(a)]) + else: + return (gamma(a) * erfc(sqrt(z)) + + S.NegativeOne**(a - S(3)/2) * exp(-z) * sqrt(z) + * Add(*[gamma(-S.Half - k) * (-z)**k / gamma(1-a) + for k in range(a - S.Half)])) + elif b.is_Integer: + return expint(-b, z)*unpolarify(z)**(b + 1) + + if not a.is_Integer: + return (S.NegativeOne**(S.Half - a) * pi*erfc(sqrt(z))/gamma(1-a) + - z**a * exp(-z) * Add(*[z**k * gamma(a) / gamma(a+k+1) + for k in range(S.Half - a)])) + + if a.is_zero and z.is_positive: + return -Ei(-z) + + if z.is_zero and re(a).is_positive: + return gamma(a) + + def _eval_conjugate(self): + z = self.args[1] + if z not in (S.Zero, S.NegativeInfinity): + return self.func(self.args[0].conjugate(), z.conjugate()) + + def _eval_is_meromorphic(self, x, a): + return lowergamma._eval_is_meromorphic(self, x, a) + + def _eval_rewrite_as_lowergamma(self, s, x, **kwargs): + return gamma(s) - lowergamma(s, x) + + def _eval_rewrite_as_tractable(self, s, x, **kwargs): + return exp(loggamma(s)) - lowergamma(s, x) + + def _eval_rewrite_as_expint(self, s, x, **kwargs): + from sympy.functions.special.error_functions import expint + return expint(1 - s, x)*x**s + + +############################################################################### +###################### POLYGAMMA and LOGGAMMA FUNCTIONS ####################### +############################################################################### + +class polygamma(DefinedFunction): + r""" + The function ``polygamma(n, z)`` returns ``log(gamma(z)).diff(n + 1)``. + + Explanation + =========== + + It is a meromorphic function on $\mathbb{C}$ and defined as the $(n+1)$-th + derivative of the logarithm of the gamma function: + + .. math:: + \psi^{(n)} (z) := \frac{\mathrm{d}^{n+1}}{\mathrm{d} z^{n+1}} \log\Gamma(z). + + For `n` not a nonnegative integer the generalization by Espinosa and Moll [5]_ + is used: + + .. 
math:: \psi(s,z) = \frac{\zeta'(s+1, z) + (\gamma + \psi(-s)) \zeta(s+1, z)} + {\Gamma(-s)} + + Examples + ======== + + Several special values are known: + + >>> from sympy import S, polygamma + >>> polygamma(0, 1) + -EulerGamma + >>> polygamma(0, 1/S(2)) + -2*log(2) - EulerGamma + >>> polygamma(0, 1/S(3)) + -log(3) - sqrt(3)*pi/6 - EulerGamma - log(sqrt(3)) + >>> polygamma(0, 1/S(4)) + -pi/2 - log(4) - log(2) - EulerGamma + >>> polygamma(0, 2) + 1 - EulerGamma + >>> polygamma(0, 23) + 19093197/5173168 - EulerGamma + + >>> from sympy import oo, I + >>> polygamma(0, oo) + oo + >>> polygamma(0, -oo) + oo + >>> polygamma(0, I*oo) + oo + >>> polygamma(0, -I*oo) + oo + + Differentiation with respect to $x$ is supported: + + >>> from sympy import Symbol, diff + >>> x = Symbol("x") + >>> diff(polygamma(0, x), x) + polygamma(1, x) + >>> diff(polygamma(0, x), x, 2) + polygamma(2, x) + >>> diff(polygamma(0, x), x, 3) + polygamma(3, x) + >>> diff(polygamma(1, x), x) + polygamma(2, x) + >>> diff(polygamma(1, x), x, 2) + polygamma(3, x) + >>> diff(polygamma(2, x), x) + polygamma(3, x) + >>> diff(polygamma(2, x), x, 2) + polygamma(4, x) + + >>> n = Symbol("n") + >>> diff(polygamma(n, x), x) + polygamma(n + 1, x) + >>> diff(polygamma(n, x), x, 2) + polygamma(n + 2, x) + + We can rewrite ``polygamma`` functions in terms of harmonic numbers: + + >>> from sympy import harmonic + >>> polygamma(0, x).rewrite(harmonic) + harmonic(x - 1) - EulerGamma + >>> polygamma(2, x).rewrite(harmonic) + 2*harmonic(x - 1, 3) - 2*zeta(3) + >>> ni = Symbol("n", integer=True) + >>> polygamma(ni, x).rewrite(harmonic) + (-1)**(n + 1)*(-harmonic(x - 1, n + 1) + zeta(n + 1))*factorial(n) + + See Also + ======== + + gamma: Gamma function. + lowergamma: Lower incomplete gamma function. + uppergamma: Upper incomplete gamma function. + loggamma: Log Gamma function. + digamma: Digamma function. + trigamma: Trigamma function. + sympy.functions.special.beta_functions.beta: Euler Beta function. + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Polygamma_function + .. [2] https://mathworld.wolfram.com/PolygammaFunction.html + .. [3] https://functions.wolfram.com/GammaBetaErf/PolyGamma/ + .. [4] https://functions.wolfram.com/GammaBetaErf/PolyGamma2/ + .. [5] O. Espinosa and V. Moll, "A generalized polygamma function", + *Integral Transforms and Special Functions* (2004), 101-115. 
+ + """ + + @classmethod + def eval(cls, n, z): + if n is S.NaN or z is S.NaN: + return S.NaN + elif z is oo: + return oo if n.is_zero else S.Zero + elif z.is_Integer and z.is_nonpositive: + return S.ComplexInfinity + elif n is S.NegativeOne: + return loggamma(z) - log(2*pi) / 2 + elif n.is_zero: + if z is -oo or z.extract_multiplicatively(I) in (oo, -oo): + return oo + elif z.is_Integer: + return harmonic(z-1) - S.EulerGamma + elif z.is_Rational: + # TODO n == 1 also can do some rational z + p, q = z.as_numer_denom() + # only expand for small denominators to avoid creating long expressions + if q <= 6: + return expand_func(polygamma(S.Zero, z, evaluate=False)) + elif n.is_integer and n.is_nonnegative: + nz = unpolarify(z) + if z != nz: + return polygamma(n, nz) + if z.is_Integer: + return S.NegativeOne**(n+1) * factorial(n) * zeta(n+1, z) + elif z is S.Half: + return S.NegativeOne**(n+1) * factorial(n) * (2**(n+1)-1) * zeta(n+1) + + def _eval_is_real(self): + if self.args[0].is_positive and self.args[1].is_positive: + return True + + def _eval_is_complex(self): + z = self.args[1] + is_negative_integer = fuzzy_and([z.is_negative, z.is_integer]) + return fuzzy_and([z.is_complex, fuzzy_not(is_negative_integer)]) + + def _eval_is_positive(self): + n, z = self.args + if n.is_positive: + if n.is_odd and z.is_real: + return True + if n.is_even and z.is_positive: + return False + + def _eval_is_negative(self): + n, z = self.args + if n.is_positive: + if n.is_even and z.is_positive: + return True + if n.is_odd and z.is_real: + return False + + def _eval_expand_func(self, **hints): + n, z = self.args + + if n.is_Integer and n.is_nonnegative: + if z.is_Add: + coeff = z.args[0] + if coeff.is_Integer: + e = -(n + 1) + if coeff > 0: + tail = Add(*[Pow( + z - i, e) for i in range(1, int(coeff) + 1)]) + else: + tail = -Add(*[Pow( + z + i, e) for i in range(int(-coeff))]) + return polygamma(n, z - coeff) + S.NegativeOne**n*factorial(n)*tail + + elif z.is_Mul: + coeff, z = z.as_two_terms() + if coeff.is_Integer and coeff.is_positive: + tail = [polygamma(n, z + Rational( + i, coeff)) for i in range(int(coeff))] + if n == 0: + return Add(*tail)/coeff + log(coeff) + else: + return Add(*tail)/coeff**(n + 1) + z *= coeff + + if n == 0 and z.is_Rational: + p, q = z.as_numer_denom() + + # Reference: + # Values of the polygamma functions at rational arguments, J. 
Choi, 2007 + part_1 = -S.EulerGamma - pi * cot(p * pi / q) / 2 - log(q) + Add( + *[cos(2 * k * pi * p / q) * log(2 * sin(k * pi / q)) for k in range(1, q)]) + + if z > 0: + n = floor(z) + z0 = z - n + return part_1 + Add(*[1 / (z0 + k) for k in range(n)]) + elif z < 0: + n = floor(1 - z) + z0 = z + n + return part_1 - Add(*[1 / (z0 - 1 - k) for k in range(n)]) + + if n == -1: + return loggamma(z) - log(2*pi) / 2 + if n.is_integer is False or n.is_nonnegative is False: + s = Dummy("s") + dzt = zeta(s, z).diff(s).subs(s, n+1) + return (dzt + (S.EulerGamma + digamma(-n)) * zeta(n+1, z)) / gamma(-n) + + return polygamma(n, z) + + def _eval_rewrite_as_zeta(self, n, z, **kwargs): + if n.is_integer and n.is_positive: + return S.NegativeOne**(n + 1)*factorial(n)*zeta(n + 1, z) + + def _eval_rewrite_as_harmonic(self, n, z, **kwargs): + if n.is_integer: + if n.is_zero: + return harmonic(z - 1) - S.EulerGamma + else: + return S.NegativeOne**(n+1) * factorial(n) * (zeta(n+1) - harmonic(z-1, n+1)) + + def _eval_as_leading_term(self, x, logx, cdir): + from sympy.series.order import Order + n, z = [a.as_leading_term(x) for a in self.args] + o = Order(z, x) + if n == 0 and o.contains(1/x): + logx = log(x) if logx is None else logx + return o.getn() * logx + else: + return self.func(n, z) + + def fdiff(self, argindex=2): + if argindex == 2: + n, z = self.args[:2] + return polygamma(n + 1, z) + else: + raise ArgumentIndexError(self, argindex) + + def _eval_aseries(self, n, args0, x, logx): + from sympy.series.order import Order + if args0[1] != oo or not \ + (self.args[0].is_Integer and self.args[0].is_nonnegative): + return super()._eval_aseries(n, args0, x, logx) + z = self.args[1] + N = self.args[0] + + if N == 0: + # digamma function series + # Abramowitz & Stegun, p. 259, 6.3.18 + r = log(z) - 1/(2*z) + o = None + if n < 2: + o = Order(1/z, x) + else: + m = ceiling((n + 1)//2) + l = [bernoulli(2*k) / (2*k*z**(2*k)) for k in range(1, m)] + r -= Add(*l) + o = Order(1/z**n, x) + return r._eval_nseries(x, n, logx) + o + else: + # proper polygamma function + # Abramowitz & Stegun, p. 260, 6.4.10 + # We return terms to order higher than O(x**n) on purpose + # -- otherwise we would not be able to return any terms for + # quite a long time! + fac = gamma(N) + e0 = fac + N*fac/(2*z) + m = ceiling((n + 1)//2) + for k in range(1, m): + fac = fac*(2*k + N - 1)*(2*k + N - 2) / ((2*k)*(2*k - 1)) + e0 += bernoulli(2*k)*fac/z**(2*k) + o = Order(1/z**(2*m), x) + if n == 0: + o = Order(1/z, x) + elif n == 1: + o = Order(1/z**2, x) + r = e0._eval_nseries(z, n, logx) + o + return (-1 * (-1/z)**N * r)._eval_nseries(x, n, logx) + + def _eval_evalf(self, prec): + if not all(i.is_number for i in self.args): + return + s = self.args[0]._to_mpmath(prec+12) + z = self.args[1]._to_mpmath(prec+12) + if mp.isint(z) and z <= 0: + return S.ComplexInfinity + with workprec(prec+12): + if mp.isint(s) and s >= 0: + res = mp.polygamma(s, z) + else: + zt = mp.zeta(s+1, z) + dzt = mp.zeta(s+1, z, 1) + res = (dzt + (mp.euler + mp.digamma(-s)) * zt) * mp.rgamma(-s) + return Expr._from_mpmath(res, prec) + + +class loggamma(DefinedFunction): + r""" + The ``loggamma`` function implements the logarithm of the + gamma function (i.e., $\log\Gamma(x)$). + + Examples + ======== + + Several special values are known. 
For numerical integral + arguments we have: + + >>> from sympy import loggamma + >>> loggamma(-2) + oo + >>> loggamma(0) + oo + >>> loggamma(1) + 0 + >>> loggamma(2) + 0 + >>> loggamma(3) + log(2) + + And for symbolic values: + + >>> from sympy import Symbol + >>> n = Symbol("n", integer=True, positive=True) + >>> loggamma(n) + log(gamma(n)) + >>> loggamma(-n) + oo + + For half-integral values: + + >>> from sympy import S + >>> loggamma(S(5)/2) + log(3*sqrt(pi)/4) + >>> loggamma(n/2) + log(2**(1 - n)*sqrt(pi)*gamma(n)/gamma(n/2 + 1/2)) + + And general rational arguments: + + >>> from sympy import expand_func + >>> L = loggamma(S(16)/3) + >>> expand_func(L).doit() + -5*log(3) + loggamma(1/3) + log(4) + log(7) + log(10) + log(13) + >>> L = loggamma(S(19)/4) + >>> expand_func(L).doit() + -4*log(4) + loggamma(3/4) + log(3) + log(7) + log(11) + log(15) + >>> L = loggamma(S(23)/7) + >>> expand_func(L).doit() + -3*log(7) + log(2) + loggamma(2/7) + log(9) + log(16) + + The ``loggamma`` function has the following limits towards infinity: + + >>> from sympy import oo + >>> loggamma(oo) + oo + >>> loggamma(-oo) + zoo + + The ``loggamma`` function obeys the mirror symmetry + if $x \in \mathbb{C} \setminus \{-\infty, 0\}$: + + >>> from sympy.abc import x + >>> from sympy import conjugate + >>> conjugate(loggamma(x)) + loggamma(conjugate(x)) + + Differentiation with respect to $x$ is supported: + + >>> from sympy import diff + >>> diff(loggamma(x), x) + polygamma(0, x) + + Series expansion is also supported: + + >>> from sympy import series + >>> series(loggamma(x), x, 0, 4).cancel() + -log(x) - EulerGamma*x + pi**2*x**2/12 - x**3*zeta(3)/3 + O(x**4) + + We can numerically evaluate the ``loggamma`` function + to arbitrary precision on the whole complex plane: + + >>> from sympy import I + >>> loggamma(5).evalf(30) + 3.17805383034794561964694160130 + >>> loggamma(I).evalf(20) + -0.65092319930185633889 - 1.8724366472624298171*I + + See Also + ======== + + gamma: Gamma function. + lowergamma: Lower incomplete gamma function. + uppergamma: Upper incomplete gamma function. + polygamma: Polygamma function. + digamma: Digamma function. + trigamma: Trigamma function. + sympy.functions.special.beta_functions.beta: Euler Beta function. + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Gamma_function + .. [2] https://dlmf.nist.gov/5 + .. [3] https://mathworld.wolfram.com/LogGammaFunction.html + .. 
[4] https://functions.wolfram.com/GammaBetaErf/LogGamma/ + + """ + @classmethod + def eval(cls, z): + if z.is_integer: + if z.is_nonpositive: + return oo + elif z.is_positive: + return log(gamma(z)) + elif z.is_rational: + p, q = z.as_numer_denom() + # Half-integral values: + if p.is_positive and q == 2: + return log(sqrt(pi) * 2**(1 - p) * gamma(p) / gamma((p + 1)*S.Half)) + + if z is oo: + return oo + elif abs(z) is oo: + return S.ComplexInfinity + if z is S.NaN: + return S.NaN + + def _eval_expand_func(self, **hints): + from sympy.concrete.summations import Sum + z = self.args[0] + + if z.is_Rational: + p, q = z.as_numer_denom() + # General rational arguments (u + p/q) + # Split z as n + p/q with p < q + n = p // q + p = p - n*q + if p.is_positive and q.is_positive and p < q: + k = Dummy("k") + if n.is_positive: + return loggamma(p / q) - n*log(q) + Sum(log((k - 1)*q + p), (k, 1, n)) + elif n.is_negative: + return loggamma(p / q) - n*log(q) + pi*I*n - Sum(log(k*q - p), (k, 1, -n)) + elif n.is_zero: + return loggamma(p / q) + + return self + + def _eval_nseries(self, x, n, logx=None, cdir=0): + x0 = self.args[0].limit(x, 0) + if x0.is_zero: + f = self._eval_rewrite_as_intractable(*self.args) + return f._eval_nseries(x, n, logx) + return super()._eval_nseries(x, n, logx) + + def _eval_aseries(self, n, args0, x, logx): + from sympy.series.order import Order + if args0[0] != oo: + return super()._eval_aseries(n, args0, x, logx) + z = self.args[0] + r = log(z)*(z - S.Half) - z + log(2*pi)/2 + l = [bernoulli(2*k) / (2*k*(2*k - 1)*z**(2*k - 1)) for k in range(1, n)] + o = None + if n == 0: + o = Order(1, x) + else: + o = Order(1/z**n, x) + # It is very inefficient to first add the order and then do the nseries + return (r + Add(*l))._eval_nseries(x, n, logx) + o + + def _eval_rewrite_as_intractable(self, z, **kwargs): + return log(gamma(z)) + + def _eval_is_real(self): + z = self.args[0] + if z.is_positive: + return True + elif z.is_nonpositive: + return False + + def _eval_conjugate(self): + z = self.args[0] + if z not in (S.Zero, S.NegativeInfinity): + return self.func(z.conjugate()) + + def fdiff(self, argindex=1): + if argindex == 1: + return polygamma(0, self.args[0]) + else: + raise ArgumentIndexError(self, argindex) + + +class digamma(DefinedFunction): + r""" + The ``digamma`` function is the first derivative of the ``loggamma`` + function + + .. math:: + \psi(x) := \frac{\mathrm{d}}{\mathrm{d} z} \log\Gamma(z) + = \frac{\Gamma'(z)}{\Gamma(z) }. + + In this case, ``digamma(z) = polygamma(0, z)``. + + Examples + ======== + + >>> from sympy import digamma + >>> digamma(0) + zoo + >>> from sympy import Symbol + >>> z = Symbol('z') + >>> digamma(z) + polygamma(0, z) + + To retain ``digamma`` as it is: + + >>> digamma(0, evaluate=False) + digamma(0) + >>> digamma(z, evaluate=False) + digamma(z) + + See Also + ======== + + gamma: Gamma function. + lowergamma: Lower incomplete gamma function. + uppergamma: Upper incomplete gamma function. + polygamma: Polygamma function. + loggamma: Log Gamma function. + trigamma: Trigamma function. + sympy.functions.special.beta_functions.beta: Euler Beta function. + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Digamma_function + .. [2] https://mathworld.wolfram.com/DigammaFunction.html + .. 
[3] https://functions.wolfram.com/GammaBetaErf/PolyGamma2/ + + """ + def _eval_evalf(self, prec): + z = self.args[0] + nprec = prec_to_dps(prec) + return polygamma(0, z).evalf(n=nprec) + + def fdiff(self, argindex=1): + z = self.args[0] + return polygamma(0, z).fdiff() + + def _eval_is_real(self): + z = self.args[0] + return polygamma(0, z).is_real + + def _eval_is_positive(self): + z = self.args[0] + return polygamma(0, z).is_positive + + def _eval_is_negative(self): + z = self.args[0] + return polygamma(0, z).is_negative + + def _eval_aseries(self, n, args0, x, logx): + as_polygamma = self.rewrite(polygamma) + args0 = [S.Zero,] + args0 + return as_polygamma._eval_aseries(n, args0, x, logx) + + @classmethod + def eval(cls, z): + return polygamma(0, z) + + def _eval_expand_func(self, **hints): + z = self.args[0] + return polygamma(0, z).expand(func=True) + + def _eval_rewrite_as_harmonic(self, z, **kwargs): + return harmonic(z - 1) - S.EulerGamma + + def _eval_rewrite_as_polygamma(self, z, **kwargs): + return polygamma(0, z) + + def _eval_as_leading_term(self, x, logx, cdir): + z = self.args[0] + return polygamma(0, z).as_leading_term(x) + + + +class trigamma(DefinedFunction): + r""" + The ``trigamma`` function is the second derivative of the ``loggamma`` + function + + .. math:: + \psi^{(1)}(z) := \frac{\mathrm{d}^{2}}{\mathrm{d} z^{2}} \log\Gamma(z). + + In this case, ``trigamma(z) = polygamma(1, z)``. + + Examples + ======== + + >>> from sympy import trigamma + >>> trigamma(0) + zoo + >>> from sympy import Symbol + >>> z = Symbol('z') + >>> trigamma(z) + polygamma(1, z) + + To retain ``trigamma`` as it is: + + >>> trigamma(0, evaluate=False) + trigamma(0) + >>> trigamma(z, evaluate=False) + trigamma(z) + + + See Also + ======== + + gamma: Gamma function. + lowergamma: Lower incomplete gamma function. + uppergamma: Upper incomplete gamma function. + polygamma: Polygamma function. + loggamma: Log Gamma function. + digamma: Digamma function. + sympy.functions.special.beta_functions.beta: Euler Beta function. + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Trigamma_function + .. [2] https://mathworld.wolfram.com/TrigammaFunction.html + .. 
[3] https://functions.wolfram.com/GammaBetaErf/PolyGamma2/ + + """ + def _eval_evalf(self, prec): + z = self.args[0] + nprec = prec_to_dps(prec) + return polygamma(1, z).evalf(n=nprec) + + def fdiff(self, argindex=1): + z = self.args[0] + return polygamma(1, z).fdiff() + + def _eval_is_real(self): + z = self.args[0] + return polygamma(1, z).is_real + + def _eval_is_positive(self): + z = self.args[0] + return polygamma(1, z).is_positive + + def _eval_is_negative(self): + z = self.args[0] + return polygamma(1, z).is_negative + + def _eval_aseries(self, n, args0, x, logx): + as_polygamma = self.rewrite(polygamma) + args0 = [S.One,] + args0 + return as_polygamma._eval_aseries(n, args0, x, logx) + + @classmethod + def eval(cls, z): + return polygamma(1, z) + + def _eval_expand_func(self, **hints): + z = self.args[0] + return polygamma(1, z).expand(func=True) + + def _eval_rewrite_as_zeta(self, z, **kwargs): + return zeta(2, z) + + def _eval_rewrite_as_polygamma(self, z, **kwargs): + return polygamma(1, z) + + def _eval_rewrite_as_harmonic(self, z, **kwargs): + return -harmonic(z - 1, 2) + pi**2 / 6 + + def _eval_as_leading_term(self, x, logx, cdir): + z = self.args[0] + return polygamma(1, z).as_leading_term(x) + + +############################################################################### +##################### COMPLETE MULTIVARIATE GAMMA FUNCTION #################### +############################################################################### + + +class multigamma(DefinedFunction): + r""" + The multivariate gamma function is a generalization of the gamma function + + .. math:: + \Gamma_p(z) = \pi^{p(p-1)/4}\prod_{k=1}^p \Gamma[z + (1 - k)/2]. + + In a special case, ``multigamma(x, 1) = gamma(x)``. + + Examples + ======== + + >>> from sympy import S, multigamma + >>> from sympy import Symbol + >>> x = Symbol('x') + >>> p = Symbol('p', positive=True, integer=True) + + >>> multigamma(x, p) + pi**(p*(p - 1)/4)*Product(gamma(-_k/2 + x + 1/2), (_k, 1, p)) + + Several special values are known: + + >>> multigamma(1, 1) + 1 + >>> multigamma(4, 1) + 6 + >>> multigamma(S(3)/2, 1) + sqrt(pi)/2 + + Writing ``multigamma`` in terms of the ``gamma`` function: + + >>> multigamma(x, 1) + gamma(x) + + >>> multigamma(x, 2) + sqrt(pi)*gamma(x)*gamma(x - 1/2) + + >>> multigamma(x, 3) + pi**(3/2)*gamma(x)*gamma(x - 1)*gamma(x - 1/2) + + Parameters + ========== + + p : order or dimension of the multivariate gamma function + + See Also + ======== + + gamma, lowergamma, uppergamma, polygamma, loggamma, digamma, trigamma, + sympy.functions.special.beta_functions.beta + + References + ========== + + .. 
[1] https://en.wikipedia.org/wiki/Multivariate_gamma_function + + """ + unbranched = True + + def fdiff(self, argindex=2): + from sympy.concrete.summations import Sum + if argindex == 2: + x, p = self.args + k = Dummy("k") + return self.func(x, p)*Sum(polygamma(0, x + (1 - k)/2), (k, 1, p)) + else: + raise ArgumentIndexError(self, argindex) + + @classmethod + def eval(cls, x, p): + from sympy.concrete.products import Product + if p.is_positive is False or p.is_integer is False: + raise ValueError('Order parameter p must be positive integer.') + k = Dummy("k") + return (pi**(p*(p - 1)/4)*Product(gamma(x + (1 - k)/2), + (k, 1, p))).doit() + + def _eval_conjugate(self): + x, p = self.args + return self.func(x.conjugate(), p) + + def _eval_is_real(self): + x, p = self.args + y = 2*x + if y.is_integer and (y <= (p - 1)) is True: + return False + if intlike(y) and (y <= (p - 1)): + return False + if y > (p - 1) or y.is_noninteger: + return True diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/special/hyper.py b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/special/hyper.py new file mode 100644 index 0000000000000000000000000000000000000000..3943e140821222a510c609a071b5dbbf08883745 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/special/hyper.py @@ -0,0 +1,1185 @@ +"""Hypergeometric and Meijer G-functions""" +from collections import Counter + +from sympy.core import S, Mod +from sympy.core.add import Add +from sympy.core.expr import Expr +from sympy.core.function import DefinedFunction, Derivative, ArgumentIndexError + +from sympy.core.containers import Tuple +from sympy.core.mul import Mul +from sympy.core.numbers import I, pi, oo, zoo +from sympy.core.parameters import global_parameters +from sympy.core.relational import Ne +from sympy.core.sorting import default_sort_key +from sympy.core.symbol import Dummy + +from sympy.external.gmpy import lcm +from sympy.functions import (sqrt, exp, log, sin, cos, asin, atan, + sinh, cosh, asinh, acosh, atanh, acoth) +from sympy.functions import factorial, RisingFactorial +from sympy.functions.elementary.complexes import Abs, re, unpolarify +from sympy.functions.elementary.exponential import exp_polar +from sympy.functions.elementary.integers import ceiling +from sympy.functions.elementary.piecewise import Piecewise +from sympy.logic.boolalg import (And, Or) +from sympy import ordered + + +class TupleArg(Tuple): + + # This method is only needed because hyper._eval_as_leading_term falls back + # (via super()) on using Function._eval_as_leading_term, which in turn + # calls as_leading_term on the args of the hyper. Ideally hyper should just + # have an _eval_as_leading_term method that handles all cases and this + # method should be removed because leading terms of tuples don't make + # sense. + def as_leading_term(self, *x, logx=None, cdir=0): + return TupleArg(*[f.as_leading_term(*x, logx=logx, cdir=cdir) for f in self.args]) + + def limit(self, x, xlim, dir='+'): + """ Compute limit x->xlim. + """ + from sympy.series.limits import limit + return TupleArg(*[limit(f, x, xlim, dir) for f in self.args]) + + +# TODO should __new__ accept **options? +# TODO should constructors should check if parameters are sensible? + + +def _prep_tuple(v): + """ + Turn an iterable argument *v* into a tuple and unpolarify, since both + hypergeometric and meijer g-functions are unbranched in their parameters. 
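+    As a rough illustration (assuming ``unpolarify`` collapses a full polar
+    turn such as ``exp_polar(2*I*pi)`` back to ``1``), polar factors in the
+    parameters are stripped on the way in:
+
+    >>> from sympy import exp_polar, I, pi
+    >>> from sympy.functions.special.hyper import _prep_tuple
+    >>> _prep_tuple((exp_polar(2*I*pi), 2))
+    (1, 2)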
+ + Examples + ======== + + >>> from sympy.functions.special.hyper import _prep_tuple + >>> _prep_tuple([1, 2, 3]) + (1, 2, 3) + >>> _prep_tuple((4, 5)) + (4, 5) + >>> _prep_tuple((7, 8, 9)) + (7, 8, 9) + + """ + return TupleArg(*[unpolarify(x) for x in v]) + + +class TupleParametersBase(DefinedFunction): + """ Base class that takes care of differentiation, when some of + the arguments are actually tuples. """ + # This is not deduced automatically since there are Tuples as arguments. + is_commutative = True + + def _eval_derivative(self, s): + try: + res = 0 + if self.args[0].has(s) or self.args[1].has(s): + for i, p in enumerate(self._diffargs): + m = self._diffargs[i].diff(s) + if m != 0: + res += self.fdiff((1, i))*m + return res + self.fdiff(3)*self.args[2].diff(s) + except (ArgumentIndexError, NotImplementedError): + return Derivative(self, s) + + +class hyper(TupleParametersBase): + r""" + The generalized hypergeometric function is defined by a series where + the ratios of successive terms are a rational function of the summation + index. When convergent, it is continued analytically to the largest + possible domain. + + Explanation + =========== + + The hypergeometric function depends on two vectors of parameters, called + the numerator parameters $a_p$, and the denominator parameters + $b_q$. It also has an argument $z$. The series definition is + + .. math :: + {}_pF_q\left(\begin{matrix} a_1, \cdots, a_p \\ b_1, \cdots, b_q \end{matrix} + \middle| z \right) + = \sum_{n=0}^\infty \frac{(a_1)_n \cdots (a_p)_n}{(b_1)_n \cdots (b_q)_n} + \frac{z^n}{n!}, + + where $(a)_n = (a)(a+1)\cdots(a+n-1)$ denotes the rising factorial. + + If one of the $b_q$ is a non-positive integer then the series is + undefined unless one of the $a_p$ is a larger (i.e., smaller in + magnitude) non-positive integer. If none of the $b_q$ is a + non-positive integer and one of the $a_p$ is a non-positive + integer, then the series reduces to a polynomial. To simplify the + following discussion, we assume that none of the $a_p$ or + $b_q$ is a non-positive integer. For more details, see the + references. + + The series converges for all $z$ if $p \le q$, and thus + defines an entire single-valued function in this case. If $p = + q+1$ the series converges for $|z| < 1$, and can be continued + analytically into a half-plane. If $p > q+1$ the series is + divergent for all $z$. + + Please note the hypergeometric function constructor currently does *not* + check if the parameters actually yield a well-defined function. + + Examples + ======== + + The parameters $a_p$ and $b_q$ can be passed as arbitrary + iterables, for example: + + >>> from sympy import hyper + >>> from sympy.abc import x, n, a + >>> h = hyper((1, 2, 3), [3, 4], x); h + hyper((1, 2), (4,), x) + >>> hyper((3, 1, 2), [3, 4], x, evaluate=False) # don't remove duplicates + hyper((1, 2, 3), (3, 4), x) + + There is also pretty printing (it looks better using Unicode): + + >>> from sympy import pprint + >>> pprint(h, use_unicode=False) + _ + |_ /1, 2 | \ + | | | x| + 2 1 \ 4 | / + + The parameters must always be iterables, even if they are vectors of + length one or zero: + + >>> hyper((1, ), [], x) + hyper((1,), (), x) + + But of course they may be variables (but if they depend on $x$ then you + should not expect much implemented functionality): + + >>> hyper((n, a), (n**2,), x) + hyper((a, n), (n**2,), x) + + The hypergeometric function generalizes many named special functions. 
+ The function ``hyperexpand()`` tries to express a hypergeometric function + using named special functions. For example: + + >>> from sympy import hyperexpand + >>> hyperexpand(hyper([], [], x)) + exp(x) + + You can also use ``expand_func()``: + + >>> from sympy import expand_func + >>> expand_func(x*hyper([1, 1], [2], -x)) + log(x + 1) + + More examples: + + >>> from sympy import S + >>> hyperexpand(hyper([], [S(1)/2], -x**2/4)) + cos(x) + >>> hyperexpand(x*hyper([S(1)/2, S(1)/2], [S(3)/2], x**2)) + asin(x) + + We can also sometimes ``hyperexpand()`` parametric functions: + + >>> from sympy.abc import a + >>> hyperexpand(hyper([-a], [], x)) + (1 - x)**a + + See Also + ======== + + sympy.simplify.hyperexpand + gamma + meijerg + + References + ========== + + .. [1] Luke, Y. L. (1969), The Special Functions and Their Approximations, + Volume 1 + .. [2] https://en.wikipedia.org/wiki/Generalized_hypergeometric_function + + """ + + + def __new__(cls, ap, bq, z, **kwargs): + # TODO should we check convergence conditions? + if kwargs.pop('evaluate', global_parameters.evaluate): + ca = Counter(Tuple(*ap)) + cb = Counter(Tuple(*bq)) + common = ca & cb + arg = ap, bq = [], [] + for i, c in enumerate((ca, cb)): + c -= common + for k in ordered(c): + arg[i].extend([k]*c[k]) + else: + ap = list(ordered(ap)) + bq = list(ordered(bq)) + return super().__new__(cls, _prep_tuple(ap), _prep_tuple(bq), z, **kwargs) + + @classmethod + def eval(cls, ap, bq, z): + if len(ap) <= len(bq) or (len(ap) == len(bq) + 1 and (Abs(z) <= 1) == True): + nz = unpolarify(z) + if z != nz: + return hyper(ap, bq, nz) + + def fdiff(self, argindex=3): + if argindex != 3: + raise ArgumentIndexError(self, argindex) + nap = Tuple(*[a + 1 for a in self.ap]) + nbq = Tuple(*[b + 1 for b in self.bq]) + fac = Mul(*self.ap)/Mul(*self.bq) + return fac*hyper(nap, nbq, self.argument) + + def _eval_expand_func(self, **hints): + from sympy.functions.special.gamma_functions import gamma + from sympy.simplify.hyperexpand import hyperexpand + if len(self.ap) == 2 and len(self.bq) == 1 and self.argument == 1: + a, b = self.ap + c = self.bq[0] + return gamma(c)*gamma(c - a - b)/gamma(c - a)/gamma(c - b) + return hyperexpand(self) + + def _eval_rewrite_as_Sum(self, ap, bq, z, **kwargs): + from sympy.concrete.summations import Sum + n = Dummy("n", integer=True) + rfap = [RisingFactorial(a, n) for a in ap] + rfbq = [RisingFactorial(b, n) for b in bq] + coeff = Mul(*rfap) / Mul(*rfbq) + return Piecewise((Sum(coeff * z**n / factorial(n), (n, 0, oo)), + self.convergence_statement), (self, True)) + + def _eval_as_leading_term(self, x, logx, cdir): + arg = self.args[2] + x0 = arg.subs(x, 0) + if x0 is S.NaN: + x0 = arg.limit(x, 0, dir='-' if re(cdir).is_negative else '+') + + if x0 is S.Zero: + return S.One + return super()._eval_as_leading_term(x, logx=logx, cdir=cdir) + + def _eval_nseries(self, x, n, logx, cdir=0): + + from sympy.series.order import Order + + arg = self.args[2] + x0 = arg.limit(x, 0) + ap = self.args[0] + bq = self.args[1] + + if not (arg == x and x0 == 0): + # It would be better to do something with arg.nseries here, rather + # than falling back on Function._eval_nseries. The code below + # though is not sufficient if arg is something like x/(x+1). 
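+            # Fallback: let Function._eval_nseries expand the composition and
+            # then run hyperexpand() over the result, so that any hyper()
+            # terms left in the series are rewritten in terms of named
+            # functions where possible.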
+ from sympy.simplify.hyperexpand import hyperexpand + return hyperexpand(super()._eval_nseries(x, n, logx)) + + terms = [] + + for i in range(n): + num = Mul(*[RisingFactorial(a, i) for a in ap]) + den = Mul(*[RisingFactorial(b, i) for b in bq]) + terms.append(((num/den) * (arg**i)) / factorial(i)) + + return (Add(*terms) + Order(x**n,x)) + + @property + def argument(self): + """ Argument of the hypergeometric function. """ + return self.args[2] + + @property + def ap(self): + """ Numerator parameters of the hypergeometric function. """ + return Tuple(*self.args[0]) + + @property + def bq(self): + """ Denominator parameters of the hypergeometric function. """ + return Tuple(*self.args[1]) + + @property + def _diffargs(self): + return self.ap + self.bq + + @property + def eta(self): + """ A quantity related to the convergence of the series. """ + return sum(self.ap) - sum(self.bq) + + @property + def radius_of_convergence(self): + """ + Compute the radius of convergence of the defining series. + + Explanation + =========== + + Note that even if this is not ``oo``, the function may still be + evaluated outside of the radius of convergence by analytic + continuation. But if this is zero, then the function is not actually + defined anywhere else. + + Examples + ======== + + >>> from sympy import hyper + >>> from sympy.abc import z + >>> hyper((1, 2), [3], z).radius_of_convergence + 1 + >>> hyper((1, 2, 3), [4], z).radius_of_convergence + 0 + >>> hyper((1, 2), (3, 4), z).radius_of_convergence + oo + + """ + if any(a.is_integer and (a <= 0) == True for a in self.ap + self.bq): + aints = [a for a in self.ap if a.is_Integer and (a <= 0) == True] + bints = [a for a in self.bq if a.is_Integer and (a <= 0) == True] + if len(aints) < len(bints): + return S.Zero + popped = False + for b in bints: + cancelled = False + while aints: + a = aints.pop() + if a >= b: + cancelled = True + break + popped = True + if not cancelled: + return S.Zero + if aints or popped: + # There are still non-positive numerator parameters. + # This is a polynomial. + return oo + if len(self.ap) == len(self.bq) + 1: + return S.One + elif len(self.ap) <= len(self.bq): + return oo + else: + return S.Zero + + @property + def convergence_statement(self): + """ Return a condition on z under which the series converges. """ + R = self.radius_of_convergence + if R == 0: + return False + if R == oo: + return True + # The special functions and their approximations, page 44 + e = self.eta + z = self.argument + c1 = And(re(e) < 0, abs(z) <= 1) + c2 = And(0 <= re(e), re(e) < 1, abs(z) <= 1, Ne(z, 1)) + c3 = And(re(e) >= 1, abs(z) < 1) + return Or(c1, c2, c3) + + def _eval_simplify(self, **kwargs): + from sympy.simplify.hyperexpand import hyperexpand + return hyperexpand(self) + + +class meijerg(TupleParametersBase): + r""" + The Meijer G-function is defined by a Mellin-Barnes type integral that + resembles an inverse Mellin transform. It generalizes the hypergeometric + functions. + + Explanation + =========== + + The Meijer G-function depends on four sets of parameters. There are + "*numerator parameters*" + $a_1, \ldots, a_n$ and $a_{n+1}, \ldots, a_p$, and there are + "*denominator parameters*" + $b_1, \ldots, b_m$ and $b_{m+1}, \ldots, b_q$. + Confusingly, it is traditionally denoted as follows (note the position + of $m$, $n$, $p$, $q$, and how they relate to the lengths of the four + parameter vectors): + + .. 
math :: + G_{p,q}^{m,n} \left(\begin{matrix}a_1, \cdots, a_n & a_{n+1}, \cdots, a_p \\ + b_1, \cdots, b_m & b_{m+1}, \cdots, b_q + \end{matrix} \middle| z \right). + + However, in SymPy the four parameter vectors are always available + separately (see examples), so that there is no need to keep track of the + decorating sub- and super-scripts on the G symbol. + + The G function is defined as the following integral: + + .. math :: + \frac{1}{2 \pi i} \int_L \frac{\prod_{j=1}^m \Gamma(b_j - s) + \prod_{j=1}^n \Gamma(1 - a_j + s)}{\prod_{j=m+1}^q \Gamma(1- b_j +s) + \prod_{j=n+1}^p \Gamma(a_j - s)} z^s \mathrm{d}s, + + where $\Gamma(z)$ is the gamma function. There are three possible + contours which we will not describe in detail here (see the references). + If the integral converges along more than one of them, the definitions + agree. The contours all separate the poles of $\Gamma(1-a_j+s)$ + from the poles of $\Gamma(b_k-s)$, so in particular the G function + is undefined if $a_j - b_k \in \mathbb{Z}_{>0}$ for some + $j \le n$ and $k \le m$. + + The conditions under which one of the contours yields a convergent integral + are complicated and we do not state them here, see the references. + + Please note currently the Meijer G-function constructor does *not* check any + convergence conditions. + + Examples + ======== + + You can pass the parameters either as four separate vectors: + + >>> from sympy import meijerg, Tuple, pprint + >>> from sympy.abc import x, a + >>> pprint(meijerg((1, 2), (a, 4), (5,), [], x), use_unicode=False) + __1, 2 /1, 2 4, a | \ + /__ | | x| + \_|4, 1 \ 5 | / + + Or as two nested vectors: + + >>> pprint(meijerg([(1, 2), (3, 4)], ([5], Tuple()), x), use_unicode=False) + __1, 2 /1, 2 3, 4 | \ + /__ | | x| + \_|4, 1 \ 5 | / + + As with the hypergeometric function, the parameters may be passed as + arbitrary iterables. Vectors of length zero and one also have to be + passed as iterables. The parameters need not be constants, but if they + depend on the argument then not much implemented functionality should be + expected. + + All the subvectors of parameters are available: + + >>> from sympy import pprint + >>> g = meijerg([1], [2], [3], [4], x) + >>> pprint(g, use_unicode=False) + __1, 1 /1 2 | \ + /__ | | x| + \_|2, 2 \3 4 | / + >>> g.an + (1,) + >>> g.ap + (1, 2) + >>> g.aother + (2,) + >>> g.bm + (3,) + >>> g.bq + (3, 4) + >>> g.bother + (4,) + + The Meijer G-function generalizes the hypergeometric functions. + In some cases it can be expressed in terms of hypergeometric functions, + using Slater's theorem. For example: + + >>> from sympy import hyperexpand + >>> from sympy.abc import a, b, c + >>> hyperexpand(meijerg([a], [], [c], [b], x), allow_hyper=True) + x**c*gamma(-a + c + 1)*hyper((-a + c + 1,), + (-b + c + 1,), -x)/gamma(-b + c + 1) + + Thus the Meijer G-function also subsumes many named functions as special + cases. You can use ``expand_func()`` or ``hyperexpand()`` to (try to) + rewrite a Meijer G-function in terms of named special functions. For + example: + + >>> from sympy import expand_func, S + >>> expand_func(meijerg([[],[]], [[0],[]], -x)) + exp(x) + >>> hyperexpand(meijerg([[],[]], [[S(1)/2],[0]], (x/2)**2)) + sin(x)/sqrt(pi) + + See Also + ======== + + hyper + sympy.simplify.hyperexpand + + References + ========== + + .. [1] Luke, Y. L. (1969), The Special Functions and Their Approximations, + Volume 1 + .. 
[2] https://en.wikipedia.org/wiki/Meijer_G-function + + """ + + + def __new__(cls, *args, **kwargs): + if len(args) == 5: + args = [(args[0], args[1]), (args[2], args[3]), args[4]] + if len(args) != 3: + raise TypeError("args must be either as, as', bs, bs', z or " + "as, bs, z") + + def tr(p): + if len(p) != 2: + raise TypeError("wrong argument") + p = [list(ordered(i)) for i in p] + return TupleArg(_prep_tuple(p[0]), _prep_tuple(p[1])) + + arg0, arg1 = tr(args[0]), tr(args[1]) + if Tuple(arg0, arg1).has(oo, zoo, -oo): + raise ValueError("G-function parameters must be finite") + if any((a - b).is_Integer and a - b > 0 + for a in arg0[0] for b in arg1[0]): + raise ValueError("no parameter a1, ..., an may differ from " + "any b1, ..., bm by a positive integer") + + # TODO should we check convergence conditions? + return super().__new__(cls, arg0, arg1, args[2], **kwargs) + + def fdiff(self, argindex=3): + if argindex != 3: + return self._diff_wrt_parameter(argindex[1]) + if len(self.an) >= 1: + a = list(self.an) + a[0] -= 1 + G = meijerg(a, self.aother, self.bm, self.bother, self.argument) + return 1/self.argument * ((self.an[0] - 1)*self + G) + elif len(self.bm) >= 1: + b = list(self.bm) + b[0] += 1 + G = meijerg(self.an, self.aother, b, self.bother, self.argument) + return 1/self.argument * (self.bm[0]*self - G) + else: + return S.Zero + + def _diff_wrt_parameter(self, idx): + # Differentiation wrt a parameter can only be done in very special + # cases. In particular, if we want to differentiate with respect to + # `a`, all other gamma factors have to reduce to rational functions. + # + # Let MT denote mellin transform. Suppose T(-s) is the gamma factor + # appearing in the definition of G. Then + # + # MT(log(z)G(z)) = d/ds T(s) = d/da T(s) + ... + # + # Thus d/da G(z) = log(z)G(z) - ... + # The ... can be evaluated as a G function under the above conditions, + # the formula being most easily derived by using + # + # d Gamma(s + n) Gamma(s + n) / 1 1 1 \ + # -- ------------ = ------------ | - + ---- + ... + --------- | + # ds Gamma(s) Gamma(s) \ s s + 1 s + n - 1 / + # + # which follows from the difference equation of the digamma function. + # (There is a similar equation for -n instead of +n). + + # We first figure out how to pair the parameters. + an = list(self.an) + ap = list(self.aother) + bm = list(self.bm) + bq = list(self.bother) + if idx < len(an): + an.pop(idx) + else: + idx -= len(an) + if idx < len(ap): + ap.pop(idx) + else: + idx -= len(ap) + if idx < len(bm): + bm.pop(idx) + else: + bq.pop(idx - len(bm)) + pairs1 = [] + pairs2 = [] + for l1, l2, pairs in [(an, bq, pairs1), (ap, bm, pairs2)]: + while l1: + x = l1.pop() + found = None + for i, y in enumerate(l2): + if not Mod((x - y).simplify(), 1): + found = i + break + if found is None: + raise NotImplementedError('Derivative not expressible ' + 'as G-function?') + y = l2[i] + l2.pop(i) + pairs.append((x, y)) + + # Now build the result. 
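+        # d/d(parameter) G(z) = log(z)*G(z) - (shifted G-functions): for each
+        # matched pair of parameters differing by an integer n, subtract one
+        # G-function per unit step, with an extra parameter pushed onto the
+        # numerator and denominator lists, as in the difference equation for
+        # Gamma(s + n)/Gamma(s) sketched above.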
+ res = log(self.argument)*self + + for a, b in pairs1: + sign = 1 + n = a - b + base = b + if n < 0: + sign = -1 + n = b - a + base = a + for k in range(n): + res -= sign*meijerg(self.an + (base + k + 1,), self.aother, + self.bm, self.bother + (base + k + 0,), + self.argument) + + for a, b in pairs2: + sign = 1 + n = b - a + base = a + if n < 0: + sign = -1 + n = a - b + base = b + for k in range(n): + res -= sign*meijerg(self.an, self.aother + (base + k + 1,), + self.bm + (base + k + 0,), self.bother, + self.argument) + + return res + + def get_period(self): + """ + Return a number $P$ such that $G(x*exp(I*P)) == G(x)$. + + Examples + ======== + + >>> from sympy import meijerg, pi, S + >>> from sympy.abc import z + + >>> meijerg([1], [], [], [], z).get_period() + 2*pi + >>> meijerg([pi], [], [], [], z).get_period() + oo + >>> meijerg([1, 2], [], [], [], z).get_period() + oo + >>> meijerg([1,1], [2], [1, S(1)/2, S(1)/3], [1], z).get_period() + 12*pi + + """ + # This follows from slater's theorem. + def compute(l): + # first check that no two differ by an integer + for i, b in enumerate(l): + if not b.is_Rational: + return oo + for j in range(i + 1, len(l)): + if not Mod((b - l[j]).simplify(), 1): + return oo + return lcm(*(x.q for x in l)) + beta = compute(self.bm) + alpha = compute(self.an) + p, q = len(self.ap), len(self.bq) + if p == q: + if oo in (alpha, beta): + return oo + return 2*pi*lcm(alpha, beta) + elif p < q: + return 2*pi*beta + else: + return 2*pi*alpha + + def _eval_expand_func(self, **hints): + from sympy.simplify.hyperexpand import hyperexpand + return hyperexpand(self) + + def _eval_evalf(self, prec): + # The default code is insufficient for polar arguments. + # mpmath provides an optional argument "r", which evaluates + # G(z**(1/r)). I am not sure what its intended use is, but we hijack it + # here in the following way: to evaluate at a number z of |argument| + # less than (say) n*pi, we put r=1/n, compute z' = root(z, n) + # (carefully so as not to loose the branch information), and evaluate + # G(z'**(1/r)) = G(z'**n) = G(z). + import mpmath + znum = self.argument._eval_evalf(prec) + if znum.has(exp_polar): + znum, branch = znum.as_coeff_mul(exp_polar) + if len(branch) != 1: + return + branch = branch[0].args[0]/I + else: + branch = S.Zero + n = ceiling(abs(branch/pi)) + 1 + znum = znum**(S.One/n)*exp(I*branch / n) + + # Convert all args to mpf or mpc + try: + [z, r, ap, bq] = [arg._to_mpmath(prec) + for arg in [znum, 1/n, self.args[0], self.args[1]]] + except ValueError: + return + + with mpmath.workprec(prec): + v = mpmath.meijerg(ap, bq, z, r) + + return Expr._from_mpmath(v, prec) + + def _eval_as_leading_term(self, x, logx, cdir): + from sympy.simplify.hyperexpand import hyperexpand + return hyperexpand(self).as_leading_term(x, logx=logx, cdir=cdir) + + def integrand(self, s): + """ Get the defining integrand D(s). """ + from sympy.functions.special.gamma_functions import gamma + return self.argument**s \ + * Mul(*(gamma(b - s) for b in self.bm)) \ + * Mul(*(gamma(1 - a + s) for a in self.an)) \ + / Mul(*(gamma(1 - b + s) for b in self.bother)) \ + / Mul(*(gamma(a - s) for a in self.aother)) + + @property + def argument(self): + """ Argument of the Meijer G-function. """ + return self.args[2] + + @property + def an(self): + """ First set of numerator parameters. """ + return Tuple(*self.args[0][0]) + + @property + def ap(self): + """ Combined numerator parameters. 
""" + return Tuple(*(self.args[0][0] + self.args[0][1])) + + @property + def aother(self): + """ Second set of numerator parameters. """ + return Tuple(*self.args[0][1]) + + @property + def bm(self): + """ First set of denominator parameters. """ + return Tuple(*self.args[1][0]) + + @property + def bq(self): + """ Combined denominator parameters. """ + return Tuple(*(self.args[1][0] + self.args[1][1])) + + @property + def bother(self): + """ Second set of denominator parameters. """ + return Tuple(*self.args[1][1]) + + @property + def _diffargs(self): + return self.ap + self.bq + + @property + def nu(self): + """ A quantity related to the convergence region of the integral, + c.f. references. """ + return sum(self.bq) - sum(self.ap) + + @property + def delta(self): + """ A quantity related to the convergence region of the integral, + c.f. references. """ + return len(self.bm) + len(self.an) - S(len(self.ap) + len(self.bq))/2 + + @property + def is_number(self): + """ Returns true if expression has numeric data only. """ + return not self.free_symbols + + +class HyperRep(DefinedFunction): + """ + A base class for "hyper representation functions". + + This is used exclusively in ``hyperexpand()``, but fits more logically here. + + pFq is branched at 1 if p == q+1. For use with slater-expansion, we want + define an "analytic continuation" to all polar numbers, which is + continuous on circles and on the ray t*exp_polar(I*pi). Moreover, we want + a "nice" expression for the various cases. + + This base class contains the core logic, concrete derived classes only + supply the actual functions. + + """ + + + @classmethod + def eval(cls, *args): + newargs = tuple(map(unpolarify, args[:-1])) + args[-1:] + if args != newargs: + return cls(*newargs) + + @classmethod + def _expr_small(cls, x): + """ An expression for F(x) which holds for |x| < 1. """ + raise NotImplementedError + + @classmethod + def _expr_small_minus(cls, x): + """ An expression for F(-x) which holds for |x| < 1. """ + raise NotImplementedError + + @classmethod + def _expr_big(cls, x, n): + """ An expression for F(exp_polar(2*I*pi*n)*x), |x| > 1. """ + raise NotImplementedError + + @classmethod + def _expr_big_minus(cls, x, n): + """ An expression for F(exp_polar(2*I*pi*n + pi*I)*x), |x| > 1. """ + raise NotImplementedError + + def _eval_rewrite_as_nonrep(self, *args, **kwargs): + x, n = self.args[-1].extract_branch_factor(allow_half=True) + minus = False + newargs = self.args[:-1] + (x,) + if not n.is_Integer: + minus = True + n -= S.Half + newerargs = newargs + (n,) + if minus: + small = self._expr_small_minus(*newargs) + big = self._expr_big_minus(*newerargs) + else: + small = self._expr_small(*newargs) + big = self._expr_big(*newerargs) + + if big == small: + return small + return Piecewise((big, abs(x) > 1), (small, True)) + + def _eval_rewrite_as_nonrepsmall(self, *args, **kwargs): + x, n = self.args[-1].extract_branch_factor(allow_half=True) + args = self.args[:-1] + (x,) + if not n.is_Integer: + return self._expr_small_minus(*args) + return self._expr_small(*args) + + +class HyperRep_power1(HyperRep): + """ Return a representative for hyper([-a], [], z) == (1 - z)**a. 
""" + + @classmethod + def _expr_small(cls, a, x): + return (1 - x)**a + + @classmethod + def _expr_small_minus(cls, a, x): + return (1 + x)**a + + @classmethod + def _expr_big(cls, a, x, n): + if a.is_integer: + return cls._expr_small(a, x) + return (x - 1)**a*exp((2*n - 1)*pi*I*a) + + @classmethod + def _expr_big_minus(cls, a, x, n): + if a.is_integer: + return cls._expr_small_minus(a, x) + return (1 + x)**a*exp(2*n*pi*I*a) + + +class HyperRep_power2(HyperRep): + """ Return a representative for hyper([a, a - 1/2], [2*a], z). """ + + @classmethod + def _expr_small(cls, a, x): + return 2**(2*a - 1)*(1 + sqrt(1 - x))**(1 - 2*a) + + @classmethod + def _expr_small_minus(cls, a, x): + return 2**(2*a - 1)*(1 + sqrt(1 + x))**(1 - 2*a) + + @classmethod + def _expr_big(cls, a, x, n): + sgn = -1 + if n.is_odd: + sgn = 1 + n -= 1 + return 2**(2*a - 1)*(1 + sgn*I*sqrt(x - 1))**(1 - 2*a) \ + *exp(-2*n*pi*I*a) + + @classmethod + def _expr_big_minus(cls, a, x, n): + sgn = 1 + if n.is_odd: + sgn = -1 + return sgn*2**(2*a - 1)*(sqrt(1 + x) + sgn)**(1 - 2*a)*exp(-2*pi*I*a*n) + + +class HyperRep_log1(HyperRep): + """ Represent -z*hyper([1, 1], [2], z) == log(1 - z). """ + @classmethod + def _expr_small(cls, x): + return log(1 - x) + + @classmethod + def _expr_small_minus(cls, x): + return log(1 + x) + + @classmethod + def _expr_big(cls, x, n): + return log(x - 1) + (2*n - 1)*pi*I + + @classmethod + def _expr_big_minus(cls, x, n): + return log(1 + x) + 2*n*pi*I + + +class HyperRep_atanh(HyperRep): + """ Represent hyper([1/2, 1], [3/2], z) == atanh(sqrt(z))/sqrt(z). """ + @classmethod + def _expr_small(cls, x): + return atanh(sqrt(x))/sqrt(x) + + def _expr_small_minus(cls, x): + return atan(sqrt(x))/sqrt(x) + + def _expr_big(cls, x, n): + if n.is_even: + return (acoth(sqrt(x)) + I*pi/2)/sqrt(x) + else: + return (acoth(sqrt(x)) - I*pi/2)/sqrt(x) + + def _expr_big_minus(cls, x, n): + if n.is_even: + return atan(sqrt(x))/sqrt(x) + else: + return (atan(sqrt(x)) - pi)/sqrt(x) + + +class HyperRep_asin1(HyperRep): + """ Represent hyper([1/2, 1/2], [3/2], z) == asin(sqrt(z))/sqrt(z). """ + @classmethod + def _expr_small(cls, z): + return asin(sqrt(z))/sqrt(z) + + @classmethod + def _expr_small_minus(cls, z): + return asinh(sqrt(z))/sqrt(z) + + @classmethod + def _expr_big(cls, z, n): + return S.NegativeOne**n*((S.Half - n)*pi/sqrt(z) + I*acosh(sqrt(z))/sqrt(z)) + + @classmethod + def _expr_big_minus(cls, z, n): + return S.NegativeOne**n*(asinh(sqrt(z))/sqrt(z) + n*pi*I/sqrt(z)) + + +class HyperRep_asin2(HyperRep): + """ Represent hyper([1, 1], [3/2], z) == asin(sqrt(z))/sqrt(z)/sqrt(1-z). """ + # TODO this can be nicer + @classmethod + def _expr_small(cls, z): + return HyperRep_asin1._expr_small(z) \ + /HyperRep_power1._expr_small(S.Half, z) + + @classmethod + def _expr_small_minus(cls, z): + return HyperRep_asin1._expr_small_minus(z) \ + /HyperRep_power1._expr_small_minus(S.Half, z) + + @classmethod + def _expr_big(cls, z, n): + return HyperRep_asin1._expr_big(z, n) \ + /HyperRep_power1._expr_big(S.Half, z, n) + + @classmethod + def _expr_big_minus(cls, z, n): + return HyperRep_asin1._expr_big_minus(z, n) \ + /HyperRep_power1._expr_big_minus(S.Half, z, n) + + +class HyperRep_sqrts1(HyperRep): + """ Return a representative for hyper([-a, 1/2 - a], [1/2], z). 
""" + + @classmethod + def _expr_small(cls, a, z): + return ((1 - sqrt(z))**(2*a) + (1 + sqrt(z))**(2*a))/2 + + @classmethod + def _expr_small_minus(cls, a, z): + return (1 + z)**a*cos(2*a*atan(sqrt(z))) + + @classmethod + def _expr_big(cls, a, z, n): + if n.is_even: + return ((sqrt(z) + 1)**(2*a)*exp(2*pi*I*n*a) + + (sqrt(z) - 1)**(2*a)*exp(2*pi*I*(n - 1)*a))/2 + else: + n -= 1 + return ((sqrt(z) - 1)**(2*a)*exp(2*pi*I*a*(n + 1)) + + (sqrt(z) + 1)**(2*a)*exp(2*pi*I*a*n))/2 + + @classmethod + def _expr_big_minus(cls, a, z, n): + if n.is_even: + return (1 + z)**a*exp(2*pi*I*n*a)*cos(2*a*atan(sqrt(z))) + else: + return (1 + z)**a*exp(2*pi*I*n*a)*cos(2*a*atan(sqrt(z)) - 2*pi*a) + + +class HyperRep_sqrts2(HyperRep): + """ Return a representative for + sqrt(z)/2*[(1-sqrt(z))**2a - (1 + sqrt(z))**2a] + == -2*z/(2*a+1) d/dz hyper([-a - 1/2, -a], [1/2], z)""" + + @classmethod + def _expr_small(cls, a, z): + return sqrt(z)*((1 - sqrt(z))**(2*a) - (1 + sqrt(z))**(2*a))/2 + + @classmethod + def _expr_small_minus(cls, a, z): + return sqrt(z)*(1 + z)**a*sin(2*a*atan(sqrt(z))) + + @classmethod + def _expr_big(cls, a, z, n): + if n.is_even: + return sqrt(z)/2*((sqrt(z) - 1)**(2*a)*exp(2*pi*I*a*(n - 1)) - + (sqrt(z) + 1)**(2*a)*exp(2*pi*I*a*n)) + else: + n -= 1 + return sqrt(z)/2*((sqrt(z) - 1)**(2*a)*exp(2*pi*I*a*(n + 1)) - + (sqrt(z) + 1)**(2*a)*exp(2*pi*I*a*n)) + + def _expr_big_minus(cls, a, z, n): + if n.is_even: + return (1 + z)**a*exp(2*pi*I*n*a)*sqrt(z)*sin(2*a*atan(sqrt(z))) + else: + return (1 + z)**a*exp(2*pi*I*n*a)*sqrt(z) \ + *sin(2*a*atan(sqrt(z)) - 2*pi*a) + + +class HyperRep_log2(HyperRep): + """ Represent log(1/2 + sqrt(1 - z)/2) == -z/4*hyper([3/2, 1, 1], [2, 2], z) """ + + @classmethod + def _expr_small(cls, z): + return log(S.Half + sqrt(1 - z)/2) + + @classmethod + def _expr_small_minus(cls, z): + return log(S.Half + sqrt(1 + z)/2) + + @classmethod + def _expr_big(cls, z, n): + if n.is_even: + return (n - S.Half)*pi*I + log(sqrt(z)/2) + I*asin(1/sqrt(z)) + else: + return (n - S.Half)*pi*I + log(sqrt(z)/2) - I*asin(1/sqrt(z)) + + def _expr_big_minus(cls, z, n): + if n.is_even: + return pi*I*n + log(S.Half + sqrt(1 + z)/2) + else: + return pi*I*n + log(sqrt(1 + z)/2 - S.Half) + + +class HyperRep_cosasin(HyperRep): + """ Represent hyper([a, -a], [1/2], z) == cos(2*a*asin(sqrt(z))). """ + # Note there are many alternative expressions, e.g. as powers of a sum of + # square roots. + + @classmethod + def _expr_small(cls, a, z): + return cos(2*a*asin(sqrt(z))) + + @classmethod + def _expr_small_minus(cls, a, z): + return cosh(2*a*asinh(sqrt(z))) + + @classmethod + def _expr_big(cls, a, z, n): + return cosh(2*a*acosh(sqrt(z)) + a*pi*I*(2*n - 1)) + + @classmethod + def _expr_big_minus(cls, a, z, n): + return cosh(2*a*asinh(sqrt(z)) + 2*a*pi*I*n) + + +class HyperRep_sinasin(HyperRep): + """ Represent 2*a*z*hyper([1 - a, 1 + a], [3/2], z) + == sqrt(z)/sqrt(1-z)*sin(2*a*asin(sqrt(z))) """ + + @classmethod + def _expr_small(cls, a, z): + return sqrt(z)/sqrt(1 - z)*sin(2*a*asin(sqrt(z))) + + @classmethod + def _expr_small_minus(cls, a, z): + return -sqrt(z)/sqrt(1 + z)*sinh(2*a*asinh(sqrt(z))) + + @classmethod + def _expr_big(cls, a, z, n): + return -1/sqrt(1 - 1/z)*sinh(2*a*acosh(sqrt(z)) + a*pi*I*(2*n - 1)) + + @classmethod + def _expr_big_minus(cls, a, z, n): + return -1/sqrt(1 + 1/z)*sinh(2*a*asinh(sqrt(z)) + 2*a*pi*I*n) + +class appellf1(DefinedFunction): + r""" + This is the Appell hypergeometric function of two variables as: + + .. 
math :: + F_1(a,b_1,b_2,c,x,y) = \sum_{m=0}^{\infty} \sum_{n=0}^{\infty} + \frac{(a)_{m+n} (b_1)_m (b_2)_n}{(c)_{m+n}} + \frac{x^m y^n}{m! n!}. + + Examples + ======== + + >>> from sympy import appellf1, symbols + >>> x, y, a, b1, b2, c = symbols('x y a b1 b2 c') + >>> appellf1(2., 1., 6., 4., 5., 6.) + 0.0063339426292673 + >>> appellf1(12., 12., 6., 4., 0.5, 0.12) + 172870711.659936 + >>> appellf1(40, 2, 6, 4, 15, 60) + appellf1(40, 2, 6, 4, 15, 60) + >>> appellf1(20., 12., 10., 3., 0.5, 0.12) + 15605338197184.4 + >>> appellf1(40, 2, 6, 4, x, y) + appellf1(40, 2, 6, 4, x, y) + >>> appellf1(a, b1, b2, c, x, y) + appellf1(a, b1, b2, c, x, y) + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Appell_series + .. [2] https://functions.wolfram.com/HypergeometricFunctions/AppellF1/ + + """ + + @classmethod + def eval(cls, a, b1, b2, c, x, y): + if default_sort_key(b1) > default_sort_key(b2): + b1, b2 = b2, b1 + x, y = y, x + return cls(a, b1, b2, c, x, y) + elif b1 == b2 and default_sort_key(x) > default_sort_key(y): + x, y = y, x + return cls(a, b1, b2, c, x, y) + if x == 0 and y == 0: + return S.One + + def fdiff(self, argindex=5): + a, b1, b2, c, x, y = self.args + if argindex == 5: + return (a*b1/c)*appellf1(a + 1, b1 + 1, b2, c + 1, x, y) + elif argindex == 6: + return (a*b2/c)*appellf1(a + 1, b1, b2 + 1, c + 1, x, y) + elif argindex in (1, 2, 3, 4): + return Derivative(self, self.args[argindex-1]) + else: + raise ArgumentIndexError(self, argindex) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/special/mathieu_functions.py b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/special/mathieu_functions.py new file mode 100644 index 0000000000000000000000000000000000000000..66bccd8d3e6dd357e1e0b93fb5cb5ad4c5f1367f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/special/mathieu_functions.py @@ -0,0 +1,269 @@ +""" This module contains the Mathieu functions. +""" + +from sympy.core.function import DefinedFunction, ArgumentIndexError +from sympy.functions.elementary.miscellaneous import sqrt +from sympy.functions.elementary.trigonometric import sin, cos + + +class MathieuBase(DefinedFunction): + """ + Abstract base class for Mathieu functions. + + This class is meant to reduce code duplication. + + """ + + unbranched = True + + def _eval_conjugate(self): + a, q, z = self.args + return self.func(a.conjugate(), q.conjugate(), z.conjugate()) + + +class mathieus(MathieuBase): + r""" + The Mathieu Sine function $S(a,q,z)$. + + Explanation + =========== + + This function is one solution of the Mathieu differential equation: + + .. math :: + y(x)^{\prime\prime} + (a - 2 q \cos(2 x)) y(x) = 0 + + The other solution is the Mathieu Cosine function. + + Examples + ======== + + >>> from sympy import diff, mathieus + >>> from sympy.abc import a, q, z + + >>> mathieus(a, q, z) + mathieus(a, q, z) + + >>> mathieus(a, 0, z) + sin(sqrt(a)*z) + + >>> diff(mathieus(a, q, z), z) + mathieusprime(a, q, z) + + See Also + ======== + + mathieuc: Mathieu cosine function. + mathieusprime: Derivative of Mathieu sine function. + mathieucprime: Derivative of Mathieu cosine function. + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Mathieu_function + .. [2] https://dlmf.nist.gov/28 + .. [3] https://mathworld.wolfram.com/MathieuFunction.html + .. 
[4] https://functions.wolfram.com/MathieuandSpheroidalFunctions/MathieuS/ + + """ + + def fdiff(self, argindex=1): + if argindex == 3: + a, q, z = self.args + return mathieusprime(a, q, z) + else: + raise ArgumentIndexError(self, argindex) + + @classmethod + def eval(cls, a, q, z): + if q.is_Number and q.is_zero: + return sin(sqrt(a)*z) + # Try to pull out factors of -1 + if z.could_extract_minus_sign(): + return -cls(a, q, -z) + + +class mathieuc(MathieuBase): + r""" + The Mathieu Cosine function $C(a,q,z)$. + + Explanation + =========== + + This function is one solution of the Mathieu differential equation: + + .. math :: + y(x)^{\prime\prime} + (a - 2 q \cos(2 x)) y(x) = 0 + + The other solution is the Mathieu Sine function. + + Examples + ======== + + >>> from sympy import diff, mathieuc + >>> from sympy.abc import a, q, z + + >>> mathieuc(a, q, z) + mathieuc(a, q, z) + + >>> mathieuc(a, 0, z) + cos(sqrt(a)*z) + + >>> diff(mathieuc(a, q, z), z) + mathieucprime(a, q, z) + + See Also + ======== + + mathieus: Mathieu sine function + mathieusprime: Derivative of Mathieu sine function + mathieucprime: Derivative of Mathieu cosine function + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Mathieu_function + .. [2] https://dlmf.nist.gov/28 + .. [3] https://mathworld.wolfram.com/MathieuFunction.html + .. [4] https://functions.wolfram.com/MathieuandSpheroidalFunctions/MathieuC/ + + """ + + def fdiff(self, argindex=1): + if argindex == 3: + a, q, z = self.args + return mathieucprime(a, q, z) + else: + raise ArgumentIndexError(self, argindex) + + @classmethod + def eval(cls, a, q, z): + if q.is_Number and q.is_zero: + return cos(sqrt(a)*z) + # Try to pull out factors of -1 + if z.could_extract_minus_sign(): + return cls(a, q, -z) + + +class mathieusprime(MathieuBase): + r""" + The derivative $S^{\prime}(a,q,z)$ of the Mathieu Sine function. + + Explanation + =========== + + This function is one solution of the Mathieu differential equation: + + .. math :: + y(x)^{\prime\prime} + (a - 2 q \cos(2 x)) y(x) = 0 + + The other solution is the Mathieu Cosine function. + + Examples + ======== + + >>> from sympy import diff, mathieusprime + >>> from sympy.abc import a, q, z + + >>> mathieusprime(a, q, z) + mathieusprime(a, q, z) + + >>> mathieusprime(a, 0, z) + sqrt(a)*cos(sqrt(a)*z) + + >>> diff(mathieusprime(a, q, z), z) + (-a + 2*q*cos(2*z))*mathieus(a, q, z) + + See Also + ======== + + mathieus: Mathieu sine function + mathieuc: Mathieu cosine function + mathieucprime: Derivative of Mathieu cosine function + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Mathieu_function + .. [2] https://dlmf.nist.gov/28 + .. [3] https://mathworld.wolfram.com/MathieuFunction.html + .. [4] https://functions.wolfram.com/MathieuandSpheroidalFunctions/MathieuSPrime/ + + """ + + def fdiff(self, argindex=1): + if argindex == 3: + a, q, z = self.args + return (2*q*cos(2*z) - a)*mathieus(a, q, z) + else: + raise ArgumentIndexError(self, argindex) + + @classmethod + def eval(cls, a, q, z): + if q.is_Number and q.is_zero: + return sqrt(a)*cos(sqrt(a)*z) + # Try to pull out factors of -1 + if z.could_extract_minus_sign(): + return cls(a, q, -z) + + +class mathieucprime(MathieuBase): + r""" + The derivative $C^{\prime}(a,q,z)$ of the Mathieu Cosine function. + + Explanation + =========== + + This function is one solution of the Mathieu differential equation: + + .. math :: + y(x)^{\prime\prime} + (a - 2 q \cos(2 x)) y(x) = 0 + + The other solution is the Mathieu Sine function. 
+ + Examples + ======== + + >>> from sympy import diff, mathieucprime + >>> from sympy.abc import a, q, z + + >>> mathieucprime(a, q, z) + mathieucprime(a, q, z) + + >>> mathieucprime(a, 0, z) + -sqrt(a)*sin(sqrt(a)*z) + + >>> diff(mathieucprime(a, q, z), z) + (-a + 2*q*cos(2*z))*mathieuc(a, q, z) + + See Also + ======== + + mathieus: Mathieu sine function + mathieuc: Mathieu cosine function + mathieusprime: Derivative of Mathieu sine function + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Mathieu_function + .. [2] https://dlmf.nist.gov/28 + .. [3] https://mathworld.wolfram.com/MathieuFunction.html + .. [4] https://functions.wolfram.com/MathieuandSpheroidalFunctions/MathieuCPrime/ + + """ + + def fdiff(self, argindex=1): + if argindex == 3: + a, q, z = self.args + return (2*q*cos(2*z) - a)*mathieuc(a, q, z) + else: + raise ArgumentIndexError(self, argindex) + + @classmethod + def eval(cls, a, q, z): + if q.is_Number and q.is_zero: + return -sqrt(a)*sin(sqrt(a)*z) + # Try to pull out factors of -1 + if z.could_extract_minus_sign(): + return -cls(a, q, -z) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/special/polynomials.py b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/special/polynomials.py new file mode 100644 index 0000000000000000000000000000000000000000..5816baef600baf957c31a9dddaa5571da86d754a --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/special/polynomials.py @@ -0,0 +1,1447 @@ +""" +This module mainly implements special orthogonal polynomials. + +See also functions.combinatorial.numbers which contains some +combinatorial polynomials. + +""" + +from sympy.core import Rational +from sympy.core.function import DefinedFunction, ArgumentIndexError +from sympy.core.singleton import S +from sympy.core.symbol import Dummy +from sympy.functions.combinatorial.factorials import binomial, factorial, RisingFactorial +from sympy.functions.elementary.complexes import re +from sympy.functions.elementary.exponential import exp +from sympy.functions.elementary.integers import floor +from sympy.functions.elementary.miscellaneous import sqrt +from sympy.functions.elementary.trigonometric import cos, sec +from sympy.functions.special.gamma_functions import gamma +from sympy.functions.special.hyper import hyper +from sympy.polys.orthopolys import (chebyshevt_poly, chebyshevu_poly, + gegenbauer_poly, hermite_poly, hermite_prob_poly, + jacobi_poly, laguerre_poly, legendre_poly) + +_x = Dummy('x') + + +class OrthogonalPolynomial(DefinedFunction): + """Base class for orthogonal polynomials. + """ + + @classmethod + def _eval_at_order(cls, n, x): + if n.is_integer and n >= 0: + return cls._ortho_poly(int(n), _x).subs(_x, x) + + def _eval_conjugate(self): + return self.func(self.args[0], self.args[1].conjugate()) + +#---------------------------------------------------------------------------- +# Jacobi polynomials +# + + +class jacobi(OrthogonalPolynomial): + r""" + Jacobi polynomial $P_n^{\left(\alpha, \beta\right)}(x)$. + + Explanation + =========== + + ``jacobi(n, alpha, beta, x)`` gives the $n$th Jacobi polynomial + in $x$, $P_n^{\left(\alpha, \beta\right)}(x)$. + + The Jacobi polynomials are orthogonal on $[-1, 1]$ with respect + to the weight $\left(1-x\right)^\alpha \left(1+x\right)^\beta$. 
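+    Written out, the corresponding orthogonality relation (the integral that
+    ``jacobi_normalized`` below rescales to $\delta_{m,n}$) is
+
+    .. math::
+        \int_{-1}^{1} P_m^{\left(\alpha, \beta\right)}(x)
+        P_n^{\left(\alpha, \beta\right)}(x)
+        \left(1-x\right)^{\alpha} \left(1+x\right)^{\beta} \mathrm{d}x
+        = \frac{2^{\alpha+\beta+1}}{2n+\alpha+\beta+1}
+        \frac{\Gamma(n+\alpha+1)\,\Gamma(n+\beta+1)}
+        {\Gamma(n+\alpha+\beta+1)\, n!}\, \delta_{m,n}.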
+ + Examples + ======== + + >>> from sympy import jacobi, S, conjugate, diff + >>> from sympy.abc import a, b, n, x + + >>> jacobi(0, a, b, x) + 1 + >>> jacobi(1, a, b, x) + a/2 - b/2 + x*(a/2 + b/2 + 1) + >>> jacobi(2, a, b, x) + a**2/8 - a*b/4 - a/8 + b**2/8 - b/8 + x**2*(a**2/8 + a*b/4 + 7*a/8 + b**2/8 + 7*b/8 + 3/2) + x*(a**2/4 + 3*a/4 - b**2/4 - 3*b/4) - 1/2 + + >>> jacobi(n, a, b, x) + jacobi(n, a, b, x) + + >>> jacobi(n, a, a, x) + RisingFactorial(a + 1, n)*gegenbauer(n, + a + 1/2, x)/RisingFactorial(2*a + 1, n) + + >>> jacobi(n, 0, 0, x) + legendre(n, x) + + >>> jacobi(n, S(1)/2, S(1)/2, x) + RisingFactorial(3/2, n)*chebyshevu(n, x)/factorial(n + 1) + + >>> jacobi(n, -S(1)/2, -S(1)/2, x) + RisingFactorial(1/2, n)*chebyshevt(n, x)/factorial(n) + + >>> jacobi(n, a, b, -x) + (-1)**n*jacobi(n, b, a, x) + + >>> jacobi(n, a, b, 0) + gamma(a + n + 1)*hyper((-n, -b - n), (a + 1,), -1)/(2**n*factorial(n)*gamma(a + 1)) + >>> jacobi(n, a, b, 1) + RisingFactorial(a + 1, n)/factorial(n) + + >>> conjugate(jacobi(n, a, b, x)) + jacobi(n, conjugate(a), conjugate(b), conjugate(x)) + + >>> diff(jacobi(n,a,b,x), x) + (a/2 + b/2 + n/2 + 1/2)*jacobi(n - 1, a + 1, b + 1, x) + + See Also + ======== + + gegenbauer, + chebyshevt_root, chebyshevu, chebyshevu_root, + legendre, assoc_legendre, + hermite, hermite_prob, + laguerre, assoc_laguerre, + sympy.polys.orthopolys.jacobi_poly, + sympy.polys.orthopolys.gegenbauer_poly + sympy.polys.orthopolys.chebyshevt_poly + sympy.polys.orthopolys.chebyshevu_poly + sympy.polys.orthopolys.hermite_poly + sympy.polys.orthopolys.legendre_poly + sympy.polys.orthopolys.laguerre_poly + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Jacobi_polynomials + .. [2] https://mathworld.wolfram.com/JacobiPolynomial.html + .. [3] https://functions.wolfram.com/Polynomials/JacobiP/ + + """ + + @classmethod + def eval(cls, n, a, b, x): + # Simplify to other polynomials + # P^{a, a}_n(x) + if a == b: + if a == Rational(-1, 2): + return RisingFactorial(S.Half, n) / factorial(n) * chebyshevt(n, x) + elif a.is_zero: + return legendre(n, x) + elif a == S.Half: + return RisingFactorial(3*S.Half, n) / factorial(n + 1) * chebyshevu(n, x) + else: + return RisingFactorial(a + 1, n) / RisingFactorial(2*a + 1, n) * gegenbauer(n, a + S.Half, x) + elif b == -a: + # P^{a, -a}_n(x) + return gamma(n + a + 1) / gamma(n + 1) * (1 + x)**(a/2) / (1 - x)**(a/2) * assoc_legendre(n, -a, x) + + if not n.is_Number: + # Symbolic result P^{a,b}_n(x) + # P^{a,b}_n(-x) ---> (-1)**n * P^{b,a}_n(-x) + if x.could_extract_minus_sign(): + return S.NegativeOne**n * jacobi(n, b, a, -x) + # We can evaluate for some special values of x + if x.is_zero: + return (2**(-n) * gamma(a + n + 1) / (gamma(a + 1) * factorial(n)) * + hyper([-b - n, -n], [a + 1], -1)) + if x == S.One: + return RisingFactorial(a + 1, n) / factorial(n) + elif x is S.Infinity: + if n.is_positive: + # Make sure a+b+2*n \notin Z + if (a + b + 2*n).is_integer: + raise ValueError("Error. 
a + b + 2*n should not be an integer.") + return RisingFactorial(a + b + n + 1, n) * S.Infinity + else: + # n is a given fixed integer, evaluate into polynomial + return jacobi_poly(n, a, b, x) + + def fdiff(self, argindex=4): + from sympy.concrete.summations import Sum + if argindex == 1: + # Diff wrt n + raise ArgumentIndexError(self, argindex) + elif argindex == 2: + # Diff wrt a + n, a, b, x = self.args + k = Dummy("k") + f1 = 1 / (a + b + n + k + 1) + f2 = ((a + b + 2*k + 1) * RisingFactorial(b + k + 1, n - k) / + ((n - k) * RisingFactorial(a + b + k + 1, n - k))) + return Sum(f1 * (jacobi(n, a, b, x) + f2*jacobi(k, a, b, x)), (k, 0, n - 1)) + elif argindex == 3: + # Diff wrt b + n, a, b, x = self.args + k = Dummy("k") + f1 = 1 / (a + b + n + k + 1) + f2 = (-1)**(n - k) * ((a + b + 2*k + 1) * RisingFactorial(a + k + 1, n - k) / + ((n - k) * RisingFactorial(a + b + k + 1, n - k))) + return Sum(f1 * (jacobi(n, a, b, x) + f2*jacobi(k, a, b, x)), (k, 0, n - 1)) + elif argindex == 4: + # Diff wrt x + n, a, b, x = self.args + return S.Half * (a + b + n + 1) * jacobi(n - 1, a + 1, b + 1, x) + else: + raise ArgumentIndexError(self, argindex) + + def _eval_rewrite_as_Sum(self, n, a, b, x, **kwargs): + from sympy.concrete.summations import Sum + # Make sure n \in N + if n.is_negative or n.is_integer is False: + raise ValueError("Error: n should be a non-negative integer.") + k = Dummy("k") + kern = (RisingFactorial(-n, k) * RisingFactorial(a + b + n + 1, k) * RisingFactorial(a + k + 1, n - k) / + factorial(k) * ((1 - x)/2)**k) + return 1 / factorial(n) * Sum(kern, (k, 0, n)) + + def _eval_rewrite_as_polynomial(self, n, a, b, x, **kwargs): + # This function is just kept for backwards compatibility + # but should not be used + return self._eval_rewrite_as_Sum(n, a, b, x, **kwargs) + + def _eval_conjugate(self): + n, a, b, x = self.args + return self.func(n, a.conjugate(), b.conjugate(), x.conjugate()) + + +def jacobi_normalized(n, a, b, x): + r""" + Jacobi polynomial $P_n^{\left(\alpha, \beta\right)}(x)$. + + Explanation + =========== + + ``jacobi_normalized(n, alpha, beta, x)`` gives the $n$th + Jacobi polynomial in $x$, $P_n^{\left(\alpha, \beta\right)}(x)$. + + The Jacobi polynomials are orthogonal on $[-1, 1]$ with respect + to the weight $\left(1-x\right)^\alpha \left(1+x\right)^\beta$. + + This functions returns the polynomials normilzed: + + .. math:: + + \int_{-1}^{1} + P_m^{\left(\alpha, \beta\right)}(x) + P_n^{\left(\alpha, \beta\right)}(x) + (1-x)^{\alpha} (1+x)^{\beta} \mathrm{d}x + = \delta_{m,n} + + Examples + ======== + + >>> from sympy import jacobi_normalized + >>> from sympy.abc import n,a,b,x + + >>> jacobi_normalized(n, a, b, x) + jacobi(n, a, b, x)/sqrt(2**(a + b + 1)*gamma(a + n + 1)*gamma(b + n + 1)/((a + b + 2*n + 1)*factorial(n)*gamma(a + b + n + 1))) + + Parameters + ========== + + n : integer degree of polynomial + + a : alpha value + + b : beta value + + x : symbol + + See Also + ======== + + gegenbauer, + chebyshevt_root, chebyshevu, chebyshevu_root, + legendre, assoc_legendre, + hermite, hermite_prob, + laguerre, assoc_laguerre, + sympy.polys.orthopolys.jacobi_poly, + sympy.polys.orthopolys.gegenbauer_poly + sympy.polys.orthopolys.chebyshevt_poly + sympy.polys.orthopolys.chebyshevu_poly + sympy.polys.orthopolys.hermite_poly + sympy.polys.orthopolys.legendre_poly + sympy.polys.orthopolys.laguerre_poly + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Jacobi_polynomials + .. [2] https://mathworld.wolfram.com/JacobiPolynomial.html + .. 
[3] https://functions.wolfram.com/Polynomials/JacobiP/ + + """ + nfactor = (S(2)**(a + b + 1) * (gamma(n + a + 1) * gamma(n + b + 1)) + / (2*n + a + b + 1) / (factorial(n) * gamma(n + a + b + 1))) + + return jacobi(n, a, b, x) / sqrt(nfactor) + + +#---------------------------------------------------------------------------- +# Gegenbauer polynomials +# + + +class gegenbauer(OrthogonalPolynomial): + r""" + Gegenbauer polynomial $C_n^{\left(\alpha\right)}(x)$. + + Explanation + =========== + + ``gegenbauer(n, alpha, x)`` gives the $n$th Gegenbauer polynomial + in $x$, $C_n^{\left(\alpha\right)}(x)$. + + The Gegenbauer polynomials are orthogonal on $[-1, 1]$ with + respect to the weight $\left(1-x^2\right)^{\alpha-\frac{1}{2}}$. + + Examples + ======== + + >>> from sympy import gegenbauer, conjugate, diff + >>> from sympy.abc import n,a,x + >>> gegenbauer(0, a, x) + 1 + >>> gegenbauer(1, a, x) + 2*a*x + >>> gegenbauer(2, a, x) + -a + x**2*(2*a**2 + 2*a) + >>> gegenbauer(3, a, x) + x**3*(4*a**3/3 + 4*a**2 + 8*a/3) + x*(-2*a**2 - 2*a) + + >>> gegenbauer(n, a, x) + gegenbauer(n, a, x) + >>> gegenbauer(n, a, -x) + (-1)**n*gegenbauer(n, a, x) + + >>> gegenbauer(n, a, 0) + 2**n*sqrt(pi)*gamma(a + n/2)/(gamma(a)*gamma(1/2 - n/2)*gamma(n + 1)) + >>> gegenbauer(n, a, 1) + gamma(2*a + n)/(gamma(2*a)*gamma(n + 1)) + + >>> conjugate(gegenbauer(n, a, x)) + gegenbauer(n, conjugate(a), conjugate(x)) + + >>> diff(gegenbauer(n, a, x), x) + 2*a*gegenbauer(n - 1, a + 1, x) + + See Also + ======== + + jacobi, + chebyshevt_root, chebyshevu, chebyshevu_root, + legendre, assoc_legendre, + hermite, hermite_prob, + laguerre, assoc_laguerre, + sympy.polys.orthopolys.jacobi_poly + sympy.polys.orthopolys.gegenbauer_poly + sympy.polys.orthopolys.chebyshevt_poly + sympy.polys.orthopolys.chebyshevu_poly + sympy.polys.orthopolys.hermite_poly + sympy.polys.orthopolys.hermite_prob_poly + sympy.polys.orthopolys.legendre_poly + sympy.polys.orthopolys.laguerre_poly + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Gegenbauer_polynomials + .. [2] https://mathworld.wolfram.com/GegenbauerPolynomial.html + .. 
[3] https://functions.wolfram.com/Polynomials/GegenbauerC3/ + + """ + + @classmethod + def eval(cls, n, a, x): + # For negative n the polynomials vanish + # See https://functions.wolfram.com/Polynomials/GegenbauerC3/03/01/03/0012/ + if n.is_negative: + return S.Zero + + # Some special values for fixed a + if a == S.Half: + return legendre(n, x) + elif a == S.One: + return chebyshevu(n, x) + elif a == S.NegativeOne: + return S.Zero + + if not n.is_Number: + # Handle this before the general sign extraction rule + if x == S.NegativeOne: + if (re(a) > S.Half) == True: + return S.ComplexInfinity + else: + return (cos(S.Pi*(a+n)) * sec(S.Pi*a) * gamma(2*a+n) / + (gamma(2*a) * gamma(n+1))) + + # Symbolic result C^a_n(x) + # C^a_n(-x) ---> (-1)**n * C^a_n(x) + if x.could_extract_minus_sign(): + return S.NegativeOne**n * gegenbauer(n, a, -x) + # We can evaluate for some special values of x + if x.is_zero: + return (2**n * sqrt(S.Pi) * gamma(a + S.Half*n) / + (gamma((1 - n)/2) * gamma(n + 1) * gamma(a)) ) + if x == S.One: + return gamma(2*a + n) / (gamma(2*a) * gamma(n + 1)) + elif x is S.Infinity: + if n.is_positive: + return RisingFactorial(a, n) * S.Infinity + else: + # n is a given fixed integer, evaluate into polynomial + return gegenbauer_poly(n, a, x) + + def fdiff(self, argindex=3): + from sympy.concrete.summations import Sum + if argindex == 1: + # Diff wrt n + raise ArgumentIndexError(self, argindex) + elif argindex == 2: + # Diff wrt a + n, a, x = self.args + k = Dummy("k") + factor1 = 2 * (1 + (-1)**(n - k)) * (k + a) / ((k + + n + 2*a) * (n - k)) + factor2 = 2*(k + 1) / ((k + 2*a) * (2*k + 2*a + 1)) + \ + 2 / (k + n + 2*a) + kern = factor1*gegenbauer(k, a, x) + factor2*gegenbauer(n, a, x) + return Sum(kern, (k, 0, n - 1)) + elif argindex == 3: + # Diff wrt x + n, a, x = self.args + return 2*a*gegenbauer(n - 1, a + 1, x) + else: + raise ArgumentIndexError(self, argindex) + + def _eval_rewrite_as_Sum(self, n, a, x, **kwargs): + from sympy.concrete.summations import Sum + k = Dummy("k") + kern = ((-1)**k * RisingFactorial(a, n - k) * (2*x)**(n - 2*k) / + (factorial(k) * factorial(n - 2*k))) + return Sum(kern, (k, 0, floor(n/2))) + + def _eval_rewrite_as_polynomial(self, n, a, x, **kwargs): + # This function is just kept for backwards compatibility + # but should not be used + return self._eval_rewrite_as_Sum(n, a, x, **kwargs) + + def _eval_conjugate(self): + n, a, x = self.args + return self.func(n, a.conjugate(), x.conjugate()) + +#---------------------------------------------------------------------------- +# Chebyshev polynomials of first and second kind +# + + +class chebyshevt(OrthogonalPolynomial): + r""" + Chebyshev polynomial of the first kind, $T_n(x)$. + + Explanation + =========== + + ``chebyshevt(n, x)`` gives the $n$th Chebyshev polynomial (of the first + kind) in $x$, $T_n(x)$. + + The Chebyshev polynomials of the first kind are orthogonal on + $[-1, 1]$ with respect to the weight $\frac{1}{\sqrt{1-x^2}}$. 
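A convenient way to see the classical identity $T_n(\cos\theta) = \cos(n\theta)$ is to compare both sides for a small fixed degree. A brief sketch for $n = 3$ (``expand_trig`` is a standard SymPy helper used here only to expand the right-hand side):

>>> from sympy import chebyshevt, cos, expand_trig, Symbol
>>> t = Symbol('t')
>>> chebyshevt(3, cos(t)) - expand_trig(cos(3*t))
0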
+ + Examples + ======== + + >>> from sympy import chebyshevt, diff + >>> from sympy.abc import n,x + >>> chebyshevt(0, x) + 1 + >>> chebyshevt(1, x) + x + >>> chebyshevt(2, x) + 2*x**2 - 1 + + >>> chebyshevt(n, x) + chebyshevt(n, x) + >>> chebyshevt(n, -x) + (-1)**n*chebyshevt(n, x) + >>> chebyshevt(-n, x) + chebyshevt(n, x) + + >>> chebyshevt(n, 0) + cos(pi*n/2) + >>> chebyshevt(n, -1) + (-1)**n + + >>> diff(chebyshevt(n, x), x) + n*chebyshevu(n - 1, x) + + See Also + ======== + + jacobi, gegenbauer, + chebyshevt_root, chebyshevu, chebyshevu_root, + legendre, assoc_legendre, + hermite, hermite_prob, + laguerre, assoc_laguerre, + sympy.polys.orthopolys.jacobi_poly + sympy.polys.orthopolys.gegenbauer_poly + sympy.polys.orthopolys.chebyshevt_poly + sympy.polys.orthopolys.chebyshevu_poly + sympy.polys.orthopolys.hermite_poly + sympy.polys.orthopolys.hermite_prob_poly + sympy.polys.orthopolys.legendre_poly + sympy.polys.orthopolys.laguerre_poly + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Chebyshev_polynomial + .. [2] https://mathworld.wolfram.com/ChebyshevPolynomialoftheFirstKind.html + .. [3] https://mathworld.wolfram.com/ChebyshevPolynomialoftheSecondKind.html + .. [4] https://functions.wolfram.com/Polynomials/ChebyshevT/ + .. [5] https://functions.wolfram.com/Polynomials/ChebyshevU/ + + """ + + _ortho_poly = staticmethod(chebyshevt_poly) + + @classmethod + def eval(cls, n, x): + if not n.is_Number: + # Symbolic result T_n(x) + # T_n(-x) ---> (-1)**n * T_n(x) + if x.could_extract_minus_sign(): + return S.NegativeOne**n * chebyshevt(n, -x) + # T_{-n}(x) ---> T_n(x) + if n.could_extract_minus_sign(): + return chebyshevt(-n, x) + # We can evaluate for some special values of x + if x.is_zero: + return cos(S.Half * S.Pi * n) + if x == S.One: + return S.One + elif x is S.Infinity: + return S.Infinity + else: + # n is a given fixed integer, evaluate into polynomial + if n.is_negative: + # T_{-n}(x) == T_n(x) + return cls._eval_at_order(-n, x) + else: + return cls._eval_at_order(n, x) + + def fdiff(self, argindex=2): + if argindex == 1: + # Diff wrt n + raise ArgumentIndexError(self, argindex) + elif argindex == 2: + # Diff wrt x + n, x = self.args + return n * chebyshevu(n - 1, x) + else: + raise ArgumentIndexError(self, argindex) + + def _eval_rewrite_as_Sum(self, n, x, **kwargs): + from sympy.concrete.summations import Sum + k = Dummy("k") + kern = binomial(n, 2*k) * (x**2 - 1)**k * x**(n - 2*k) + return Sum(kern, (k, 0, floor(n/2))) + + def _eval_rewrite_as_polynomial(self, n, x, **kwargs): + # This function is just kept for backwards compatibility + # but should not be used + return self._eval_rewrite_as_Sum(n, x, **kwargs) + + +class chebyshevu(OrthogonalPolynomial): + r""" + Chebyshev polynomial of the second kind, $U_n(x)$. + + Explanation + =========== + + ``chebyshevu(n, x)`` gives the $n$th Chebyshev polynomial of the second + kind in x, $U_n(x)$. + + The Chebyshev polynomials of the second kind are orthogonal on + $[-1, 1]$ with respect to the weight $\sqrt{1-x^2}$. 
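The second-kind polynomials satisfy the usual three-term recurrence $U_n(x) = 2x\,U_{n-1}(x) - U_{n-2}(x)$; a quick illustrative check for $n = 4$ (``expand`` is the standard SymPy helper):

>>> from sympy import chebyshevu, expand, Symbol
>>> x = Symbol('x')
>>> chebyshevu(4, x) == expand(2*x*chebyshevu(3, x) - chebyshevu(2, x))
True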
+ + Examples + ======== + + >>> from sympy import chebyshevu, diff + >>> from sympy.abc import n,x + >>> chebyshevu(0, x) + 1 + >>> chebyshevu(1, x) + 2*x + >>> chebyshevu(2, x) + 4*x**2 - 1 + + >>> chebyshevu(n, x) + chebyshevu(n, x) + >>> chebyshevu(n, -x) + (-1)**n*chebyshevu(n, x) + >>> chebyshevu(-n, x) + -chebyshevu(n - 2, x) + + >>> chebyshevu(n, 0) + cos(pi*n/2) + >>> chebyshevu(n, 1) + n + 1 + + >>> diff(chebyshevu(n, x), x) + (-x*chebyshevu(n, x) + (n + 1)*chebyshevt(n + 1, x))/(x**2 - 1) + + See Also + ======== + + jacobi, gegenbauer, + chebyshevt, chebyshevt_root, chebyshevu_root, + legendre, assoc_legendre, + hermite, hermite_prob, + laguerre, assoc_laguerre, + sympy.polys.orthopolys.jacobi_poly + sympy.polys.orthopolys.gegenbauer_poly + sympy.polys.orthopolys.chebyshevt_poly + sympy.polys.orthopolys.chebyshevu_poly + sympy.polys.orthopolys.hermite_poly + sympy.polys.orthopolys.hermite_prob_poly + sympy.polys.orthopolys.legendre_poly + sympy.polys.orthopolys.laguerre_poly + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Chebyshev_polynomial + .. [2] https://mathworld.wolfram.com/ChebyshevPolynomialoftheFirstKind.html + .. [3] https://mathworld.wolfram.com/ChebyshevPolynomialoftheSecondKind.html + .. [4] https://functions.wolfram.com/Polynomials/ChebyshevT/ + .. [5] https://functions.wolfram.com/Polynomials/ChebyshevU/ + + """ + + _ortho_poly = staticmethod(chebyshevu_poly) + + @classmethod + def eval(cls, n, x): + if not n.is_Number: + # Symbolic result U_n(x) + # U_n(-x) ---> (-1)**n * U_n(x) + if x.could_extract_minus_sign(): + return S.NegativeOne**n * chebyshevu(n, -x) + # U_{-n}(x) ---> -U_{n-2}(x) + if n.could_extract_minus_sign(): + if n == S.NegativeOne: + # n can not be -1 here + return S.Zero + elif not (-n - 2).could_extract_minus_sign(): + return -chebyshevu(-n - 2, x) + # We can evaluate for some special values of x + if x.is_zero: + return cos(S.Half * S.Pi * n) + if x == S.One: + return S.One + n + elif x is S.Infinity: + return S.Infinity + else: + # n is a given fixed integer, evaluate into polynomial + if n.is_negative: + # U_{-n}(x) ---> -U_{n-2}(x) + if n == S.NegativeOne: + return S.Zero + else: + return -cls._eval_at_order(-n - 2, x) + else: + return cls._eval_at_order(n, x) + + def fdiff(self, argindex=2): + if argindex == 1: + # Diff wrt n + raise ArgumentIndexError(self, argindex) + elif argindex == 2: + # Diff wrt x + n, x = self.args + return ((n + 1) * chebyshevt(n + 1, x) - x * chebyshevu(n, x)) / (x**2 - 1) + else: + raise ArgumentIndexError(self, argindex) + + def _eval_rewrite_as_Sum(self, n, x, **kwargs): + from sympy.concrete.summations import Sum + k = Dummy("k") + kern = S.NegativeOne**k * factorial( + n - k) * (2*x)**(n - 2*k) / (factorial(k) * factorial(n - 2*k)) + return Sum(kern, (k, 0, floor(n/2))) + + def _eval_rewrite_as_polynomial(self, n, x, **kwargs): + # This function is just kept for backwards compatibility + # but should not be used + return self._eval_rewrite_as_Sum(n, x, **kwargs) + + +class chebyshevt_root(DefinedFunction): + r""" + ``chebyshev_root(n, k)`` returns the $k$th root (indexed from zero) of + the $n$th Chebyshev polynomial of the first kind; that is, if + $0 \le k < n$, ``chebyshevt(n, chebyshevt_root(n, k)) == 0``. 
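For instance, all three roots of $T_3$ can be listed and checked against the defining property (a small sketch; the closed form used by ``eval`` below is $\cos\bigl(\pi(2k+1)/(2n)\bigr)$):

>>> from sympy import chebyshevt, chebyshevt_root
>>> [chebyshevt_root(3, k) for k in range(3)]
[sqrt(3)/2, 0, -sqrt(3)/2]
>>> all(chebyshevt(3, chebyshevt_root(3, k)) == 0 for k in range(3))
True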
+ + Examples + ======== + + >>> from sympy import chebyshevt, chebyshevt_root + >>> chebyshevt_root(3, 2) + -sqrt(3)/2 + >>> chebyshevt(3, chebyshevt_root(3, 2)) + 0 + + See Also + ======== + + jacobi, gegenbauer, + chebyshevt, chebyshevu, chebyshevu_root, + legendre, assoc_legendre, + hermite, hermite_prob, + laguerre, assoc_laguerre, + sympy.polys.orthopolys.jacobi_poly + sympy.polys.orthopolys.gegenbauer_poly + sympy.polys.orthopolys.chebyshevt_poly + sympy.polys.orthopolys.chebyshevu_poly + sympy.polys.orthopolys.hermite_poly + sympy.polys.orthopolys.hermite_prob_poly + sympy.polys.orthopolys.legendre_poly + sympy.polys.orthopolys.laguerre_poly + """ + + @classmethod + def eval(cls, n, k): + if not ((0 <= k) and (k < n)): + raise ValueError("must have 0 <= k < n, " + "got k = %s and n = %s" % (k, n)) + return cos(S.Pi*(2*k + 1)/(2*n)) + + +class chebyshevu_root(DefinedFunction): + r""" + ``chebyshevu_root(n, k)`` returns the $k$th root (indexed from zero) of the + $n$th Chebyshev polynomial of the second kind; that is, if $0 \le k < n$, + ``chebyshevu(n, chebyshevu_root(n, k)) == 0``. + + Examples + ======== + + >>> from sympy import chebyshevu, chebyshevu_root + >>> chebyshevu_root(3, 2) + -sqrt(2)/2 + >>> chebyshevu(3, chebyshevu_root(3, 2)) + 0 + + See Also + ======== + + chebyshevt, chebyshevt_root, chebyshevu, + legendre, assoc_legendre, + hermite, hermite_prob, + laguerre, assoc_laguerre, + sympy.polys.orthopolys.jacobi_poly + sympy.polys.orthopolys.gegenbauer_poly + sympy.polys.orthopolys.chebyshevt_poly + sympy.polys.orthopolys.chebyshevu_poly + sympy.polys.orthopolys.hermite_poly + sympy.polys.orthopolys.hermite_prob_poly + sympy.polys.orthopolys.legendre_poly + sympy.polys.orthopolys.laguerre_poly + """ + + + @classmethod + def eval(cls, n, k): + if not ((0 <= k) and (k < n)): + raise ValueError("must have 0 <= k < n, " + "got k = %s and n = %s" % (k, n)) + return cos(S.Pi*(k + 1)/(n + 1)) + +#---------------------------------------------------------------------------- +# Legendre polynomials and Associated Legendre polynomials +# + + +class legendre(OrthogonalPolynomial): + r""" + ``legendre(n, x)`` gives the $n$th Legendre polynomial of $x$, $P_n(x)$ + + Explanation + =========== + + The Legendre polynomials are orthogonal on $[-1, 1]$ with respect to + the constant weight 1. They satisfy $P_n(1) = 1$ for all $n$; further, + $P_n$ is odd for odd $n$ and even for even $n$. + + Examples + ======== + + >>> from sympy import legendre, diff + >>> from sympy.abc import x, n + >>> legendre(0, x) + 1 + >>> legendre(1, x) + x + >>> legendre(2, x) + 3*x**2/2 - 1/2 + >>> legendre(n, x) + legendre(n, x) + >>> diff(legendre(n,x), x) + n*(x*legendre(n, x) - legendre(n - 1, x))/(x**2 - 1) + + See Also + ======== + + jacobi, gegenbauer, + chebyshevt, chebyshevt_root, chebyshevu, chebyshevu_root, + assoc_legendre, + hermite, hermite_prob, + laguerre, assoc_laguerre, + sympy.polys.orthopolys.jacobi_poly + sympy.polys.orthopolys.gegenbauer_poly + sympy.polys.orthopolys.chebyshevt_poly + sympy.polys.orthopolys.chebyshevu_poly + sympy.polys.orthopolys.hermite_poly + sympy.polys.orthopolys.hermite_prob_poly + sympy.polys.orthopolys.legendre_poly + sympy.polys.orthopolys.laguerre_poly + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Legendre_polynomial + .. [2] https://mathworld.wolfram.com/LegendrePolynomial.html + .. [3] https://functions.wolfram.com/Polynomials/LegendreP/ + .. 
[4] https://functions.wolfram.com/Polynomials/LegendreP2/ + + """ + + _ortho_poly = staticmethod(legendre_poly) + + @classmethod + def eval(cls, n, x): + if not n.is_Number: + # Symbolic result L_n(x) + # L_n(-x) ---> (-1)**n * L_n(x) + if x.could_extract_minus_sign(): + return S.NegativeOne**n * legendre(n, -x) + # L_{-n}(x) ---> L_{n-1}(x) + if n.could_extract_minus_sign() and not(-n - 1).could_extract_minus_sign(): + return legendre(-n - S.One, x) + # We can evaluate for some special values of x + if x.is_zero: + return sqrt(S.Pi)/(gamma(S.Half - n/2)*gamma(S.One + n/2)) + elif x == S.One: + return S.One + elif x is S.Infinity: + return S.Infinity + else: + # n is a given fixed integer, evaluate into polynomial; + # L_{-n}(x) ---> L_{n-1}(x) + if n.is_negative: + n = -n - S.One + return cls._eval_at_order(n, x) + + def fdiff(self, argindex=2): + if argindex == 1: + # Diff wrt n + raise ArgumentIndexError(self, argindex) + elif argindex == 2: + # Diff wrt x + # Find better formula, this is unsuitable for x = +/-1 + # https://www.autodiff.org/ad16/Oral/Buecker_Legendre.pdf says + # at x = 1: + # n*(n + 1)/2 , m = 0 + # oo , m = 1 + # -(n-1)*n*(n+1)*(n+2)/4 , m = 2 + # 0 , m = 3, 4, ..., n + # + # at x = -1 + # (-1)**(n+1)*n*(n + 1)/2 , m = 0 + # (-1)**n*oo , m = 1 + # (-1)**n*(n-1)*n*(n+1)*(n+2)/4 , m = 2 + # 0 , m = 3, 4, ..., n + n, x = self.args + return n/(x**2 - 1)*(x*legendre(n, x) - legendre(n - 1, x)) + else: + raise ArgumentIndexError(self, argindex) + + def _eval_rewrite_as_Sum(self, n, x, **kwargs): + from sympy.concrete.summations import Sum + k = Dummy("k") + kern = S.NegativeOne**k*binomial(n, k)**2*((1 + x)/2)**(n - k)*((1 - x)/2)**k + return Sum(kern, (k, 0, n)) + + def _eval_rewrite_as_polynomial(self, n, x, **kwargs): + # This function is just kept for backwards compatibility + # but should not be used + return self._eval_rewrite_as_Sum(n, x, **kwargs) + + +class assoc_legendre(DefinedFunction): + r""" + ``assoc_legendre(n, m, x)`` gives $P_n^m(x)$, where $n$ and $m$ are + the degree and order or an expression which is related to the nth + order Legendre polynomial, $P_n(x)$ in the following manner: + + .. math:: + P_n^m(x) = (-1)^m (1 - x^2)^{\frac{m}{2}} + \frac{\mathrm{d}^m P_n(x)}{\mathrm{d} x^m} + + Explanation + =========== + + Associated Legendre polynomials are orthogonal on $[-1, 1]$ with: + + - weight $= 1$ for the same $m$ and different $n$. + - weight $= \frac{1}{1-x^2}$ for the same $n$ and different $m$. + + Examples + ======== + + >>> from sympy import assoc_legendre + >>> from sympy.abc import x, m, n + >>> assoc_legendre(0,0, x) + 1 + >>> assoc_legendre(1,0, x) + x + >>> assoc_legendre(1,1, x) + -sqrt(1 - x**2) + >>> assoc_legendre(n,m,x) + assoc_legendre(n, m, x) + + See Also + ======== + + jacobi, gegenbauer, + chebyshevt, chebyshevt_root, chebyshevu, chebyshevu_root, + legendre, + hermite, hermite_prob, + laguerre, assoc_laguerre, + sympy.polys.orthopolys.jacobi_poly + sympy.polys.orthopolys.gegenbauer_poly + sympy.polys.orthopolys.chebyshevt_poly + sympy.polys.orthopolys.chebyshevu_poly + sympy.polys.orthopolys.hermite_poly + sympy.polys.orthopolys.hermite_prob_poly + sympy.polys.orthopolys.legendre_poly + sympy.polys.orthopolys.laguerre_poly + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Associated_Legendre_polynomials + .. [2] https://mathworld.wolfram.com/LegendrePolynomial.html + .. [3] https://functions.wolfram.com/Polynomials/LegendreP/ + .. 
[4] https://functions.wolfram.com/Polynomials/LegendreP2/ + + """ + + @classmethod + def _eval_at_order(cls, n, m): + P = legendre_poly(n, _x, polys=True).diff((_x, m)) + return S.NegativeOne**m * (1 - _x**2)**Rational(m, 2) * P.as_expr() + + @classmethod + def eval(cls, n, m, x): + if m.could_extract_minus_sign(): + # P^{-m}_n ---> F * P^m_n + return S.NegativeOne**(-m) * (factorial(m + n)/factorial(n - m)) * assoc_legendre(n, -m, x) + if m == 0: + # P^0_n ---> L_n + return legendre(n, x) + if x == 0: + return 2**m*sqrt(S.Pi) / (gamma((1 - m - n)/2)*gamma(1 - (m - n)/2)) + if n.is_Number and m.is_Number and n.is_integer and m.is_integer: + if n.is_negative: + raise ValueError("%s : 1st index must be nonnegative integer (got %r)" % (cls, n)) + if abs(m) > n: + raise ValueError("%s : abs('2nd index') must be <= '1st index' (got %r, %r)" % (cls, n, m)) + return cls._eval_at_order(int(n), abs(int(m))).subs(_x, x) + + def fdiff(self, argindex=3): + if argindex == 1: + # Diff wrt n + raise ArgumentIndexError(self, argindex) + elif argindex == 2: + # Diff wrt m + raise ArgumentIndexError(self, argindex) + elif argindex == 3: + # Diff wrt x + # Find better formula, this is unsuitable for x = 1 + n, m, x = self.args + return 1/(x**2 - 1)*(x*n*assoc_legendre(n, m, x) - (m + n)*assoc_legendre(n - 1, m, x)) + else: + raise ArgumentIndexError(self, argindex) + + def _eval_rewrite_as_Sum(self, n, m, x, **kwargs): + from sympy.concrete.summations import Sum + k = Dummy("k") + kern = factorial(2*n - 2*k)/(2**n*factorial(n - k)*factorial( + k)*factorial(n - 2*k - m))*S.NegativeOne**k*x**(n - m - 2*k) + return (1 - x**2)**(m/2) * Sum(kern, (k, 0, floor((n - m)*S.Half))) + + def _eval_rewrite_as_polynomial(self, n, m, x, **kwargs): + # This function is just kept for backwards compatibility + # but should not be used + return self._eval_rewrite_as_Sum(n, m, x, **kwargs) + + def _eval_conjugate(self): + n, m, x = self.args + return self.func(n, m.conjugate(), x.conjugate()) + +#---------------------------------------------------------------------------- +# Hermite polynomials +# + + +class hermite(OrthogonalPolynomial): + r""" + ``hermite(n, x)`` gives the $n$th Hermite polynomial in $x$, $H_n(x)$. + + Explanation + =========== + + The Hermite polynomials are orthogonal on $(-\infty, \infty)$ + with respect to the weight $\exp\left(-x^2\right)$. + + Examples + ======== + + >>> from sympy import hermite, diff + >>> from sympy.abc import x, n + >>> hermite(0, x) + 1 + >>> hermite(1, x) + 2*x + >>> hermite(2, x) + 4*x**2 - 2 + >>> hermite(n, x) + hermite(n, x) + >>> diff(hermite(n,x), x) + 2*n*hermite(n - 1, x) + >>> hermite(n, -x) + (-1)**n*hermite(n, x) + + See Also + ======== + + jacobi, gegenbauer, + chebyshevt, chebyshevt_root, chebyshevu, chebyshevu_root, + legendre, assoc_legendre, + hermite_prob, + laguerre, assoc_laguerre, + sympy.polys.orthopolys.jacobi_poly + sympy.polys.orthopolys.gegenbauer_poly + sympy.polys.orthopolys.chebyshevt_poly + sympy.polys.orthopolys.chebyshevu_poly + sympy.polys.orthopolys.hermite_poly + sympy.polys.orthopolys.hermite_prob_poly + sympy.polys.orthopolys.legendre_poly + sympy.polys.orthopolys.laguerre_poly + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Hermite_polynomial + .. [2] https://mathworld.wolfram.com/HermitePolynomial.html + .. 
[3] https://functions.wolfram.com/Polynomials/HermiteH/ + + """ + + _ortho_poly = staticmethod(hermite_poly) + + @classmethod + def eval(cls, n, x): + if not n.is_Number: + # Symbolic result H_n(x) + # H_n(-x) ---> (-1)**n * H_n(x) + if x.could_extract_minus_sign(): + return S.NegativeOne**n * hermite(n, -x) + # We can evaluate for some special values of x + if x.is_zero: + return 2**n * sqrt(S.Pi) / gamma((S.One - n)/2) + elif x is S.Infinity: + return S.Infinity + else: + # n is a given fixed integer, evaluate into polynomial + if n.is_negative: + raise ValueError( + "The index n must be nonnegative integer (got %r)" % n) + else: + return cls._eval_at_order(n, x) + + def fdiff(self, argindex=2): + if argindex == 1: + # Diff wrt n + raise ArgumentIndexError(self, argindex) + elif argindex == 2: + # Diff wrt x + n, x = self.args + return 2*n*hermite(n - 1, x) + else: + raise ArgumentIndexError(self, argindex) + + def _eval_rewrite_as_Sum(self, n, x, **kwargs): + from sympy.concrete.summations import Sum + k = Dummy("k") + kern = S.NegativeOne**k / (factorial(k)*factorial(n - 2*k)) * (2*x)**(n - 2*k) + return factorial(n)*Sum(kern, (k, 0, floor(n/2))) + + def _eval_rewrite_as_polynomial(self, n, x, **kwargs): + # This function is just kept for backwards compatibility + # but should not be used + return self._eval_rewrite_as_Sum(n, x, **kwargs) + + def _eval_rewrite_as_hermite_prob(self, n, x, **kwargs): + return sqrt(2)**n * hermite_prob(n, x*sqrt(2)) + + +class hermite_prob(OrthogonalPolynomial): + r""" + ``hermite_prob(n, x)`` gives the $n$th probabilist's Hermite polynomial + in $x$, $He_n(x)$. + + Explanation + =========== + + The probabilist's Hermite polynomials are orthogonal on $(-\infty, \infty)$ + with respect to the weight $\exp\left(-\frac{x^2}{2}\right)$. They are monic + polynomials, related to the plain Hermite polynomials (:py:class:`~.hermite`) by + + .. math :: He_n(x) = 2^{-n/2} H_n(x/\sqrt{2}) + + Examples + ======== + + >>> from sympy import hermite_prob, diff, I + >>> from sympy.abc import x, n + >>> hermite_prob(1, x) + x + >>> hermite_prob(5, x) + x**5 - 10*x**3 + 15*x + >>> diff(hermite_prob(n,x), x) + n*hermite_prob(n - 1, x) + >>> hermite_prob(n, -x) + (-1)**n*hermite_prob(n, x) + + The sum of absolute values of coefficients of $He_n(x)$ is the number of + matchings in the complete graph $K_n$ or telephone number, A000085 in the OEIS: + + >>> [hermite_prob(n,I) / I**n for n in range(11)] + [1, 1, 2, 4, 10, 26, 76, 232, 764, 2620, 9496] + + See Also + ======== + + jacobi, gegenbauer, + chebyshevt, chebyshevt_root, chebyshevu, chebyshevu_root, + legendre, assoc_legendre, + hermite, + laguerre, assoc_laguerre, + sympy.polys.orthopolys.jacobi_poly + sympy.polys.orthopolys.gegenbauer_poly + sympy.polys.orthopolys.chebyshevt_poly + sympy.polys.orthopolys.chebyshevu_poly + sympy.polys.orthopolys.hermite_poly + sympy.polys.orthopolys.hermite_prob_poly + sympy.polys.orthopolys.legendre_poly + sympy.polys.orthopolys.laguerre_poly + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Hermite_polynomial + .. 
[2] https://mathworld.wolfram.com/HermitePolynomial.html + """ + + _ortho_poly = staticmethod(hermite_prob_poly) + + @classmethod + def eval(cls, n, x): + if not n.is_Number: + if x.could_extract_minus_sign(): + return S.NegativeOne**n * hermite_prob(n, -x) + if x.is_zero: + return sqrt(S.Pi) / gamma((S.One-n) / 2) + elif x is S.Infinity: + return S.Infinity + else: + if n.is_negative: + ValueError("n must be a nonnegative integer, not %r" % n) + else: + return cls._eval_at_order(n, x) + + def fdiff(self, argindex=2): + if argindex == 2: + n, x = self.args + return n*hermite_prob(n-1, x) + else: + raise ArgumentIndexError(self, argindex) + + def _eval_rewrite_as_Sum(self, n, x, **kwargs): + from sympy.concrete.summations import Sum + k = Dummy("k") + kern = (-S.Half)**k * x**(n-2*k) / (factorial(k) * factorial(n-2*k)) + return factorial(n)*Sum(kern, (k, 0, floor(n/2))) + + def _eval_rewrite_as_polynomial(self, n, x, **kwargs): + # This function is just kept for backwards compatibility + # but should not be used + return self._eval_rewrite_as_Sum(n, x, **kwargs) + + def _eval_rewrite_as_hermite(self, n, x, **kwargs): + return sqrt(2)**(-n) * hermite(n, x/sqrt(2)) + + +#---------------------------------------------------------------------------- +# Laguerre polynomials +# + + +class laguerre(OrthogonalPolynomial): + r""" + Returns the $n$th Laguerre polynomial in $x$, $L_n(x)$. + + Examples + ======== + + >>> from sympy import laguerre, diff + >>> from sympy.abc import x, n + >>> laguerre(0, x) + 1 + >>> laguerre(1, x) + 1 - x + >>> laguerre(2, x) + x**2/2 - 2*x + 1 + >>> laguerre(3, x) + -x**3/6 + 3*x**2/2 - 3*x + 1 + + >>> laguerre(n, x) + laguerre(n, x) + + >>> diff(laguerre(n, x), x) + -assoc_laguerre(n - 1, 1, x) + + Parameters + ========== + + n : int + Degree of Laguerre polynomial. Must be `n \ge 0`. + + See Also + ======== + + jacobi, gegenbauer, + chebyshevt, chebyshevt_root, chebyshevu, chebyshevu_root, + legendre, assoc_legendre, + hermite, hermite_prob, + assoc_laguerre, + sympy.polys.orthopolys.jacobi_poly + sympy.polys.orthopolys.gegenbauer_poly + sympy.polys.orthopolys.chebyshevt_poly + sympy.polys.orthopolys.chebyshevu_poly + sympy.polys.orthopolys.hermite_poly + sympy.polys.orthopolys.hermite_prob_poly + sympy.polys.orthopolys.legendre_poly + sympy.polys.orthopolys.laguerre_poly + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Laguerre_polynomial + .. [2] https://mathworld.wolfram.com/LaguerrePolynomial.html + .. [3] https://functions.wolfram.com/Polynomials/LaguerreL/ + .. 
[4] https://functions.wolfram.com/Polynomials/LaguerreL3/ + + """ + + _ortho_poly = staticmethod(laguerre_poly) + + @classmethod + def eval(cls, n, x): + if n.is_integer is False: + raise ValueError("Error: n should be an integer.") + if not n.is_Number: + # Symbolic result L_n(x) + # L_{n}(-x) ---> exp(-x) * L_{-n-1}(x) + # L_{-n}(x) ---> exp(x) * L_{n-1}(-x) + if n.could_extract_minus_sign() and not(-n - 1).could_extract_minus_sign(): + return exp(x)*laguerre(-n - 1, -x) + # We can evaluate for some special values of x + if x.is_zero: + return S.One + elif x is S.NegativeInfinity: + return S.Infinity + elif x is S.Infinity: + return S.NegativeOne**n * S.Infinity + else: + if n.is_negative: + return exp(x)*laguerre(-n - 1, -x) + else: + return cls._eval_at_order(n, x) + + def fdiff(self, argindex=2): + if argindex == 1: + # Diff wrt n + raise ArgumentIndexError(self, argindex) + elif argindex == 2: + # Diff wrt x + n, x = self.args + return -assoc_laguerre(n - 1, 1, x) + else: + raise ArgumentIndexError(self, argindex) + + def _eval_rewrite_as_Sum(self, n, x, **kwargs): + from sympy.concrete.summations import Sum + # Make sure n \in N_0 + if n.is_negative: + return exp(x) * self._eval_rewrite_as_Sum(-n - 1, -x, **kwargs) + if n.is_integer is False: + raise ValueError("Error: n should be an integer.") + k = Dummy("k") + kern = RisingFactorial(-n, k) / factorial(k)**2 * x**k + return Sum(kern, (k, 0, n)) + + def _eval_rewrite_as_polynomial(self, n, x, **kwargs): + # This function is just kept for backwards compatibility + # but should not be used + return self._eval_rewrite_as_Sum(n, x, **kwargs) + + +class assoc_laguerre(OrthogonalPolynomial): + r""" + Returns the $n$th generalized Laguerre polynomial in $x$, $L_n(x)$. + + Examples + ======== + + >>> from sympy import assoc_laguerre, diff + >>> from sympy.abc import x, n, a + >>> assoc_laguerre(0, a, x) + 1 + >>> assoc_laguerre(1, a, x) + a - x + 1 + >>> assoc_laguerre(2, a, x) + a**2/2 + 3*a/2 + x**2/2 + x*(-a - 2) + 1 + >>> assoc_laguerre(3, a, x) + a**3/6 + a**2 + 11*a/6 - x**3/6 + x**2*(a/2 + 3/2) + + x*(-a**2/2 - 5*a/2 - 3) + 1 + + >>> assoc_laguerre(n, a, 0) + binomial(a + n, a) + + >>> assoc_laguerre(n, a, x) + assoc_laguerre(n, a, x) + + >>> assoc_laguerre(n, 0, x) + laguerre(n, x) + + >>> diff(assoc_laguerre(n, a, x), x) + -assoc_laguerre(n - 1, a + 1, x) + + >>> diff(assoc_laguerre(n, a, x), a) + Sum(assoc_laguerre(_k, a, x)/(-a + n), (_k, 0, n - 1)) + + Parameters + ========== + + n : int + Degree of Laguerre polynomial. Must be `n \ge 0`. + + alpha : Expr + Arbitrary expression. For ``alpha=0`` regular Laguerre + polynomials will be generated. + + See Also + ======== + + jacobi, gegenbauer, + chebyshevt, chebyshevt_root, chebyshevu, chebyshevu_root, + legendre, assoc_legendre, + hermite, hermite_prob, + laguerre, + sympy.polys.orthopolys.jacobi_poly + sympy.polys.orthopolys.gegenbauer_poly + sympy.polys.orthopolys.chebyshevt_poly + sympy.polys.orthopolys.chebyshevu_poly + sympy.polys.orthopolys.hermite_poly + sympy.polys.orthopolys.hermite_prob_poly + sympy.polys.orthopolys.legendre_poly + sympy.polys.orthopolys.laguerre_poly + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Laguerre_polynomial#Generalized_Laguerre_polynomials + .. [2] https://mathworld.wolfram.com/AssociatedLaguerrePolynomial.html + .. [3] https://functions.wolfram.com/Polynomials/LaguerreL/ + .. 
[4] https://functions.wolfram.com/Polynomials/LaguerreL3/ + + """ + + @classmethod + def eval(cls, n, alpha, x): + # L_{n}^{0}(x) ---> L_{n}(x) + if alpha.is_zero: + return laguerre(n, x) + + if not n.is_Number: + # We can evaluate for some special values of x + if x.is_zero: + return binomial(n + alpha, alpha) + elif x is S.Infinity and n > 0: + return S.NegativeOne**n * S.Infinity + elif x is S.NegativeInfinity and n > 0: + return S.Infinity + else: + # n is a given fixed integer, evaluate into polynomial + if n.is_negative: + raise ValueError( + "The index n must be nonnegative integer (got %r)" % n) + else: + return laguerre_poly(n, x, alpha) + + def fdiff(self, argindex=3): + from sympy.concrete.summations import Sum + if argindex == 1: + # Diff wrt n + raise ArgumentIndexError(self, argindex) + elif argindex == 2: + # Diff wrt alpha + n, alpha, x = self.args + k = Dummy("k") + return Sum(assoc_laguerre(k, alpha, x) / (n - alpha), (k, 0, n - 1)) + elif argindex == 3: + # Diff wrt x + n, alpha, x = self.args + return -assoc_laguerre(n - 1, alpha + 1, x) + else: + raise ArgumentIndexError(self, argindex) + + def _eval_rewrite_as_Sum(self, n, alpha, x, **kwargs): + from sympy.concrete.summations import Sum + # Make sure n \in N_0 + if n.is_negative or n.is_integer is False: + raise ValueError("Error: n should be a non-negative integer.") + k = Dummy("k") + kern = RisingFactorial( + -n, k) / (gamma(k + alpha + 1) * factorial(k)) * x**k + return gamma(n + alpha + 1) / factorial(n) * Sum(kern, (k, 0, n)) + + def _eval_rewrite_as_polynomial(self, n, alpha, x, **kwargs): + # This function is just kept for backwards compatibility + # but should not be used + return self._eval_rewrite_as_Sum(n, alpha, x, **kwargs) + + def _eval_conjugate(self): + n, alpha, x = self.args + return self.func(n, alpha.conjugate(), x.conjugate()) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/special/singularity_functions.py b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/special/singularity_functions.py new file mode 100644 index 0000000000000000000000000000000000000000..a69026e6e657b1131880b47cb32202b6825b7158 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/special/singularity_functions.py @@ -0,0 +1,235 @@ +from sympy.core import S, oo, diff +from sympy.core.function import DefinedFunction, ArgumentIndexError +from sympy.core.logic import fuzzy_not +from sympy.core.relational import Eq +from sympy.functions.elementary.complexes import im +from sympy.functions.elementary.piecewise import Piecewise +from sympy.functions.special.delta_functions import Heaviside + +############################################################################### +############################# SINGULARITY FUNCTION ############################ +############################################################################### + + +class SingularityFunction(DefinedFunction): + r""" + Singularity functions are a class of discontinuous functions. + + Explanation + =========== + + Singularity functions take a variable, an offset, and an exponent as + arguments. These functions are represented using Macaulay brackets as: + + SingularityFunction(x, a, n) := ^n + + The singularity function will automatically evaluate to + ``Derivative(DiracDelta(x - a), x, -n - 1)`` if ``n < 0`` + and ``(x - a)**n*Heaviside(x - a, 1)`` if ``n >= 0``. 
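As a concrete illustration of the two evaluation branches described above (a brief sketch): a nonnegative exponent rewrites to a shifted power times a Heaviside step, while a negative exponent behaves like a Dirac-type impulse:

>>> from sympy import SingularityFunction, Heaviside, DiracDelta, Symbol
>>> x = Symbol('x')
>>> SingularityFunction(x, 2, 3).rewrite(Heaviside)
(x - 2)**3*Heaviside(x - 2, 1)
>>> SingularityFunction(x, 3, -2).rewrite(DiracDelta)
DiracDelta(x - 3, 1)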
+ + Examples + ======== + + >>> from sympy import SingularityFunction, diff, Piecewise, DiracDelta, Heaviside, Symbol + >>> from sympy.abc import x, a, n + >>> SingularityFunction(x, a, n) + SingularityFunction(x, a, n) + >>> y = Symbol('y', positive=True) + >>> n = Symbol('n', nonnegative=True) + >>> SingularityFunction(y, -10, n) + (y + 10)**n + >>> y = Symbol('y', negative=True) + >>> SingularityFunction(y, 10, n) + 0 + >>> SingularityFunction(x, 4, -1).subs(x, 4) + oo + >>> SingularityFunction(x, 10, -2).subs(x, 10) + oo + >>> SingularityFunction(4, 1, 5) + 243 + >>> diff(SingularityFunction(x, 1, 5) + SingularityFunction(x, 1, 4), x) + 4*SingularityFunction(x, 1, 3) + 5*SingularityFunction(x, 1, 4) + >>> diff(SingularityFunction(x, 4, 0), x, 2) + SingularityFunction(x, 4, -2) + >>> SingularityFunction(x, 4, 5).rewrite(Piecewise) + Piecewise(((x - 4)**5, x >= 4), (0, True)) + >>> expr = SingularityFunction(x, a, n) + >>> y = Symbol('y', positive=True) + >>> n = Symbol('n', nonnegative=True) + >>> expr.subs({x: y, a: -10, n: n}) + (y + 10)**n + + The methods ``rewrite(DiracDelta)``, ``rewrite(Heaviside)``, and + ``rewrite('HeavisideDiracDelta')`` returns the same output. One can use any + of these methods according to their choice. + + >>> expr = SingularityFunction(x, 4, 5) + SingularityFunction(x, -3, -1) - SingularityFunction(x, 0, -2) + >>> expr.rewrite(Heaviside) + (x - 4)**5*Heaviside(x - 4, 1) + DiracDelta(x + 3) - DiracDelta(x, 1) + >>> expr.rewrite(DiracDelta) + (x - 4)**5*Heaviside(x - 4, 1) + DiracDelta(x + 3) - DiracDelta(x, 1) + >>> expr.rewrite('HeavisideDiracDelta') + (x - 4)**5*Heaviside(x - 4, 1) + DiracDelta(x + 3) - DiracDelta(x, 1) + + See Also + ======== + + DiracDelta, Heaviside + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Singularity_function + + """ + + is_real = True + + def fdiff(self, argindex=1): + """ + Returns the first derivative of a DiracDelta Function. + + Explanation + =========== + + The difference between ``diff()`` and ``fdiff()`` is: ``diff()`` is the + user-level function and ``fdiff()`` is an object method. ``fdiff()`` is + a convenience method available in the ``Function`` class. It returns + the derivative of the function without considering the chain rule. + ``diff(function, x)`` calls ``Function._eval_derivative`` which in turn + calls ``fdiff()`` internally to compute the derivative of the function. + + """ + + if argindex == 1: + x, a, n = self.args + if n in (S.Zero, S.NegativeOne, S(-2), S(-3)): + return self.func(x, a, n-1) + elif n.is_positive: + return n*self.func(x, a, n-1) + else: + raise ArgumentIndexError(self, argindex) + + @classmethod + def eval(cls, variable, offset, exponent): + """ + Returns a simplified form or a value of Singularity Function depending + on the argument passed by the object. + + Explanation + =========== + + The ``eval()`` method is automatically called when the + ``SingularityFunction`` class is about to be instantiated and it + returns either some simplified instance or the unevaluated instance + depending on the argument passed. In other words, ``eval()`` method is + not needed to be called explicitly, it is being called and evaluated + once the object is called. 
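As an illustrative aside (using the generic ``evaluate`` flag that SymPy functions accept, not anything specific to this class), the automatic call to ``eval()`` at construction time can be bypassed, which makes its role visible:

>>> from sympy import SingularityFunction
>>> SingularityFunction(5, 3, 2)
4
>>> SingularityFunction(5, 3, 2, evaluate=False)
SingularityFunction(5, 3, 2)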
+ + Examples + ======== + + >>> from sympy import SingularityFunction, Symbol, nan + >>> from sympy.abc import x, a, n + >>> SingularityFunction(x, a, n) + SingularityFunction(x, a, n) + >>> SingularityFunction(5, 3, 2) + 4 + >>> SingularityFunction(x, a, nan) + nan + >>> SingularityFunction(x, 3, 0).subs(x, 3) + 1 + >>> SingularityFunction(4, 1, 5) + 243 + >>> x = Symbol('x', positive = True) + >>> a = Symbol('a', negative = True) + >>> n = Symbol('n', nonnegative = True) + >>> SingularityFunction(x, a, n) + (-a + x)**n + >>> x = Symbol('x', negative = True) + >>> a = Symbol('a', positive = True) + >>> SingularityFunction(x, a, n) + 0 + + """ + + x = variable + a = offset + n = exponent + shift = (x - a) + + if fuzzy_not(im(shift).is_zero): + raise ValueError("Singularity Functions are defined only for Real Numbers.") + if fuzzy_not(im(n).is_zero): + raise ValueError("Singularity Functions are not defined for imaginary exponents.") + if shift is S.NaN or n is S.NaN: + return S.NaN + if (n + 4).is_negative: + raise ValueError("Singularity Functions are not defined for exponents less than -4.") + if shift.is_extended_negative: + return S.Zero + if n.is_nonnegative: + if shift.is_zero: # use literal 0 in case of Symbol('z', zero=True) + return S.Zero**n + if shift.is_extended_nonnegative: + return shift**n + if n in (S.NegativeOne, -2, -3, -4): + if shift.is_negative or shift.is_extended_positive: + return S.Zero + if shift.is_zero: + return oo + + def _eval_rewrite_as_Piecewise(self, *args, **kwargs): + ''' + Converts a Singularity Function expression into its Piecewise form. + + ''' + x, a, n = self.args + + if n in (S.NegativeOne, S(-2), S(-3), S(-4)): + return Piecewise((oo, Eq(x - a, 0)), (0, True)) + elif n.is_nonnegative: + return Piecewise(((x - a)**n, x - a >= 0), (0, True)) + + def _eval_rewrite_as_Heaviside(self, *args, **kwargs): + ''' + Rewrites a Singularity Function expression using Heavisides and DiracDeltas. 
+ + ''' + x, a, n = self.args + + if n == -4: + return diff(Heaviside(x - a), x.free_symbols.pop(), 4) + if n == -3: + return diff(Heaviside(x - a), x.free_symbols.pop(), 3) + if n == -2: + return diff(Heaviside(x - a), x.free_symbols.pop(), 2) + if n == -1: + return diff(Heaviside(x - a), x.free_symbols.pop(), 1) + if n.is_nonnegative: + return (x - a)**n*Heaviside(x - a, 1) + + def _eval_as_leading_term(self, x, logx, cdir): + z, a, n = self.args + shift = (z - a).subs(x, 0) + if n < 0: + return S.Zero + elif n.is_zero and shift.is_zero: + return S.Zero if cdir == -1 else S.One + elif shift.is_positive: + return shift**n + return S.Zero + + def _eval_nseries(self, x, n, logx=None, cdir=0): + z, a, n = self.args + shift = (z - a).subs(x, 0) + if n < 0: + return S.Zero + elif n.is_zero and shift.is_zero: + return S.Zero if cdir == -1 else S.One + elif shift.is_positive: + return ((z - a)**n)._eval_nseries(x, n, logx=logx, cdir=cdir) + return S.Zero + + _eval_rewrite_as_DiracDelta = _eval_rewrite_as_Heaviside + _eval_rewrite_as_HeavisideDiracDelta = _eval_rewrite_as_Heaviside diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/special/spherical_harmonics.py b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/special/spherical_harmonics.py new file mode 100644 index 0000000000000000000000000000000000000000..541546b75e882b43c41814b5e92bb85ee41628d1 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/special/spherical_harmonics.py @@ -0,0 +1,334 @@ +from sympy.core.expr import Expr +from sympy.core.function import DefinedFunction, ArgumentIndexError +from sympy.core.numbers import I, pi +from sympy.core.singleton import S +from sympy.core.symbol import Dummy +from sympy.functions import assoc_legendre +from sympy.functions.combinatorial.factorials import factorial +from sympy.functions.elementary.complexes import Abs, conjugate +from sympy.functions.elementary.exponential import exp +from sympy.functions.elementary.miscellaneous import sqrt +from sympy.functions.elementary.trigonometric import sin, cos, cot + +_x = Dummy("x") + +class Ynm(DefinedFunction): + r""" + Spherical harmonics defined as + + .. math:: + Y_n^m(\theta, \varphi) := \sqrt{\frac{(2n+1)(n-m)!}{4\pi(n+m)!}} + \exp(i m \varphi) + \mathrm{P}_n^m\left(\cos(\theta)\right) + + Explanation + =========== + + ``Ynm()`` gives the spherical harmonic function of order $n$ and $m$ + in $\theta$ and $\varphi$, $Y_n^m(\theta, \varphi)$. The four + parameters are as follows: $n \geq 0$ an integer and $m$ an integer + such that $-n \leq m \leq n$ holds. The two angles are real-valued + with $\theta \in [0, \pi]$ and $\varphi \in [0, 2\pi]$. 
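For example, specializing the definition to $n = 1$, $m = 0$ (where $\mathrm{P}_1^0(u) = u$) gives the familiar closed form, in agreement with the doctests below:

.. math::
    Y_1^0(\theta, \varphi)
    = \sqrt{\frac{(2 \cdot 1 + 1)\, 1!}{4\pi\, 1!}}\, e^{0}\,
      \mathrm{P}_1^0\!\left(\cos(\theta)\right)
    = \frac{\sqrt{3}}{2\sqrt{\pi}} \cos(\theta)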
+ + Examples + ======== + + >>> from sympy import Ynm, Symbol, simplify + >>> from sympy.abc import n,m + >>> theta = Symbol("theta") + >>> phi = Symbol("phi") + + >>> Ynm(n, m, theta, phi) + Ynm(n, m, theta, phi) + + Several symmetries are known, for the order: + + >>> Ynm(n, -m, theta, phi) + (-1)**m*exp(-2*I*m*phi)*Ynm(n, m, theta, phi) + + As well as for the angles: + + >>> Ynm(n, m, -theta, phi) + Ynm(n, m, theta, phi) + + >>> Ynm(n, m, theta, -phi) + exp(-2*I*m*phi)*Ynm(n, m, theta, phi) + + For specific integers $n$ and $m$ we can evaluate the harmonics + to more useful expressions: + + >>> simplify(Ynm(0, 0, theta, phi).expand(func=True)) + 1/(2*sqrt(pi)) + + >>> simplify(Ynm(1, -1, theta, phi).expand(func=True)) + sqrt(6)*exp(-I*phi)*sin(theta)/(4*sqrt(pi)) + + >>> simplify(Ynm(1, 0, theta, phi).expand(func=True)) + sqrt(3)*cos(theta)/(2*sqrt(pi)) + + >>> simplify(Ynm(1, 1, theta, phi).expand(func=True)) + -sqrt(6)*exp(I*phi)*sin(theta)/(4*sqrt(pi)) + + >>> simplify(Ynm(2, -2, theta, phi).expand(func=True)) + sqrt(30)*exp(-2*I*phi)*sin(theta)**2/(8*sqrt(pi)) + + >>> simplify(Ynm(2, -1, theta, phi).expand(func=True)) + sqrt(30)*exp(-I*phi)*sin(2*theta)/(8*sqrt(pi)) + + >>> simplify(Ynm(2, 0, theta, phi).expand(func=True)) + sqrt(5)*(3*cos(theta)**2 - 1)/(4*sqrt(pi)) + + >>> simplify(Ynm(2, 1, theta, phi).expand(func=True)) + -sqrt(30)*exp(I*phi)*sin(2*theta)/(8*sqrt(pi)) + + >>> simplify(Ynm(2, 2, theta, phi).expand(func=True)) + sqrt(30)*exp(2*I*phi)*sin(theta)**2/(8*sqrt(pi)) + + We can differentiate the functions with respect + to both angles: + + >>> from sympy import Ynm, Symbol, diff + >>> from sympy.abc import n,m + >>> theta = Symbol("theta") + >>> phi = Symbol("phi") + + >>> diff(Ynm(n, m, theta, phi), theta) + m*cot(theta)*Ynm(n, m, theta, phi) + sqrt((-m + n)*(m + n + 1))*exp(-I*phi)*Ynm(n, m + 1, theta, phi) + + >>> diff(Ynm(n, m, theta, phi), phi) + I*m*Ynm(n, m, theta, phi) + + Further we can compute the complex conjugation: + + >>> from sympy import Ynm, Symbol, conjugate + >>> from sympy.abc import n,m + >>> theta = Symbol("theta") + >>> phi = Symbol("phi") + + >>> conjugate(Ynm(n, m, theta, phi)) + (-1)**(2*m)*exp(-2*I*m*phi)*Ynm(n, m, theta, phi) + + To get back the well known expressions in spherical + coordinates, we use full expansion: + + >>> from sympy import Ynm, Symbol, expand_func + >>> from sympy.abc import n,m + >>> theta = Symbol("theta") + >>> phi = Symbol("phi") + + >>> expand_func(Ynm(n, m, theta, phi)) + sqrt((2*n + 1)*factorial(-m + n)/factorial(m + n))*exp(I*m*phi)*assoc_legendre(n, m, cos(theta))/(2*sqrt(pi)) + + See Also + ======== + + Ynm_c, Znm + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Spherical_harmonics + .. [2] https://mathworld.wolfram.com/SphericalHarmonic.html + .. [3] https://functions.wolfram.com/Polynomials/SphericalHarmonicY/ + .. 
[4] https://dlmf.nist.gov/14.30 + + """ + + @classmethod + def eval(cls, n, m, theta, phi): + # Handle negative index m and arguments theta, phi + if m.could_extract_minus_sign(): + m = -m + return S.NegativeOne**m * exp(-2*I*m*phi) * Ynm(n, m, theta, phi) + if theta.could_extract_minus_sign(): + theta = -theta + return Ynm(n, m, theta, phi) + if phi.could_extract_minus_sign(): + phi = -phi + return exp(-2*I*m*phi) * Ynm(n, m, theta, phi) + + # TODO Add more simplififcation here + + def _eval_expand_func(self, **hints): + n, m, theta, phi = self.args + rv = (sqrt((2*n + 1)/(4*pi) * factorial(n - m)/factorial(n + m)) * + exp(I*m*phi) * assoc_legendre(n, m, cos(theta))) + # We can do this because of the range of theta + return rv.subs(sqrt(-cos(theta)**2 + 1), sin(theta)) + + def fdiff(self, argindex=4): + if argindex == 1: + # Diff wrt n + raise ArgumentIndexError(self, argindex) + elif argindex == 2: + # Diff wrt m + raise ArgumentIndexError(self, argindex) + elif argindex == 3: + # Diff wrt theta + n, m, theta, phi = self.args + return (m * cot(theta) * Ynm(n, m, theta, phi) + + sqrt((n - m)*(n + m + 1)) * exp(-I*phi) * Ynm(n, m + 1, theta, phi)) + elif argindex == 4: + # Diff wrt phi + n, m, theta, phi = self.args + return I * m * Ynm(n, m, theta, phi) + else: + raise ArgumentIndexError(self, argindex) + + def _eval_rewrite_as_polynomial(self, n, m, theta, phi, **kwargs): + # TODO: Make sure n \in N + # TODO: Assert |m| <= n ortherwise we should return 0 + return self.expand(func=True) + + def _eval_rewrite_as_sin(self, n, m, theta, phi, **kwargs): + return self.rewrite(cos) + + def _eval_rewrite_as_cos(self, n, m, theta, phi, **kwargs): + # This method can be expensive due to extensive use of simplification! + from sympy.simplify import simplify, trigsimp + # TODO: Make sure n \in N + # TODO: Assert |m| <= n ortherwise we should return 0 + term = simplify(self.expand(func=True)) + # We can do this because of the range of theta + term = term.xreplace({Abs(sin(theta)):sin(theta)}) + return simplify(trigsimp(term)) + + def _eval_conjugate(self): + # TODO: Make sure theta \in R and phi \in R + n, m, theta, phi = self.args + return S.NegativeOne**m * self.func(n, -m, theta, phi) + + def as_real_imag(self, deep=True, **hints): + # TODO: Handle deep and hints + n, m, theta, phi = self.args + re = (sqrt((2*n + 1)/(4*pi) * factorial(n - m)/factorial(n + m)) * + cos(m*phi) * assoc_legendre(n, m, cos(theta))) + im = (sqrt((2*n + 1)/(4*pi) * factorial(n - m)/factorial(n + m)) * + sin(m*phi) * assoc_legendre(n, m, cos(theta))) + return (re, im) + + def _eval_evalf(self, prec): + # Note: works without this function by just calling + # mpmath for Legendre polynomials. But using + # the dedicated function directly is cleaner. + from mpmath import mp, workprec + n = self.args[0]._to_mpmath(prec) + m = self.args[1]._to_mpmath(prec) + theta = self.args[2]._to_mpmath(prec) + phi = self.args[3]._to_mpmath(prec) + with workprec(prec): + res = mp.spherharm(n, m, theta, phi) + return Expr._from_mpmath(res, prec) + + +def Ynm_c(n, m, theta, phi): + r""" + Conjugate spherical harmonics defined as + + .. math:: + \overline{Y_n^m(\theta, \varphi)} := (-1)^m Y_n^{-m}(\theta, \varphi). 
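Concretely, for $n = 1$, $m = 1$ the definition reduces to $\overline{Y_1^1} = -Y_1^{-1}$, which can be verified after expansion (a small illustrative check; both harmonics expand to multiples of $e^{-i\varphi}\sin(\theta)$, and ``simplify`` is the standard SymPy helper):

>>> from sympy import Ynm, Ynm_c, Symbol, simplify
>>> theta = Symbol("theta")
>>> phi = Symbol("phi")
>>> simplify(Ynm_c(1, 1, theta, phi).expand(func=True) + Ynm(1, -1, theta, phi).expand(func=True))
0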
+ + Examples + ======== + + >>> from sympy import Ynm_c, Symbol, simplify + >>> from sympy.abc import n,m + >>> theta = Symbol("theta") + >>> phi = Symbol("phi") + >>> Ynm_c(n, m, theta, phi) + (-1)**(2*m)*exp(-2*I*m*phi)*Ynm(n, m, theta, phi) + >>> Ynm_c(n, m, -theta, phi) + (-1)**(2*m)*exp(-2*I*m*phi)*Ynm(n, m, theta, phi) + + For specific integers $n$ and $m$ we can evaluate the harmonics + to more useful expressions: + + >>> simplify(Ynm_c(0, 0, theta, phi).expand(func=True)) + 1/(2*sqrt(pi)) + >>> simplify(Ynm_c(1, -1, theta, phi).expand(func=True)) + sqrt(6)*exp(I*(-phi + 2*conjugate(phi)))*sin(theta)/(4*sqrt(pi)) + + See Also + ======== + + Ynm, Znm + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Spherical_harmonics + .. [2] https://mathworld.wolfram.com/SphericalHarmonic.html + .. [3] https://functions.wolfram.com/Polynomials/SphericalHarmonicY/ + + """ + return conjugate(Ynm(n, m, theta, phi)) + + +class Znm(DefinedFunction): + r""" + Real spherical harmonics defined as + + .. math:: + + Z_n^m(\theta, \varphi) := + \begin{cases} + \frac{Y_n^m(\theta, \varphi) + \overline{Y_n^m(\theta, \varphi)}}{\sqrt{2}} &\quad m > 0 \\ + Y_n^m(\theta, \varphi) &\quad m = 0 \\ + \frac{Y_n^m(\theta, \varphi) - \overline{Y_n^m(\theta, \varphi)}}{i \sqrt{2}} &\quad m < 0 \\ + \end{cases} + + which gives in simplified form + + .. math:: + + Z_n^m(\theta, \varphi) = + \begin{cases} + \frac{Y_n^m(\theta, \varphi) + (-1)^m Y_n^{-m}(\theta, \varphi)}{\sqrt{2}} &\quad m > 0 \\ + Y_n^m(\theta, \varphi) &\quad m = 0 \\ + \frac{Y_n^m(\theta, \varphi) - (-1)^m Y_n^{-m}(\theta, \varphi)}{i \sqrt{2}} &\quad m < 0 \\ + \end{cases} + + Examples + ======== + + >>> from sympy import Znm, Symbol, simplify + >>> from sympy.abc import n, m + >>> theta = Symbol("theta") + >>> phi = Symbol("phi") + >>> Znm(n, m, theta, phi) + Znm(n, m, theta, phi) + + For specific integers n and m we can evaluate the harmonics + to more useful expressions: + + >>> simplify(Znm(0, 0, theta, phi).expand(func=True)) + 1/(2*sqrt(pi)) + >>> simplify(Znm(1, 1, theta, phi).expand(func=True)) + -sqrt(3)*sin(theta)*cos(phi)/(2*sqrt(pi)) + >>> simplify(Znm(2, 1, theta, phi).expand(func=True)) + -sqrt(15)*sin(2*theta)*cos(phi)/(4*sqrt(pi)) + + See Also + ======== + + Ynm, Ynm_c + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Spherical_harmonics + .. [2] https://mathworld.wolfram.com/SphericalHarmonic.html + .. 
[3] https://functions.wolfram.com/Polynomials/SphericalHarmonicY/ + + """ + + @classmethod + def eval(cls, n, m, theta, phi): + if m.is_positive: + zz = (Ynm(n, m, theta, phi) + Ynm_c(n, m, theta, phi)) / sqrt(2) + return zz + elif m.is_zero: + return Ynm(n, m, theta, phi) + elif m.is_negative: + zz = (Ynm(n, m, theta, phi) - Ynm_c(n, m, theta, phi)) / (sqrt(2)*I) + return zz diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/special/tensor_functions.py b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/special/tensor_functions.py new file mode 100644 index 0000000000000000000000000000000000000000..6d996a58cbc8320620c9a1f6e68529c3b5e99aef --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/special/tensor_functions.py @@ -0,0 +1,474 @@ +from math import prod + +from sympy.core import S, Integer +from sympy.core.function import DefinedFunction +from sympy.core.logic import fuzzy_not +from sympy.core.relational import Ne +from sympy.core.sorting import default_sort_key +from sympy.external.gmpy import SYMPY_INTS +from sympy.functions.combinatorial.factorials import factorial +from sympy.functions.elementary.piecewise import Piecewise +from sympy.utilities.iterables import has_dups + +############################################################################### +###################### Kronecker Delta, Levi-Civita etc. ###################### +############################################################################### + + +def Eijk(*args, **kwargs): + """ + Represent the Levi-Civita symbol. + + This is a compatibility wrapper to ``LeviCivita()``. + + See Also + ======== + + LeviCivita + + """ + return LeviCivita(*args, **kwargs) + + +def eval_levicivita(*args): + """Evaluate Levi-Civita symbol.""" + n = len(args) + return prod( + prod(args[j] - args[i] for j in range(i + 1, n)) + / factorial(i) for i in range(n)) + # converting factorial(i) to int is slightly faster + + +class LeviCivita(DefinedFunction): + """ + Represent the Levi-Civita symbol. + + Explanation + =========== + + For even permutations of indices it returns 1, for odd permutations -1, and + for everything else (a repeated index) it returns 0. + + Thus it represents an alternating pseudotensor. + + Examples + ======== + + >>> from sympy import LeviCivita + >>> from sympy.abc import i, j, k + >>> LeviCivita(1, 2, 3) + 1 + >>> LeviCivita(1, 3, 2) + -1 + >>> LeviCivita(1, 2, 2) + 0 + >>> LeviCivita(i, j, k) + LeviCivita(i, j, k) + >>> LeviCivita(i, j, i) + 0 + + See Also + ======== + + Eijk + + """ + + is_integer = True + + @classmethod + def eval(cls, *args): + if all(isinstance(a, (SYMPY_INTS, Integer)) for a in args): + return eval_levicivita(*args) + if has_dups(args): + return S.Zero + + def doit(self, **hints): + return eval_levicivita(*self.args) + + +class KroneckerDelta(DefinedFunction): + """ + The discrete, or Kronecker, delta function. + + Explanation + =========== + + A function that takes in two integers $i$ and $j$. It returns $0$ if $i$ + and $j$ are not equal, or it returns $1$ if $i$ and $j$ are equal. 
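One further behaviour worth knowing (a small sketch of what the ``eval`` method below implements): the two indices are put into a canonical sorted order on creation, so structurally different calls produce the same object:

>>> from sympy import KroneckerDelta
>>> from sympy.abc import i, j
>>> KroneckerDelta(j, i)
KroneckerDelta(i, j)
>>> KroneckerDelta(j, i) == KroneckerDelta(i, j)
True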
+ + Examples + ======== + + An example with integer indices: + + >>> from sympy import KroneckerDelta + >>> KroneckerDelta(1, 2) + 0 + >>> KroneckerDelta(3, 3) + 1 + + Symbolic indices: + + >>> from sympy.abc import i, j, k + >>> KroneckerDelta(i, j) + KroneckerDelta(i, j) + >>> KroneckerDelta(i, i) + 1 + >>> KroneckerDelta(i, i + 1) + 0 + >>> KroneckerDelta(i, i + 1 + k) + KroneckerDelta(i, i + k + 1) + + Parameters + ========== + + i : Number, Symbol + The first index of the delta function. + j : Number, Symbol + The second index of the delta function. + + See Also + ======== + + eval + DiracDelta + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Kronecker_delta + + """ + + is_integer = True + + @classmethod + def eval(cls, i, j, delta_range=None): + """ + Evaluates the discrete delta function. + + Examples + ======== + + >>> from sympy import KroneckerDelta + >>> from sympy.abc import i, j, k + + >>> KroneckerDelta(i, j) + KroneckerDelta(i, j) + >>> KroneckerDelta(i, i) + 1 + >>> KroneckerDelta(i, i + 1) + 0 + >>> KroneckerDelta(i, i + 1 + k) + KroneckerDelta(i, i + k + 1) + + # indirect doctest + + """ + + if delta_range is not None: + dinf, dsup = delta_range + if (dinf - i > 0) == True: + return S.Zero + if (dinf - j > 0) == True: + return S.Zero + if (dsup - i < 0) == True: + return S.Zero + if (dsup - j < 0) == True: + return S.Zero + + diff = i - j + if diff.is_zero: + return S.One + elif fuzzy_not(diff.is_zero): + return S.Zero + + if i.assumptions0.get("below_fermi") and \ + j.assumptions0.get("above_fermi"): + return S.Zero + if j.assumptions0.get("below_fermi") and \ + i.assumptions0.get("above_fermi"): + return S.Zero + # to make KroneckerDelta canonical + # following lines will check if inputs are in order + # if not, will return KroneckerDelta with correct order + if default_sort_key(j) < default_sort_key(i): + if delta_range: + return cls(j, i, delta_range) + else: + return cls(j, i) + + @property + def delta_range(self): + if len(self.args) > 2: + return self.args[2] + + def _eval_power(self, expt): + if expt.is_positive: + return self + if expt.is_negative and expt is not S.NegativeOne: + return 1/self + + @property + def is_above_fermi(self): + """ + True if Delta can be non-zero above fermi. + + Examples + ======== + + >>> from sympy import KroneckerDelta, Symbol + >>> a = Symbol('a', above_fermi=True) + >>> i = Symbol('i', below_fermi=True) + >>> p = Symbol('p') + >>> q = Symbol('q') + >>> KroneckerDelta(p, a).is_above_fermi + True + >>> KroneckerDelta(p, i).is_above_fermi + False + >>> KroneckerDelta(p, q).is_above_fermi + True + + See Also + ======== + + is_below_fermi, is_only_below_fermi, is_only_above_fermi + + """ + if self.args[0].assumptions0.get("below_fermi"): + return False + if self.args[1].assumptions0.get("below_fermi"): + return False + return True + + @property + def is_below_fermi(self): + """ + True if Delta can be non-zero below fermi. 
+ + Examples + ======== + + >>> from sympy import KroneckerDelta, Symbol + >>> a = Symbol('a', above_fermi=True) + >>> i = Symbol('i', below_fermi=True) + >>> p = Symbol('p') + >>> q = Symbol('q') + >>> KroneckerDelta(p, a).is_below_fermi + False + >>> KroneckerDelta(p, i).is_below_fermi + True + >>> KroneckerDelta(p, q).is_below_fermi + True + + See Also + ======== + + is_above_fermi, is_only_above_fermi, is_only_below_fermi + + """ + if self.args[0].assumptions0.get("above_fermi"): + return False + if self.args[1].assumptions0.get("above_fermi"): + return False + return True + + @property + def is_only_above_fermi(self): + """ + True if Delta is restricted to above fermi. + + Examples + ======== + + >>> from sympy import KroneckerDelta, Symbol + >>> a = Symbol('a', above_fermi=True) + >>> i = Symbol('i', below_fermi=True) + >>> p = Symbol('p') + >>> q = Symbol('q') + >>> KroneckerDelta(p, a).is_only_above_fermi + True + >>> KroneckerDelta(p, q).is_only_above_fermi + False + >>> KroneckerDelta(p, i).is_only_above_fermi + False + + See Also + ======== + + is_above_fermi, is_below_fermi, is_only_below_fermi + + """ + return ( self.args[0].assumptions0.get("above_fermi") + or + self.args[1].assumptions0.get("above_fermi") + ) or False + + @property + def is_only_below_fermi(self): + """ + True if Delta is restricted to below fermi. + + Examples + ======== + + >>> from sympy import KroneckerDelta, Symbol + >>> a = Symbol('a', above_fermi=True) + >>> i = Symbol('i', below_fermi=True) + >>> p = Symbol('p') + >>> q = Symbol('q') + >>> KroneckerDelta(p, i).is_only_below_fermi + True + >>> KroneckerDelta(p, q).is_only_below_fermi + False + >>> KroneckerDelta(p, a).is_only_below_fermi + False + + See Also + ======== + + is_above_fermi, is_below_fermi, is_only_above_fermi + + """ + return ( self.args[0].assumptions0.get("below_fermi") + or + self.args[1].assumptions0.get("below_fermi") + ) or False + + @property + def indices_contain_equal_information(self): + """ + Returns True if indices are either both above or below fermi. + + Examples + ======== + + >>> from sympy import KroneckerDelta, Symbol + >>> a = Symbol('a', above_fermi=True) + >>> i = Symbol('i', below_fermi=True) + >>> p = Symbol('p') + >>> q = Symbol('q') + >>> KroneckerDelta(p, q).indices_contain_equal_information + True + >>> KroneckerDelta(p, q+1).indices_contain_equal_information + True + >>> KroneckerDelta(i, p).indices_contain_equal_information + False + + """ + if (self.args[0].assumptions0.get("below_fermi") and + self.args[1].assumptions0.get("below_fermi")): + return True + if (self.args[0].assumptions0.get("above_fermi") + and self.args[1].assumptions0.get("above_fermi")): + return True + + # if both indices are general we are True, else false + return self.is_below_fermi and self.is_above_fermi + + @property + def preferred_index(self): + """ + Returns the index which is preferred to keep in the final expression. + + Explanation + =========== + + The preferred index is the index with more information regarding fermi + level. If indices contain the same information, 'a' is preferred before + 'b'. 
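# --- Editorial sketch (not part of the vendored sympy sources) --------------
# Typical use of preferred_index/killable_index: when a KroneckerDelta
# multiplies an expression, substituting the killable index by the preferred
# one collapses the delta.  The function name f is illustrative only.
from sympy import Function, KroneckerDelta, Symbol

p = Symbol('p')
i = Symbol('i', below_fermi=True)
f = Function('f')

d = KroneckerDelta(p, i)
assert (d.preferred_index, d.killable_index) == (i, p)
assert (d*f(p)).subs(d.killable_index, d.preferred_index) == f(i)
# -----------------------------------------------------------------------------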
+ + Examples + ======== + + >>> from sympy import KroneckerDelta, Symbol + >>> a = Symbol('a', above_fermi=True) + >>> i = Symbol('i', below_fermi=True) + >>> j = Symbol('j', below_fermi=True) + >>> p = Symbol('p') + >>> KroneckerDelta(p, i).preferred_index + i + >>> KroneckerDelta(p, a).preferred_index + a + >>> KroneckerDelta(i, j).preferred_index + i + + See Also + ======== + + killable_index + + """ + if self._get_preferred_index(): + return self.args[1] + else: + return self.args[0] + + @property + def killable_index(self): + """ + Returns the index which is preferred to substitute in the final + expression. + + Explanation + =========== + + The index to substitute is the index with less information regarding + fermi level. If indices contain the same information, 'a' is preferred + before 'b'. + + Examples + ======== + + >>> from sympy import KroneckerDelta, Symbol + >>> a = Symbol('a', above_fermi=True) + >>> i = Symbol('i', below_fermi=True) + >>> j = Symbol('j', below_fermi=True) + >>> p = Symbol('p') + >>> KroneckerDelta(p, i).killable_index + p + >>> KroneckerDelta(p, a).killable_index + p + >>> KroneckerDelta(i, j).killable_index + j + + See Also + ======== + + preferred_index + + """ + if self._get_preferred_index(): + return self.args[0] + else: + return self.args[1] + + def _get_preferred_index(self): + """ + Returns the index which is preferred to keep in the final expression. + + The preferred index is the index with more information regarding fermi + level. If indices contain the same information, index 0 is returned. + + """ + if not self.is_above_fermi: + if self.args[0].assumptions0.get("below_fermi"): + return 0 + else: + return 1 + elif not self.is_below_fermi: + if self.args[0].assumptions0.get("above_fermi"): + return 0 + else: + return 1 + else: + return 0 + + @property + def indices(self): + return self.args[0:2] + + def _eval_rewrite_as_Piecewise(self, *args, **kwargs): + i, j = args + return Piecewise((0, Ne(i, j)), (1, True)) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/special/zeta_functions.py b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/special/zeta_functions.py new file mode 100644 index 0000000000000000000000000000000000000000..8f410f0f1086de91490c714cd3becf11df9ab189 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/functions/special/zeta_functions.py @@ -0,0 +1,786 @@ +""" Riemann zeta and related function. 
""" + +from sympy.core.add import Add +from sympy.core.cache import cacheit +from sympy.core.function import ArgumentIndexError, expand_mul, DefinedFunction +from sympy.core.logic import fuzzy_not +from sympy.core.numbers import pi, I, Integer +from sympy.core.relational import Eq +from sympy.core.singleton import S +from sympy.core.symbol import Dummy +from sympy.core.sympify import sympify +from sympy.functions.combinatorial.numbers import bernoulli, factorial, genocchi, harmonic +from sympy.functions.elementary.complexes import re, unpolarify, Abs, polar_lift +from sympy.functions.elementary.exponential import log, exp_polar, exp +from sympy.functions.elementary.integers import ceiling, floor +from sympy.functions.elementary.miscellaneous import sqrt +from sympy.functions.elementary.piecewise import Piecewise +from sympy.polys.polytools import Poly + +############################################################################### +###################### LERCH TRANSCENDENT ##################################### +############################################################################### + + +class lerchphi(DefinedFunction): + r""" + Lerch transcendent (Lerch phi function). + + Explanation + =========== + + For $\operatorname{Re}(a) > 0$, $|z| < 1$ and $s \in \mathbb{C}$, the + Lerch transcendent is defined as + + .. math :: \Phi(z, s, a) = \sum_{n=0}^\infty \frac{z^n}{(n + a)^s}, + + where the standard branch of the argument is used for $n + a$, + and by analytic continuation for other values of the parameters. + + A commonly used related function is the Lerch zeta function, defined by + + .. math:: L(q, s, a) = \Phi(e^{2\pi i q}, s, a). + + **Analytic Continuation and Branching Behavior** + + It can be shown that + + .. math:: \Phi(z, s, a) = z\Phi(z, s, a+1) + a^{-s}. + + This provides the analytic continuation to $\operatorname{Re}(a) \le 0$. + + Assume now $\operatorname{Re}(a) > 0$. The integral representation + + .. math:: \Phi_0(z, s, a) = \int_0^\infty \frac{t^{s-1} e^{-at}}{1 - ze^{-t}} + \frac{\mathrm{d}t}{\Gamma(s)} + + provides an analytic continuation to $\mathbb{C} - [1, \infty)$. + Finally, for $x \in (1, \infty)$ we find + + .. math:: \lim_{\epsilon \to 0^+} \Phi_0(x + i\epsilon, s, a) + -\lim_{\epsilon \to 0^+} \Phi_0(x - i\epsilon, s, a) + = \frac{2\pi i \log^{s-1}{x}}{x^a \Gamma(s)}, + + using the standard branch for both $\log{x}$ and + $\log{\log{x}}$ (a branch of $\log{\log{x}}$ is needed to + evaluate $\log{x}^{s-1}$). + This concludes the analytic continuation. The Lerch transcendent is thus + branched at $z \in \{0, 1, \infty\}$ and + $a \in \mathbb{Z}_{\le 0}$. For fixed $z, a$ outside these + branch points, it is an entire function of $s$. + + Examples + ======== + + The Lerch transcendent is a fairly general function, for this reason it does + not automatically evaluate to simpler functions. Use ``expand_func()`` to + achieve this. 
+ + If $z=1$, the Lerch transcendent reduces to the Hurwitz zeta function: + + >>> from sympy import lerchphi, expand_func + >>> from sympy.abc import z, s, a + >>> expand_func(lerchphi(1, s, a)) + zeta(s, a) + + More generally, if $z$ is a root of unity, the Lerch transcendent + reduces to a sum of Hurwitz zeta functions: + + >>> expand_func(lerchphi(-1, s, a)) + zeta(s, a/2)/2**s - zeta(s, a/2 + 1/2)/2**s + + If $a=1$, the Lerch transcendent reduces to the polylogarithm: + + >>> expand_func(lerchphi(z, s, 1)) + polylog(s, z)/z + + More generally, if $a$ is rational, the Lerch transcendent reduces + to a sum of polylogarithms: + + >>> from sympy import S + >>> expand_func(lerchphi(z, s, S(1)/2)) + 2**(s - 1)*(polylog(s, sqrt(z))/sqrt(z) - + polylog(s, sqrt(z)*exp_polar(I*pi))/sqrt(z)) + >>> expand_func(lerchphi(z, s, S(3)/2)) + -2**s/z + 2**(s - 1)*(polylog(s, sqrt(z))/sqrt(z) - + polylog(s, sqrt(z)*exp_polar(I*pi))/sqrt(z))/z + + The derivatives with respect to $z$ and $a$ can be computed in + closed form: + + >>> lerchphi(z, s, a).diff(z) + (-a*lerchphi(z, s, a) + lerchphi(z, s - 1, a))/z + >>> lerchphi(z, s, a).diff(a) + -s*lerchphi(z, s + 1, a) + + See Also + ======== + + polylog, zeta + + References + ========== + + .. [1] Bateman, H.; Erdelyi, A. (1953), Higher Transcendental Functions, + Vol. I, New York: McGraw-Hill. Section 1.11. + .. [2] https://dlmf.nist.gov/25.14 + .. [3] https://en.wikipedia.org/wiki/Lerch_transcendent + + """ + + def _eval_expand_func(self, **hints): + z, s, a = self.args + if z == 1: + return zeta(s, a) + if s.is_Integer and s <= 0: + t = Dummy('t') + p = Poly((t + a)**(-s), t) + start = 1/(1 - t) + res = S.Zero + for c in reversed(p.all_coeffs()): + res += c*start + start = t*start.diff(t) + return res.subs(t, z) + + if a.is_Rational: + # See section 18 of + # Kelly B. Roach. Hypergeometric Function Representations. + # In: Proceedings of the 1997 International Symposium on Symbolic and + # Algebraic Computation, pages 205-211, New York, 1997. ACM. + # TODO should something be polarified here? + add = S.Zero + mul = S.One + # First reduce a to the interaval (0, 1] + if a > 1: + n = floor(a) + if n == a: + n -= 1 + a -= n + mul = z**(-n) + add = Add(*[-z**(k - n)/(a + k)**s for k in range(n)]) + elif a <= 0: + n = floor(-a) + 1 + a += n + mul = z**n + add = Add(*[z**(n - 1 - k)/(a - k - 1)**s for k in range(n)]) + + m, n = S([a.p, a.q]) + zet = exp_polar(2*pi*I/n) + root = z**(1/n) + up_zet = unpolarify(zet) + addargs = [] + for k in range(n): + p = polylog(s, zet**k*root) + if isinstance(p, polylog): + p = p._eval_expand_func(**hints) + addargs.append(p/(up_zet**k*root)**m) + return add + mul*n**(s - 1)*Add(*addargs) + + # TODO use minpoly instead of ad-hoc methods when issue 5888 is fixed + if isinstance(z, exp) and (z.args[0]/(pi*I)).is_Rational or z in [-1, I, -I]: + # TODO reference? 
+ if z == -1: + p, q = S([1, 2]) + elif z == I: + p, q = S([1, 4]) + elif z == -I: + p, q = S([-1, 4]) + else: + arg = z.args[0]/(2*pi*I) + p, q = S([arg.p, arg.q]) + return Add(*[exp(2*pi*I*k*p/q)/q**s*zeta(s, (k + a)/q) + for k in range(q)]) + + return lerchphi(z, s, a) + + def fdiff(self, argindex=1): + z, s, a = self.args + if argindex == 3: + return -s*lerchphi(z, s + 1, a) + elif argindex == 1: + return (lerchphi(z, s - 1, a) - a*lerchphi(z, s, a))/z + else: + raise ArgumentIndexError + + def _eval_rewrite_helper(self, target): + res = self._eval_expand_func() + if res.has(target): + return res + else: + return self + + def _eval_rewrite_as_zeta(self, z, s, a, **kwargs): + return self._eval_rewrite_helper(zeta) + + def _eval_rewrite_as_polylog(self, z, s, a, **kwargs): + return self._eval_rewrite_helper(polylog) + +############################################################################### +###################### POLYLOGARITHM ########################################## +############################################################################### + + +class polylog(DefinedFunction): + r""" + Polylogarithm function. + + Explanation + =========== + + For $|z| < 1$ and $s \in \mathbb{C}$, the polylogarithm is + defined by + + .. math:: \operatorname{Li}_s(z) = \sum_{n=1}^\infty \frac{z^n}{n^s}, + + where the standard branch of the argument is used for $n$. It admits + an analytic continuation which is branched at $z=1$ (notably not on the + sheet of initial definition), $z=0$ and $z=\infty$. + + The name polylogarithm comes from the fact that for $s=1$, the + polylogarithm is related to the ordinary logarithm (see examples), and that + + .. math:: \operatorname{Li}_{s+1}(z) = + \int_0^z \frac{\operatorname{Li}_s(t)}{t} \mathrm{d}t. + + The polylogarithm is a special case of the Lerch transcendent: + + .. math:: \operatorname{Li}_{s}(z) = z \Phi(z, s, 1). + + Examples + ======== + + For $z \in \{0, 1, -1\}$, the polylogarithm is automatically expressed + using other functions: + + >>> from sympy import polylog + >>> from sympy.abc import s + >>> polylog(s, 0) + 0 + >>> polylog(s, 1) + zeta(s) + >>> polylog(s, -1) + -dirichlet_eta(s) + + If $s$ is a negative integer, $0$ or $1$, the polylogarithm can be + expressed using elementary functions. 
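# --- Editorial sketch (not part of the vendored sympy sources) --------------
# Two quick checks of the statements above: the dilogarithm special value
# Li_2(1/2) evaluates automatically (via the module's _dilogtable), and
# polylog agrees numerically with its lerchphi form (assumes mpmath-backed
# evalf is available):
from sympy import Rational, lerchphi, log, pi, polylog

assert polylog(2, Rational(1, 2)) == pi**2/12 - log(2)**2/2
z, s = Rational(1, 3), 2
assert abs((polylog(s, z) - z*lerchphi(z, s, 1)).evalf()) < 1e-10
# -----------------------------------------------------------------------------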
This can be done using + ``expand_func()``: + + >>> from sympy import expand_func + >>> from sympy.abc import z + >>> expand_func(polylog(1, z)) + -log(1 - z) + >>> expand_func(polylog(0, z)) + z/(1 - z) + + The derivative with respect to $z$ can be computed in closed form: + + >>> polylog(s, z).diff(z) + polylog(s - 1, z)/z + + The polylogarithm can be expressed in terms of the lerch transcendent: + + >>> from sympy import lerchphi + >>> polylog(s, z).rewrite(lerchphi) + z*lerchphi(z, s, 1) + + See Also + ======== + + zeta, lerchphi + + """ + + @classmethod + def eval(cls, s, z): + if z.is_number: + if z is S.One: + return zeta(s) + elif z is S.NegativeOne: + return -dirichlet_eta(s) + elif z is S.Zero: + return S.Zero + elif s == 2: + dilogtable = _dilogtable() + if z in dilogtable: + return dilogtable[z] + + if z.is_zero: + return S.Zero + + # Make an effort to determine if z is 1 to avoid replacing into + # expression with singularity + zone = z.equals(S.One) + + if zone: + return zeta(s) + elif zone is False: + # For s = 0 or -1 use explicit formulas to evaluate, but + # automatically expanding polylog(1, z) to -log(1-z) seems + # undesirable for summation methods based on hypergeometric + # functions + if s is S.Zero: + return z/(1 - z) + elif s is S.NegativeOne: + return z/(1 - z)**2 + if s.is_zero: + return z/(1 - z) + + # polylog is branched, but not over the unit disk + if z.has(exp_polar, polar_lift) and (zone or (Abs(z) <= S.One) == True): + return cls(s, unpolarify(z)) + + def fdiff(self, argindex=1): + s, z = self.args + if argindex == 2: + return polylog(s - 1, z)/z + raise ArgumentIndexError + + def _eval_rewrite_as_lerchphi(self, s, z, **kwargs): + return z*lerchphi(z, s, 1) + + def _eval_expand_func(self, **hints): + s, z = self.args + if s == 1: + return -log(1 - z) + if s.is_Integer and s <= 0: + u = Dummy('u') + start = u/(1 - u) + for _ in range(-s): + start = u*start.diff(u) + return expand_mul(start).subs(u, z) + return polylog(s, z) + + def _eval_is_zero(self): + z = self.args[1] + if z.is_zero: + return True + + def _eval_nseries(self, x, n, logx, cdir=0): + from sympy.series.order import Order + nu, z = self.args + + z0 = z.subs(x, 0) + if z0 is S.NaN: + z0 = z.limit(x, 0, dir='-' if re(cdir).is_negative else '+') + + if z0.is_zero: + # In case of powers less than 1, number of terms need to be computed + # separately to avoid repeated callings of _eval_nseries with wrong n + try: + _, exp = z.leadterm(x) + except (ValueError, NotImplementedError): + return self + + if exp.is_positive: + newn = ceiling(n/exp) + o = Order(x**n, x) + r = z._eval_nseries(x, n, logx, cdir).removeO() + if r is S.Zero: + return o + + term = r + s = [term] + for k in range(2, newn): + term *= r + s.append(term/k**nu) + return Add(*s) + o + + return super(polylog, self)._eval_nseries(x, n, logx, cdir) + +############################################################################### +###################### HURWITZ GENERALIZED ZETA FUNCTION ###################### +############################################################################### + + +class zeta(DefinedFunction): + r""" + Hurwitz zeta function (or Riemann zeta function). + + Explanation + =========== + + For $\operatorname{Re}(a) > 0$ and $\operatorname{Re}(s) > 1$, this + function is defined as + + .. math:: \zeta(s, a) = \sum_{n=0}^\infty \frac{1}{(n + a)^s}, + + where the standard choice of argument for $n + a$ is used. 
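# --- Editorial sketch (not part of the vendored sympy sources) --------------
# The defining sum above immediately gives the shift identity
#     zeta(s, a) == zeta(s, a + 1) + a**(-s),
# which can be spot-checked numerically:
from sympy import Rational, zeta

s, a = 3, Rational(5, 4)
assert abs((zeta(s, a) - zeta(s, a + 1) - a**(-s)).evalf()) < 1e-10
# -----------------------------------------------------------------------------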
For fixed + $a$ not a nonpositive integer the Hurwitz zeta function admits a + meromorphic continuation to all of $\mathbb{C}$; it is an unbranched + function with a simple pole at $s = 1$. + + The Hurwitz zeta function is a special case of the Lerch transcendent: + + .. math:: \zeta(s, a) = \Phi(1, s, a). + + This formula defines an analytic continuation for all possible values of + $s$ and $a$ (also $\operatorname{Re}(a) < 0$), see the documentation of + :class:`lerchphi` for a description of the branching behavior. + + If no value is passed for $a$ a default value of $a = 1$ is assumed, + yielding the Riemann zeta function. + + Examples + ======== + + For $a = 1$ the Hurwitz zeta function reduces to the famous Riemann + zeta function: + + .. math:: \zeta(s, 1) = \zeta(s) = \sum_{n=1}^\infty \frac{1}{n^s}. + + >>> from sympy import zeta + >>> from sympy.abc import s + >>> zeta(s, 1) + zeta(s) + >>> zeta(s) + zeta(s) + + The Riemann zeta function can also be expressed using the Dirichlet eta + function: + + >>> from sympy import dirichlet_eta + >>> zeta(s).rewrite(dirichlet_eta) + dirichlet_eta(s)/(1 - 2**(1 - s)) + + The Riemann zeta function at nonnegative even and negative integer + values is related to the Bernoulli numbers and polynomials: + + >>> zeta(2) + pi**2/6 + >>> zeta(4) + pi**4/90 + >>> zeta(0) + -1/2 + >>> zeta(-1) + -1/12 + >>> zeta(-4) + 0 + + The specific formulae are: + + .. math:: \zeta(2n) = -\frac{(2\pi i)^{2n} B_{2n}}{2(2n)!} + .. math:: \zeta(-n,a) = -\frac{B_{n+1}(a)}{n+1} + + No closed-form expressions are known at positive odd integers, but + numerical evaluation is possible: + + >>> zeta(3).n() + 1.20205690315959 + + The derivative of $\zeta(s, a)$ with respect to $a$ can be computed: + + >>> from sympy.abc import a + >>> zeta(s, a).diff(a) + -s*zeta(s + 1, a) + + However the derivative with respect to $s$ has no useful closed form + expression: + + >>> zeta(s, a).diff(s) + Derivative(zeta(s, a), s) + + The Hurwitz zeta function can be expressed in terms of the Lerch + transcendent, :class:`~.lerchphi`: + + >>> from sympy import lerchphi + >>> zeta(s, a).rewrite(lerchphi) + lerchphi(1, s, a) + + See Also + ======== + + dirichlet_eta, lerchphi, polylog + + References + ========== + + .. [1] https://dlmf.nist.gov/25.11 + .. 
[2] https://en.wikipedia.org/wiki/Hurwitz_zeta_function + + """ + + @classmethod + def eval(cls, s, a=None): + if a is S.One: + return cls(s) + elif s is S.NaN or a is S.NaN: + return S.NaN + elif s is S.One: + return S.ComplexInfinity + elif s is S.Infinity: + return S.One + elif a is S.Infinity: + return S.Zero + + sint = s.is_Integer + if a is None: + a = S.One + if sint and s.is_nonpositive: + return bernoulli(1-s, a) / (s-1) + elif a is S.One: + if sint and s.is_even: + return -(2*pi*I)**s * bernoulli(s) / (2*factorial(s)) + elif sint and a.is_Integer and a.is_positive: + return cls(s) - harmonic(a-1, s) + elif a.is_Integer and a.is_nonpositive and \ + (s.is_integer is False or s.is_nonpositive is False): + return S.NaN + + def _eval_rewrite_as_bernoulli(self, s, a=1, **kwargs): + if a == 1 and s.is_integer and s.is_nonnegative and s.is_even: + return -(2*pi*I)**s * bernoulli(s) / (2*factorial(s)) + return bernoulli(1-s, a) / (s-1) + + def _eval_rewrite_as_dirichlet_eta(self, s, a=1, **kwargs): + if a != 1: + return self + s = self.args[0] + return dirichlet_eta(s)/(1 - 2**(1 - s)) + + def _eval_rewrite_as_lerchphi(self, s, a=1, **kwargs): + return lerchphi(1, s, a) + + def _eval_is_finite(self): + return fuzzy_not((self.args[0] - 1).is_zero) + + def _eval_expand_func(self, **hints): + s = self.args[0] + a = self.args[1] if len(self.args) > 1 else S.One + if a.is_integer: + if a.is_positive: + return zeta(s) - harmonic(a-1, s) + if a.is_nonpositive and (s.is_integer is False or + s.is_nonpositive is False): + return S.NaN + return self + + def fdiff(self, argindex=1): + if len(self.args) == 2: + s, a = self.args + else: + s, a = self.args + (1,) + if argindex == 2: + return -s*zeta(s + 1, a) + else: + raise ArgumentIndexError + + def _eval_as_leading_term(self, x, logx, cdir): + if len(self.args) == 2: + s, a = self.args + else: + s, a = self.args + (S.One,) + + try: + c, e = a.leadterm(x) + except NotImplementedError: + return self + + if e.is_negative and not s.is_positive: + raise NotImplementedError + + return super(zeta, self)._eval_as_leading_term(x, logx=logx, cdir=cdir) + + +class dirichlet_eta(DefinedFunction): + r""" + Dirichlet eta function. + + Explanation + =========== + + For $\operatorname{Re}(s) > 0$ and $0 < x \le 1$, this function is defined as + + .. math:: \eta(s, a) = \sum_{n=0}^\infty \frac{(-1)^n}{(n+a)^s}. + + It admits a unique analytic continuation to all of $\mathbb{C}$ for any + fixed $a$ not a nonpositive integer. It is an entire, unbranched function. + + It can be expressed using the Hurwitz zeta function as + + .. math:: \eta(s, a) = \zeta(s,a) - 2^{1-s} \zeta\left(s, \frac{a+1}{2}\right) + + and using the generalized Genocchi function as + + .. math:: \eta(s, a) = \frac{G(1-s, a)}{2(s-1)}. + + In both cases the limiting value of $\log2 - \psi(a) + \psi\left(\frac{a+1}{2}\right)$ + is used when $s = 1$. + + Examples + ======== + + >>> from sympy import dirichlet_eta, zeta + >>> from sympy.abc import s + >>> dirichlet_eta(s).rewrite(zeta) + Piecewise((log(2), Eq(s, 1)), ((1 - 2**(1 - s))*zeta(s), True)) + + See Also + ======== + + zeta + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Dirichlet_eta_function + .. 
[2] Peter Luschny, "An introduction to the Bernoulli function", + https://arxiv.org/abs/2009.06743 + + """ + + @classmethod + def eval(cls, s, a=None): + if a is S.One: + return cls(s) + if a is None: + if s == 1: + return log(2) + z = zeta(s) + if not z.has(zeta): + return (1 - 2**(1-s)) * z + return + elif s == 1: + from sympy.functions.special.gamma_functions import digamma + return log(2) - digamma(a) + digamma((a+1)/2) + z1 = zeta(s, a) + z2 = zeta(s, (a+1)/2) + if not z1.has(zeta) and not z2.has(zeta): + return z1 - 2**(1-s) * z2 + + def _eval_rewrite_as_zeta(self, s, a=1, **kwargs): + from sympy.functions.special.gamma_functions import digamma + if a == 1: + return Piecewise((log(2), Eq(s, 1)), ((1 - 2**(1-s)) * zeta(s), True)) + return Piecewise((log(2) - digamma(a) + digamma((a+1)/2), Eq(s, 1)), + (zeta(s, a) - 2**(1-s) * zeta(s, (a+1)/2), True)) + + def _eval_rewrite_as_genocchi(self, s, a=S.One, **kwargs): + from sympy.functions.special.gamma_functions import digamma + return Piecewise((log(2) - digamma(a) + digamma((a+1)/2), Eq(s, 1)), + (genocchi(1-s, a) / (2 * (s-1)), True)) + + def _eval_evalf(self, prec): + if all(i.is_number for i in self.args): + return self.rewrite(zeta)._eval_evalf(prec) + + +class riemann_xi(DefinedFunction): + r""" + Riemann Xi function. + + Examples + ======== + + The Riemann Xi function is closely related to the Riemann zeta function. + The zeros of Riemann Xi function are precisely the non-trivial zeros + of the zeta function. + + >>> from sympy import riemann_xi, zeta + >>> from sympy.abc import s + >>> riemann_xi(s).rewrite(zeta) + s*(s - 1)*gamma(s/2)*zeta(s)/(2*pi**(s/2)) + + References + ========== + + .. [1] https://en.wikipedia.org/wiki/Riemann_Xi_function + + """ + + + @classmethod + def eval(cls, s): + from sympy.functions.special.gamma_functions import gamma + z = zeta(s) + if s in (S.Zero, S.One): + return S.Half + + if not isinstance(z, zeta): + return s*(s - 1)*gamma(s/2)*z/(2*pi**(s/2)) + + def _eval_rewrite_as_zeta(self, s, **kwargs): + from sympy.functions.special.gamma_functions import gamma + return s*(s - 1)*gamma(s/2)*zeta(s)/(2*pi**(s/2)) + + +class stieltjes(DefinedFunction): + r""" + Represents Stieltjes constants, $\gamma_{k}$ that occur in + Laurent Series expansion of the Riemann zeta function. + + Examples + ======== + + >>> from sympy import stieltjes + >>> from sympy.abc import n, m + >>> stieltjes(n) + stieltjes(n) + + The zero'th stieltjes constant: + + >>> stieltjes(0) + EulerGamma + >>> stieltjes(0, 1) + EulerGamma + + For generalized stieltjes constants: + + >>> stieltjes(n, m) + stieltjes(n, m) + + Constants are only defined for integers >= 0: + + >>> stieltjes(-1) + zoo + + References + ========== + + .. 
[1] https://en.wikipedia.org/wiki/Stieltjes_constants + + """ + + @classmethod + def eval(cls, n, a=None): + if a is not None: + a = sympify(a) + if a is S.NaN: + return S.NaN + if a.is_Integer and a.is_nonpositive: + return S.ComplexInfinity + + if n.is_Number: + if n is S.NaN: + return S.NaN + elif n < 0: + return S.ComplexInfinity + elif not n.is_Integer: + return S.ComplexInfinity + elif n is S.Zero and a in [None, 1]: + return S.EulerGamma + + if n.is_extended_negative: + return S.ComplexInfinity + + if n.is_zero and a in [None, 1]: + return S.EulerGamma + + if n.is_integer == False: + return S.ComplexInfinity + + +@cacheit +def _dilogtable(): + return { + S.Half: pi**2/12 - log(2)**2/2, + Integer(2) : pi**2/4 - I*pi*log(2), + -(sqrt(5) - 1)/2 : -pi**2/15 + log((sqrt(5)-1)/2)**2/2, + -(sqrt(5) + 1)/2 : -pi**2/10 - log((sqrt(5)+1)/2)**2, + (3 - sqrt(5))/2 : pi**2/15 - log((sqrt(5)-1)/2)**2, + (sqrt(5) - 1)/2 : pi**2/10 - log((sqrt(5)-1)/2)**2, + I : I*S.Catalan - pi**2/48, + -I : -I*S.Catalan - pi**2/48, + 1 - I : pi**2/16 - I*S.Catalan - pi*I/4*log(2), + 1 + I : pi**2/16 + I*S.Catalan + pi*I/4*log(2), + (1 - I)/2 : -log(2)**2/8 + pi*I*log(2)/8 + 5*pi**2/96 - I*S.Catalan + } diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5d25063d06d22c4c4d412417b719f1e320a876aa Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/__pycache__/__init__.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/__pycache__/experimental_lambdify.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/__pycache__/experimental_lambdify.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bcba551cd090182651dbc46cc1667d914b01157c Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/__pycache__/experimental_lambdify.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/__pycache__/plot.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/__pycache__/plot.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e355a49bedd08b009be48ce2cb7bb77ed705fc90 Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/__pycache__/plot.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/__pycache__/plot_implicit.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/__pycache__/plot_implicit.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6f27f8a89811c2478ae77f18e95585c67f5e2e2e Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/__pycache__/plot_implicit.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/__pycache__/plotgrid.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/__pycache__/plotgrid.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5dfee24797c1279293bdc48423d9be24ee115aa4 Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/__pycache__/plotgrid.cpython-312.pyc differ diff --git 
a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/__pycache__/textplot.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/__pycache__/textplot.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f69d4aaa44ac9a9caaa6e48b9107bc75062f9ca7 Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/__pycache__/textplot.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/__pycache__/utils.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/__pycache__/utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6eb5692abf5c1311f45707030c2557dec20968b5 Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/__pycache__/utils.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/backends/textbackend/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/backends/textbackend/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ca4685e4b7790653a97b712c27b240ade5bb481a --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/backends/textbackend/__init__.py @@ -0,0 +1,3 @@ +from sympy.plotting.backends.textbackend.text import TextBackend + +__all__ = ["TextBackend"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/backends/textbackend/text.py b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/backends/textbackend/text.py new file mode 100644 index 0000000000000000000000000000000000000000..0917ec78b3463a929c373c98fdd279d84ce4c9e5 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/backends/textbackend/text.py @@ -0,0 +1,24 @@ +import sympy.plotting.backends.base_backend as base_backend +from sympy.plotting.series import LineOver1DRangeSeries +from sympy.plotting.textplot import textplot + + +class TextBackend(base_backend.Plot): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def show(self): + if not base_backend._show: + return + if len(self._series) != 1: + raise ValueError( + 'The TextBackend supports only one graph per Plot.') + elif not isinstance(self._series[0], LineOver1DRangeSeries): + raise ValueError( + 'The TextBackend supports only expressions over a 1D range') + else: + ser = self._series[0] + textplot(ser.expr, ser.start, ser.end) + + def close(self): + pass diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/intervalmath/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/intervalmath/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..fb9a6a57f94e931f0c5f5b3dda7b0b6fd31841f4 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/intervalmath/__init__.py @@ -0,0 +1,12 @@ +from .interval_arithmetic import interval +from .lib_interval import (Abs, exp, log, log10, sin, cos, tan, sqrt, + imin, imax, sinh, cosh, tanh, acosh, asinh, atanh, + asin, acos, atan, ceil, floor, And, Or) + +__all__ = [ + 'interval', + + 'Abs', 'exp', 'log', 'log10', 'sin', 'cos', 'tan', 'sqrt', 'imin', 'imax', + 'sinh', 'cosh', 'tanh', 'acosh', 'asinh', 'atanh', 'asin', 'acos', 'atan', + 'ceil', 'floor', 'And', 'Or', +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/intervalmath/interval_arithmetic.py 
b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/intervalmath/interval_arithmetic.py new file mode 100644 index 0000000000000000000000000000000000000000..fc5c0e2ef118c7cf4f80de53a3590de11130410e --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/intervalmath/interval_arithmetic.py @@ -0,0 +1,413 @@ +""" +Interval Arithmetic for plotting. +This module does not implement interval arithmetic accurately and +hence cannot be used for purposes other than plotting. If you want +to use interval arithmetic, use mpmath's interval arithmetic. + +The module implements interval arithmetic using numpy and +python floating points. The rounding up and down is not handled +and hence this is not an accurate implementation of interval +arithmetic. + +The module uses numpy for speed which cannot be achieved with mpmath. +""" + +# Q: Why use numpy? Why not simply use mpmath's interval arithmetic? +# A: mpmath's interval arithmetic simulates a floating point unit +# and hence is slow, while numpy evaluations are orders of magnitude +# faster. + +# Q: Why create a separate class for intervals? Why not use SymPy's +# Interval Sets? +# A: The functionalities that will be required for plotting is quite +# different from what Interval Sets implement. + +# Q: Why is rounding up and down according to IEEE754 not handled? +# A: It is not possible to do it in both numpy and python. An external +# library has to used, which defeats the whole purpose i.e., speed. Also +# rounding is handled for very few functions in those libraries. + +# Q Will my plots be affected? +# A It will not affect most of the plots. The interval arithmetic +# module based suffers the same problems as that of floating point +# arithmetic. + +from sympy.core.numbers import int_valued +from sympy.core.logic import fuzzy_and +from sympy.simplify.simplify import nsimplify + +from .interval_membership import intervalMembership + + +class interval: + """ Represents an interval containing floating points as start and + end of the interval + The is_valid variable tracks whether the interval obtained as the + result of the function is in the domain and is continuous. + - True: Represents the interval result of a function is continuous and + in the domain of the function. + - False: The interval argument of the function was not in the domain of + the function, hence the is_valid of the result interval is False + - None: The function was not continuous over the interval or + the function's argument interval is partly in the domain of the + function + + A comparison between an interval and a real number, or a + comparison between two intervals may return ``intervalMembership`` + of two 3-valued logic values. 
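# --- Editorial sketch (not part of the vendored sympy sources) --------------
# Minimal usage of the interval class described above: arithmetic widens the
# bounds conservatively, and comparisons return three-valued
# intervalMembership results.
from sympy.plotting.intervalmath import interval

a = interval(1, 2)
b = interval(-1, 1)
print(a + b)   # [0.000000, 3.000000]
print(a * b)   # [-2.000000, 2.000000]
print(a > 0)   # intervalMembership(True, True)
print(b > 0)   # intervalMembership(None, True) -- undecidable: 0 lies inside b
# -----------------------------------------------------------------------------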
+ """ + + def __init__(self, *args, is_valid=True, **kwargs): + self.is_valid = is_valid + if len(args) == 1: + if isinstance(args[0], interval): + self.start, self.end = args[0].start, args[0].end + else: + self.start = float(args[0]) + self.end = float(args[0]) + elif len(args) == 2: + if args[0] < args[1]: + self.start = float(args[0]) + self.end = float(args[1]) + else: + self.start = float(args[1]) + self.end = float(args[0]) + + else: + raise ValueError("interval takes a maximum of two float values " + "as arguments") + + @property + def mid(self): + return (self.start + self.end) / 2.0 + + @property + def width(self): + return self.end - self.start + + def __repr__(self): + return "interval(%f, %f)" % (self.start, self.end) + + def __str__(self): + return "[%f, %f]" % (self.start, self.end) + + def __lt__(self, other): + if isinstance(other, (int, float)): + if self.end < other: + return intervalMembership(True, self.is_valid) + elif self.start > other: + return intervalMembership(False, self.is_valid) + else: + return intervalMembership(None, self.is_valid) + + elif isinstance(other, interval): + valid = fuzzy_and([self.is_valid, other.is_valid]) + if self.end < other. start: + return intervalMembership(True, valid) + if self.start > other.end: + return intervalMembership(False, valid) + return intervalMembership(None, valid) + else: + return NotImplemented + + def __gt__(self, other): + if isinstance(other, (int, float)): + if self.start > other: + return intervalMembership(True, self.is_valid) + elif self.end < other: + return intervalMembership(False, self.is_valid) + else: + return intervalMembership(None, self.is_valid) + elif isinstance(other, interval): + return other.__lt__(self) + else: + return NotImplemented + + def __eq__(self, other): + if isinstance(other, (int, float)): + if self.start == other and self.end == other: + return intervalMembership(True, self.is_valid) + if other in self: + return intervalMembership(None, self.is_valid) + else: + return intervalMembership(False, self.is_valid) + + if isinstance(other, interval): + valid = fuzzy_and([self.is_valid, other.is_valid]) + if self.start == other.start and self.end == other.end: + return intervalMembership(True, valid) + elif self.__lt__(other)[0] is not None: + return intervalMembership(False, valid) + else: + return intervalMembership(None, valid) + else: + return NotImplemented + + def __ne__(self, other): + if isinstance(other, (int, float)): + if self.start == other and self.end == other: + return intervalMembership(False, self.is_valid) + if other in self: + return intervalMembership(None, self.is_valid) + else: + return intervalMembership(True, self.is_valid) + + if isinstance(other, interval): + valid = fuzzy_and([self.is_valid, other.is_valid]) + if self.start == other.start and self.end == other.end: + return intervalMembership(False, valid) + if not self.__lt__(other)[0] is None: + return intervalMembership(True, valid) + return intervalMembership(None, valid) + else: + return NotImplemented + + def __le__(self, other): + if isinstance(other, (int, float)): + if self.end <= other: + return intervalMembership(True, self.is_valid) + if self.start > other: + return intervalMembership(False, self.is_valid) + else: + return intervalMembership(None, self.is_valid) + + if isinstance(other, interval): + valid = fuzzy_and([self.is_valid, other.is_valid]) + if self.end <= other.start: + return intervalMembership(True, valid) + if self.start > other.end: + return intervalMembership(False, valid) + return 
intervalMembership(None, valid) + else: + return NotImplemented + + def __ge__(self, other): + if isinstance(other, (int, float)): + if self.start >= other: + return intervalMembership(True, self.is_valid) + elif self.end < other: + return intervalMembership(False, self.is_valid) + else: + return intervalMembership(None, self.is_valid) + elif isinstance(other, interval): + return other.__le__(self) + + def __add__(self, other): + if isinstance(other, (int, float)): + if self.is_valid: + return interval(self.start + other, self.end + other) + else: + start = self.start + other + end = self.end + other + return interval(start, end, is_valid=self.is_valid) + + elif isinstance(other, interval): + start = self.start + other.start + end = self.end + other.end + valid = fuzzy_and([self.is_valid, other.is_valid]) + return interval(start, end, is_valid=valid) + else: + return NotImplemented + + __radd__ = __add__ + + def __sub__(self, other): + if isinstance(other, (int, float)): + start = self.start - other + end = self.end - other + return interval(start, end, is_valid=self.is_valid) + + elif isinstance(other, interval): + start = self.start - other.end + end = self.end - other.start + valid = fuzzy_and([self.is_valid, other.is_valid]) + return interval(start, end, is_valid=valid) + else: + return NotImplemented + + def __rsub__(self, other): + if isinstance(other, (int, float)): + start = other - self.end + end = other - self.start + return interval(start, end, is_valid=self.is_valid) + elif isinstance(other, interval): + return other.__sub__(self) + else: + return NotImplemented + + def __neg__(self): + if self.is_valid: + return interval(-self.end, -self.start) + else: + return interval(-self.end, -self.start, is_valid=self.is_valid) + + def __mul__(self, other): + if isinstance(other, interval): + if self.is_valid is False or other.is_valid is False: + return interval(-float('inf'), float('inf'), is_valid=False) + elif self.is_valid is None or other.is_valid is None: + return interval(-float('inf'), float('inf'), is_valid=None) + else: + inters = [] + inters.append(self.start * other.start) + inters.append(self.end * other.start) + inters.append(self.start * other.end) + inters.append(self.end * other.end) + start = min(inters) + end = max(inters) + return interval(start, end) + elif isinstance(other, (int, float)): + return interval(self.start*other, self.end*other, is_valid=self.is_valid) + else: + return NotImplemented + + __rmul__ = __mul__ + + def __contains__(self, other): + if isinstance(other, (int, float)): + return self.start <= other and self.end >= other + else: + return self.start <= other.start and other.end <= self.end + + def __rtruediv__(self, other): + if isinstance(other, (int, float)): + other = interval(other) + return other.__truediv__(self) + elif isinstance(other, interval): + return other.__truediv__(self) + else: + return NotImplemented + + def __truediv__(self, other): + # Both None and False are handled + if not self.is_valid: + # Don't divide as the value is not valid + return interval(-float('inf'), float('inf'), is_valid=self.is_valid) + if isinstance(other, (int, float)): + if other == 0: + # Divide by zero encountered. 
valid nowhere + return interval(-float('inf'), float('inf'), is_valid=False) + else: + return interval(self.start / other, self.end / other) + + elif isinstance(other, interval): + if other.is_valid is False or self.is_valid is False: + return interval(-float('inf'), float('inf'), is_valid=False) + elif other.is_valid is None or self.is_valid is None: + return interval(-float('inf'), float('inf'), is_valid=None) + else: + # denominator contains both signs, i.e. being divided by zero + # return the whole real line with is_valid = None + if 0 in other: + return interval(-float('inf'), float('inf'), is_valid=None) + + # denominator negative + this = self + if other.end < 0: + this = -this + other = -other + + # denominator positive + inters = [] + inters.append(this.start / other.start) + inters.append(this.end / other.start) + inters.append(this.start / other.end) + inters.append(this.end / other.end) + start = max(inters) + end = min(inters) + return interval(start, end) + else: + return NotImplemented + + def __pow__(self, other): + # Implements only power to an integer. + from .lib_interval import exp, log + if not self.is_valid: + return self + if isinstance(other, interval): + return exp(other * log(self)) + elif isinstance(other, (float, int)): + if other < 0: + return 1 / self.__pow__(abs(other)) + else: + if int_valued(other): + return _pow_int(self, other) + else: + return _pow_float(self, other) + else: + return NotImplemented + + def __rpow__(self, other): + if isinstance(other, (float, int)): + if not self.is_valid: + #Don't do anything + return self + elif other < 0: + if self.width > 0: + return interval(-float('inf'), float('inf'), is_valid=False) + else: + power_rational = nsimplify(self.start) + num, denom = power_rational.as_numer_denom() + if denom % 2 == 0: + return interval(-float('inf'), float('inf'), + is_valid=False) + else: + start = -abs(other)**self.start + end = start + return interval(start, end) + else: + return interval(other**self.start, other**self.end) + elif isinstance(other, interval): + return other.__pow__(self) + else: + return NotImplemented + + def __hash__(self): + return hash((self.is_valid, self.start, self.end)) + + +def _pow_float(inter, power): + """Evaluates an interval raised to a floating point.""" + power_rational = nsimplify(power) + num, denom = power_rational.as_numer_denom() + if num % 2 == 0: + start = abs(inter.start)**power + end = abs(inter.end)**power + if start < 0: + ret = interval(0, max(start, end)) + else: + ret = interval(start, end) + return ret + elif denom % 2 == 0: + if inter.end < 0: + return interval(-float('inf'), float('inf'), is_valid=False) + elif inter.start < 0: + return interval(0, inter.end**power, is_valid=None) + else: + return interval(inter.start**power, inter.end**power) + else: + if inter.start < 0: + start = -abs(inter.start)**power + else: + start = inter.start**power + + if inter.end < 0: + end = -abs(inter.end)**power + else: + end = inter.end**power + + return interval(start, end, is_valid=inter.is_valid) + + +def _pow_int(inter, power): + """Evaluates an interval raised to an integer power""" + power = int(power) + if power & 1: + return interval(inter.start**power, inter.end**power) + else: + if inter.start < 0 and inter.end > 0: + start = 0 + end = max(inter.start**power, inter.end**power) + return interval(start, end) + else: + return interval(inter.start**power, inter.end**power) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/intervalmath/interval_membership.py 
b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/intervalmath/interval_membership.py new file mode 100644 index 0000000000000000000000000000000000000000..c4887c2d96f0d006b95a8e207a4f4a75940aec23 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/intervalmath/interval_membership.py @@ -0,0 +1,78 @@ +from sympy.core.logic import fuzzy_and, fuzzy_or, fuzzy_not, fuzzy_xor + + +class intervalMembership: + """Represents a boolean expression returned by the comparison of + the interval object. + + Parameters + ========== + + (a, b) : (bool, bool) + The first value determines the comparison as follows: + - True: If the comparison is True throughout the intervals. + - False: If the comparison is False throughout the intervals. + - None: If the comparison is True for some part of the intervals. + + The second value is determined as follows: + - True: If both the intervals in comparison are valid. + - False: If at least one of the intervals is False, else + - None + """ + def __init__(self, a, b): + self._wrapped = (a, b) + + def __getitem__(self, i): + try: + return self._wrapped[i] + except IndexError: + raise IndexError( + "{} must be a valid indexing for the 2-tuple." + .format(i)) + + def __len__(self): + return 2 + + def __iter__(self): + return iter(self._wrapped) + + def __str__(self): + return "intervalMembership({}, {})".format(*self) + __repr__ = __str__ + + def __and__(self, other): + if not isinstance(other, intervalMembership): + raise ValueError( + "The comparison is not supported for {}.".format(other)) + + a1, b1 = self + a2, b2 = other + return intervalMembership(fuzzy_and([a1, a2]), fuzzy_and([b1, b2])) + + def __or__(self, other): + if not isinstance(other, intervalMembership): + raise ValueError( + "The comparison is not supported for {}.".format(other)) + + a1, b1 = self + a2, b2 = other + return intervalMembership(fuzzy_or([a1, a2]), fuzzy_and([b1, b2])) + + def __invert__(self): + a, b = self + return intervalMembership(fuzzy_not(a), b) + + def __xor__(self, other): + if not isinstance(other, intervalMembership): + raise ValueError( + "The comparison is not supported for {}.".format(other)) + + a1, b1 = self + a2, b2 = other + return intervalMembership(fuzzy_xor([a1, a2]), fuzzy_and([b1, b2])) + + def __eq__(self, other): + return self._wrapped == other + + def __ne__(self, other): + return self._wrapped != other diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/intervalmath/lib_interval.py b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/intervalmath/lib_interval.py new file mode 100644 index 0000000000000000000000000000000000000000..7549a05820d747ce057892f8df1fbcbc61cc3f43 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/intervalmath/lib_interval.py @@ -0,0 +1,452 @@ +""" The module contains implemented functions for interval arithmetic.""" +from functools import reduce + +from sympy.plotting.intervalmath import interval +from sympy.external import import_module + + +def Abs(x): + if isinstance(x, (int, float)): + return interval(abs(x)) + elif isinstance(x, interval): + if x.start < 0 and x.end > 0: + return interval(0, max(abs(x.start), abs(x.end)), is_valid=x.is_valid) + else: + return interval(abs(x.start), abs(x.end)) + else: + raise NotImplementedError + +#Monotonic + + +def exp(x): + """evaluates the exponential of an interval""" + np = import_module('numpy') + if isinstance(x, (int, float)): + return interval(np.exp(x), np.exp(x)) + elif isinstance(x, 
interval): + return interval(np.exp(x.start), np.exp(x.end), is_valid=x.is_valid) + else: + raise NotImplementedError + + +#Monotonic +def log(x): + """evaluates the natural logarithm of an interval""" + np = import_module('numpy') + if isinstance(x, (int, float)): + if x <= 0: + return interval(-np.inf, np.inf, is_valid=False) + else: + return interval(np.log(x)) + elif isinstance(x, interval): + if not x.is_valid: + return interval(-np.inf, np.inf, is_valid=x.is_valid) + elif x.end <= 0: + return interval(-np.inf, np.inf, is_valid=False) + elif x.start <= 0: + return interval(-np.inf, np.inf, is_valid=None) + + return interval(np.log(x.start), np.log(x.end)) + else: + raise NotImplementedError + + +#Monotonic +def log10(x): + """evaluates the logarithm to the base 10 of an interval""" + np = import_module('numpy') + if isinstance(x, (int, float)): + if x <= 0: + return interval(-np.inf, np.inf, is_valid=False) + else: + return interval(np.log10(x)) + elif isinstance(x, interval): + if not x.is_valid: + return interval(-np.inf, np.inf, is_valid=x.is_valid) + elif x.end <= 0: + return interval(-np.inf, np.inf, is_valid=False) + elif x.start <= 0: + return interval(-np.inf, np.inf, is_valid=None) + return interval(np.log10(x.start), np.log10(x.end)) + else: + raise NotImplementedError + + +#Monotonic +def atan(x): + """evaluates the tan inverse of an interval""" + np = import_module('numpy') + if isinstance(x, (int, float)): + return interval(np.arctan(x)) + elif isinstance(x, interval): + start = np.arctan(x.start) + end = np.arctan(x.end) + return interval(start, end, is_valid=x.is_valid) + else: + raise NotImplementedError + + +#periodic +def sin(x): + """evaluates the sine of an interval""" + np = import_module('numpy') + if isinstance(x, (int, float)): + return interval(np.sin(x)) + elif isinstance(x, interval): + if not x.is_valid: + return interval(-1, 1, is_valid=x.is_valid) + na, __ = divmod(x.start, np.pi / 2.0) + nb, __ = divmod(x.end, np.pi / 2.0) + start = min(np.sin(x.start), np.sin(x.end)) + end = max(np.sin(x.start), np.sin(x.end)) + if nb - na > 4: + return interval(-1, 1, is_valid=x.is_valid) + elif na == nb: + return interval(start, end, is_valid=x.is_valid) + else: + if (na - 1) // 4 != (nb - 1) // 4: + #sin has max + end = 1 + if (na - 3) // 4 != (nb - 3) // 4: + #sin has min + start = -1 + return interval(start, end) + else: + raise NotImplementedError + + +#periodic +def cos(x): + """Evaluates the cos of an interval""" + np = import_module('numpy') + if isinstance(x, (int, float)): + return interval(np.sin(x)) + elif isinstance(x, interval): + if not (np.isfinite(x.start) and np.isfinite(x.end)): + return interval(-1, 1, is_valid=x.is_valid) + na, __ = divmod(x.start, np.pi / 2.0) + nb, __ = divmod(x.end, np.pi / 2.0) + start = min(np.cos(x.start), np.cos(x.end)) + end = max(np.cos(x.start), np.cos(x.end)) + if nb - na > 4: + #differ more than 2*pi + return interval(-1, 1, is_valid=x.is_valid) + elif na == nb: + #in the same quadarant + return interval(start, end, is_valid=x.is_valid) + else: + if (na) // 4 != (nb) // 4: + #cos has max + end = 1 + if (na - 2) // 4 != (nb - 2) // 4: + #cos has min + start = -1 + return interval(start, end, is_valid=x.is_valid) + else: + raise NotImplementedError + + +def tan(x): + """Evaluates the tan of an interval""" + return sin(x) / cos(x) + + +#Monotonic +def sqrt(x): + """Evaluates the square root of an interval""" + np = import_module('numpy') + if isinstance(x, (int, float)): + if x > 0: + return interval(np.sqrt(x)) + else: + 
return interval(-np.inf, np.inf, is_valid=False) + elif isinstance(x, interval): + #Outside the domain + if x.end < 0: + return interval(-np.inf, np.inf, is_valid=False) + #Partially outside the domain + elif x.start < 0: + return interval(-np.inf, np.inf, is_valid=None) + else: + return interval(np.sqrt(x.start), np.sqrt(x.end), + is_valid=x.is_valid) + else: + raise NotImplementedError + + +def imin(*args): + """Evaluates the minimum of a list of intervals""" + np = import_module('numpy') + if not all(isinstance(arg, (int, float, interval)) for arg in args): + return NotImplementedError + else: + new_args = [a for a in args if isinstance(a, (int, float)) + or a.is_valid] + if len(new_args) == 0: + if all(a.is_valid is False for a in args): + return interval(-np.inf, np.inf, is_valid=False) + else: + return interval(-np.inf, np.inf, is_valid=None) + start_array = [a if isinstance(a, (int, float)) else a.start + for a in new_args] + + end_array = [a if isinstance(a, (int, float)) else a.end + for a in new_args] + return interval(min(start_array), min(end_array)) + + +def imax(*args): + """Evaluates the maximum of a list of intervals""" + np = import_module('numpy') + if not all(isinstance(arg, (int, float, interval)) for arg in args): + return NotImplementedError + else: + new_args = [a for a in args if isinstance(a, (int, float)) + or a.is_valid] + if len(new_args) == 0: + if all(a.is_valid is False for a in args): + return interval(-np.inf, np.inf, is_valid=False) + else: + return interval(-np.inf, np.inf, is_valid=None) + start_array = [a if isinstance(a, (int, float)) else a.start + for a in new_args] + + end_array = [a if isinstance(a, (int, float)) else a.end + for a in new_args] + + return interval(max(start_array), max(end_array)) + + +#Monotonic +def sinh(x): + """Evaluates the hyperbolic sine of an interval""" + np = import_module('numpy') + if isinstance(x, (int, float)): + return interval(np.sinh(x), np.sinh(x)) + elif isinstance(x, interval): + return interval(np.sinh(x.start), np.sinh(x.end), is_valid=x.is_valid) + else: + raise NotImplementedError + + +def cosh(x): + """Evaluates the hyperbolic cos of an interval""" + np = import_module('numpy') + if isinstance(x, (int, float)): + return interval(np.cosh(x), np.cosh(x)) + elif isinstance(x, interval): + #both signs + if x.start < 0 and x.end > 0: + end = max(np.cosh(x.start), np.cosh(x.end)) + return interval(1, end, is_valid=x.is_valid) + else: + #Monotonic + start = np.cosh(x.start) + end = np.cosh(x.end) + return interval(start, end, is_valid=x.is_valid) + else: + raise NotImplementedError + + +#Monotonic +def tanh(x): + """Evaluates the hyperbolic tan of an interval""" + np = import_module('numpy') + if isinstance(x, (int, float)): + return interval(np.tanh(x), np.tanh(x)) + elif isinstance(x, interval): + return interval(np.tanh(x.start), np.tanh(x.end), is_valid=x.is_valid) + else: + raise NotImplementedError + + +def asin(x): + """Evaluates the inverse sine of an interval""" + np = import_module('numpy') + if isinstance(x, (int, float)): + #Outside the domain + if abs(x) > 1: + return interval(-np.inf, np.inf, is_valid=False) + else: + return interval(np.arcsin(x), np.arcsin(x)) + elif isinstance(x, interval): + #Outside the domain + if x.is_valid is False or x.start > 1 or x.end < -1: + return interval(-np.inf, np.inf, is_valid=False) + #Partially outside the domain + elif x.start < -1 or x.end > 1: + return interval(-np.inf, np.inf, is_valid=None) + else: + start = np.arcsin(x.start) + end = np.arcsin(x.end) + 
return interval(start, end, is_valid=x.is_valid) + + +def acos(x): + """Evaluates the inverse cos of an interval""" + np = import_module('numpy') + if isinstance(x, (int, float)): + if abs(x) > 1: + #Outside the domain + return interval(-np.inf, np.inf, is_valid=False) + else: + return interval(np.arccos(x), np.arccos(x)) + elif isinstance(x, interval): + #Outside the domain + if x.is_valid is False or x.start > 1 or x.end < -1: + return interval(-np.inf, np.inf, is_valid=False) + #Partially outside the domain + elif x.start < -1 or x.end > 1: + return interval(-np.inf, np.inf, is_valid=None) + else: + start = np.arccos(x.start) + end = np.arccos(x.end) + return interval(start, end, is_valid=x.is_valid) + + +def ceil(x): + """Evaluates the ceiling of an interval""" + np = import_module('numpy') + if isinstance(x, (int, float)): + return interval(np.ceil(x)) + elif isinstance(x, interval): + if x.is_valid is False: + return interval(-np.inf, np.inf, is_valid=False) + else: + start = np.ceil(x.start) + end = np.ceil(x.end) + #Continuous over the interval + if start == end: + return interval(start, end, is_valid=x.is_valid) + else: + #Not continuous over the interval + return interval(start, end, is_valid=None) + else: + raise NotImplementedError + + +def floor(x): + """Evaluates the floor of an interval""" + np = import_module('numpy') + if isinstance(x, (int, float)): + return interval(np.floor(x)) + elif isinstance(x, interval): + if x.is_valid is False: + return interval(-np.inf, np.inf, is_valid=False) + else: + start = np.floor(x.start) + end = np.floor(x.end) + #continuous over the argument + if start == end: + return interval(start, end, is_valid=x.is_valid) + else: + #not continuous over the interval + return interval(start, end, is_valid=None) + else: + raise NotImplementedError + + +def acosh(x): + """Evaluates the inverse hyperbolic cosine of an interval""" + np = import_module('numpy') + if isinstance(x, (int, float)): + #Outside the domain + if x < 1: + return interval(-np.inf, np.inf, is_valid=False) + else: + return interval(np.arccosh(x)) + elif isinstance(x, interval): + #Outside the domain + if x.end < 1: + return interval(-np.inf, np.inf, is_valid=False) + #Partly outside the domain + elif x.start < 1: + return interval(-np.inf, np.inf, is_valid=None) + else: + start = np.arccosh(x.start) + end = np.arccosh(x.end) + return interval(start, end, is_valid=x.is_valid) + else: + raise NotImplementedError + + +#Monotonic +def asinh(x): + """Evaluates the inverse hyperbolic sine of an interval""" + np = import_module('numpy') + if isinstance(x, (int, float)): + return interval(np.arcsinh(x)) + elif isinstance(x, interval): + start = np.arcsinh(x.start) + end = np.arcsinh(x.end) + return interval(start, end, is_valid=x.is_valid) + else: + raise NotImplementedError + + +def atanh(x): + """Evaluates the inverse hyperbolic tangent of an interval""" + np = import_module('numpy') + if isinstance(x, (int, float)): + #Outside the domain + if abs(x) >= 1: + return interval(-np.inf, np.inf, is_valid=False) + else: + return interval(np.arctanh(x)) + elif isinstance(x, interval): + #outside the domain + if x.is_valid is False or x.start >= 1 or x.end <= -1: + return interval(-np.inf, np.inf, is_valid=False) + #partly outside the domain + elif x.start <= -1 or x.end >= 1: + return interval(-np.inf, np.inf, is_valid=None) + else: + start = np.arctanh(x.start) + end = np.arctanh(x.end) + return interval(start, end, is_valid=x.is_valid) + else: + raise NotImplementedError + + +#Three valued
logic for interval plotting. + +def And(*args): + """Defines the three valued ``And`` behaviour for a 2-tuple of + three valued logic values""" + def reduce_and(cmp_intervala, cmp_intervalb): + if cmp_intervala[0] is False or cmp_intervalb[0] is False: + first = False + elif cmp_intervala[0] is None or cmp_intervalb[0] is None: + first = None + else: + first = True + if cmp_intervala[1] is False or cmp_intervalb[1] is False: + second = False + elif cmp_intervala[1] is None or cmp_intervalb[1] is None: + second = None + else: + second = True + return (first, second) + return reduce(reduce_and, args) + + +def Or(*args): + """Defines the three valued ``Or`` behaviour for a 2-tuple of + three valued logic values""" + def reduce_or(cmp_intervala, cmp_intervalb): + if cmp_intervala[0] is True or cmp_intervalb[0] is True: + first = True + elif cmp_intervala[0] is None or cmp_intervalb[0] is None: + first = None + else: + first = False + + if cmp_intervala[1] is True or cmp_intervalb[1] is True: + second = True + elif cmp_intervala[1] is None or cmp_intervalb[1] is None: + second = None + else: + second = False + return (first, second) + return reduce(reduce_or, args) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/pygletplot/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/pygletplot/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..cd86a505d8c4b8026bd91cde27d441e00223a8bc --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/pygletplot/__init__.py @@ -0,0 +1,138 @@ +"""Plotting module that can plot 2D and 3D functions +""" + +from sympy.utilities.decorator import doctest_depends_on + +@doctest_depends_on(modules=('pyglet',)) +def PygletPlot(*args, **kwargs): + """ + + Plot Examples + ============= + + See examples/advanced/pyglet_plotting.py for many more examples. + + >>> from sympy.plotting.pygletplot import PygletPlot as Plot + >>> from sympy.abc import x, y, z + + >>> Plot(x*y**3-y*x**3) + [0]: -x**3*y + x*y**3, 'mode=cartesian' + + >>> p = Plot() + >>> p[1] = x*y + >>> p[1].color = z, (0.4,0.4,0.9), (0.9,0.4,0.4) + + >>> p = Plot() + >>> p[1] = x**2+y**2 + >>> p[2] = -x**2-y**2 + + + Variable Intervals + ================== + + The basic format is [var, min, max, steps], but the + syntax is flexible and arguments left out are taken + from the defaults for the current coordinate mode: + + >>> Plot(x**2) # implies [x,-5,5,100] + [0]: x**2, 'mode=cartesian' + + >>> Plot(x**2, [], []) # [x,-1,1,40], [y,-1,1,40] + [0]: x**2, 'mode=cartesian' + >>> Plot(x**2-y**2, [100], [100]) # [x,-1,1,100], [y,-1,1,100] + [0]: x**2 - y**2, 'mode=cartesian' + >>> Plot(x**2, [x,-13,13,100]) + [0]: x**2, 'mode=cartesian' + >>> Plot(x**2, [-13,13]) # [x,-13,13,100] + [0]: x**2, 'mode=cartesian' + >>> Plot(x**2, [x,-13,13]) # [x,-13,13,100] + [0]: x**2, 'mode=cartesian' + >>> Plot(1*x, [], [x], mode='cylindrical') + ... # [unbound_theta,0,2*Pi,40], [x,-1,1,20] + [0]: x, 'mode=cartesian' + + + Coordinate Modes + ================ + + Plot supports several curvilinear coordinate modes, and + they independent for each plotted function. You can specify + a coordinate mode explicitly with the 'mode' named argument, + but it can be automatically determined for Cartesian or + parametric plots, and therefore must only be specified for + polar, cylindrical, and spherical modes. 
+ + Specifically, Plot(function arguments) and Plot[n] = + (function arguments) will interpret your arguments as a + Cartesian plot if you provide one function and a parametric + plot if you provide two or three functions. Similarly, the + arguments will be interpreted as a curve if one variable is + used, and a surface if two are used. + + Supported mode names by number of variables: + + 1: parametric, cartesian, polar + 2: parametric, cartesian, cylindrical = polar, spherical + + >>> Plot(1, mode='spherical') + + + Calculator-like Interface + ========================= + + >>> p = Plot(visible=False) + >>> f = x**2 + >>> p[1] = f + >>> p[2] = f.diff(x) + >>> p[3] = f.diff(x).diff(x) + >>> p + [1]: x**2, 'mode=cartesian' + [2]: 2*x, 'mode=cartesian' + [3]: 2, 'mode=cartesian' + >>> p.show() + >>> p.clear() + >>> p + + >>> p[1] = x**2+y**2 + >>> p[1].style = 'solid' + >>> p[2] = -x**2-y**2 + >>> p[2].style = 'wireframe' + >>> p[1].color = z, (0.4,0.4,0.9), (0.9,0.4,0.4) + >>> p[1].style = 'both' + >>> p[2].style = 'both' + >>> p.close() + + + Plot Window Keyboard Controls + ============================= + + Screen Rotation: + X,Y axis Arrow Keys, A,S,D,W, Numpad 4,6,8,2 + Z axis Q,E, Numpad 7,9 + + Model Rotation: + Z axis Z,C, Numpad 1,3 + + Zoom: R,F, PgUp,PgDn, Numpad +,- + + Reset Camera: X, Numpad 5 + + Camera Presets: + XY F1 + XZ F2 + YZ F3 + Perspective F4 + + Sensitivity Modifier: SHIFT + + Axes Toggle: + Visible F5 + Colors F6 + + Close Window: ESCAPE + + ============================= + """ + + from sympy.plotting.pygletplot.plot import PygletPlot + return PygletPlot(*args, **kwargs) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/pygletplot/color_scheme.py b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/pygletplot/color_scheme.py new file mode 100644 index 0000000000000000000000000000000000000000..613e777a7f45f54349c47d272aa6d1c157bcd117 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/pygletplot/color_scheme.py @@ -0,0 +1,336 @@ +from sympy.core.basic import Basic +from sympy.core.symbol import (Symbol, symbols) +from sympy.utilities.lambdify import lambdify +from .util import interpolate, rinterpolate, create_bounds, update_bounds +from sympy.utilities.iterables import sift + + +class ColorGradient: + colors = [0.4, 0.4, 0.4], [0.9, 0.9, 0.9] + intervals = 0.0, 1.0 + + def __init__(self, *args): + if len(args) == 2: + self.colors = list(args) + self.intervals = [0.0, 1.0] + elif len(args) > 0: + if len(args) % 2 != 0: + raise ValueError("len(args) should be even") + self.colors = [args[i] for i in range(1, len(args), 2)] + self.intervals = [args[i] for i in range(0, len(args), 2)] + assert len(self.colors) == len(self.intervals) + + def copy(self): + c = ColorGradient() + c.colors = [e[::] for e in self.colors] + c.intervals = self.intervals[::] + return c + + def _find_interval(self, v): + m = len(self.intervals) + i = 0 + while i < m - 1 and self.intervals[i] <= v: + i += 1 + return i + + def _interpolate_axis(self, axis, v): + i = self._find_interval(v) + v = rinterpolate(self.intervals[i - 1], self.intervals[i], v) + return interpolate(self.colors[i - 1][axis], self.colors[i][axis], v) + + def __call__(self, r, g, b): + c = self._interpolate_axis + return c(0, r), c(1, g), c(2, b) + +default_color_schemes = {} # defined at the bottom of this file + + +class ColorScheme: + + def __init__(self, *args, **kwargs): + self.args = args + self.f, self.gradient = None, ColorGradient() + + if len(args) == 1 and not 
isinstance(args[0], Basic) and callable(args[0]): + self.f = args[0] + elif len(args) == 1 and isinstance(args[0], str): + if args[0] in default_color_schemes: + cs = default_color_schemes[args[0]] + self.f, self.gradient = cs.f, cs.gradient.copy() + else: + self.f = lambdify('x,y,z,u,v', args[0]) + else: + self.f, self.gradient = self._interpret_args(args) + self._test_color_function() + if not isinstance(self.gradient, ColorGradient): + raise ValueError("Color gradient not properly initialized. " + "(Not a ColorGradient instance.)") + + def _interpret_args(self, args): + f, gradient = None, self.gradient + atoms, lists = self._sort_args(args) + s = self._pop_symbol_list(lists) + s = self._fill_in_vars(s) + + # prepare the error message for lambdification failure + f_str = ', '.join(str(fa) for fa in atoms) + s_str = (str(sa) for sa in s) + s_str = ', '.join(sa for sa in s_str if sa.find('unbound') < 0) + f_error = ValueError("Could not interpret arguments " + "%s as functions of %s." % (f_str, s_str)) + + # try to lambdify args + if len(atoms) == 1: + fv = atoms[0] + try: + f = lambdify(s, [fv, fv, fv]) + except TypeError: + raise f_error + + elif len(atoms) == 3: + fr, fg, fb = atoms + try: + f = lambdify(s, [fr, fg, fb]) + except TypeError: + raise f_error + + else: + raise ValueError("A ColorScheme must provide 1 or 3 " + "functions in x, y, z, u, and/or v.") + + # try to intrepret any given color information + if len(lists) == 0: + gargs = [] + + elif len(lists) == 1: + gargs = lists[0] + + elif len(lists) == 2: + try: + (r1, g1, b1), (r2, g2, b2) = lists + except TypeError: + raise ValueError("If two color arguments are given, " + "they must be given in the format " + "(r1, g1, b1), (r2, g2, b2).") + gargs = lists + + elif len(lists) == 3: + try: + (r1, r2), (g1, g2), (b1, b2) = lists + except Exception: + raise ValueError("If three color arguments are given, " + "they must be given in the format " + "(r1, r2), (g1, g2), (b1, b2). To create " + "a multi-step gradient, use the syntax " + "[0, colorStart, step1, color1, ..., 1, " + "colorEnd].") + gargs = [[r1, g1, b1], [r2, g2, b2]] + + else: + raise ValueError("Don't know what to do with collection " + "arguments %s." % (', '.join(str(l) for l in lists))) + + if gargs: + try: + gradient = ColorGradient(*gargs) + except Exception as ex: + raise ValueError(("Could not initialize a gradient " + "with arguments %s. 
Inner " + "exception: %s") % (gargs, str(ex))) + + return f, gradient + + def _pop_symbol_list(self, lists): + symbol_lists = [] + for l in lists: + mark = True + for s in l: + if s is not None and not isinstance(s, Symbol): + mark = False + break + if mark: + lists.remove(l) + symbol_lists.append(l) + if len(symbol_lists) == 1: + return symbol_lists[0] + elif len(symbol_lists) == 0: + return [] + else: + raise ValueError("Only one list of Symbols " + "can be given for a color scheme.") + + def _fill_in_vars(self, args): + defaults = symbols('x,y,z,u,v') + v_error = ValueError("Could not find what to plot.") + if len(args) == 0: + return defaults + if not isinstance(args, (tuple, list)): + raise v_error + if len(args) == 0: + return defaults + for s in args: + if s is not None and not isinstance(s, Symbol): + raise v_error + # when vars are given explicitly, any vars + # not given are marked 'unbound' as to not + # be accidentally used in an expression + vars = [Symbol('unbound%i' % (i)) for i in range(1, 6)] + # interpret as t + if len(args) == 1: + vars[3] = args[0] + # interpret as u,v + elif len(args) == 2: + if args[0] is not None: + vars[3] = args[0] + if args[1] is not None: + vars[4] = args[1] + # interpret as x,y,z + elif len(args) >= 3: + # allow some of x,y,z to be + # left unbound if not given + if args[0] is not None: + vars[0] = args[0] + if args[1] is not None: + vars[1] = args[1] + if args[2] is not None: + vars[2] = args[2] + # interpret the rest as t + if len(args) >= 4: + vars[3] = args[3] + # ...or u,v + if len(args) >= 5: + vars[4] = args[4] + return vars + + def _sort_args(self, args): + lists, atoms = sift(args, + lambda a: isinstance(a, (tuple, list)), binary=True) + return atoms, lists + + def _test_color_function(self): + if not callable(self.f): + raise ValueError("Color function is not callable.") + try: + result = self.f(0, 0, 0, 0, 0) + if len(result) != 3: + raise ValueError("length should be equal to 3") + except TypeError: + raise ValueError("Color function needs to accept x,y,z,u,v, " + "as arguments even if it doesn't use all of them.") + except AssertionError: + raise ValueError("Color function needs to return 3-tuple r,g,b.") + except Exception: + pass # color function probably not valid at 0,0,0,0,0 + + def __call__(self, x, y, z, u, v): + try: + return self.f(x, y, z, u, v) + except Exception: + return None + + def apply_to_curve(self, verts, u_set, set_len=None, inc_pos=None): + """ + Apply this color scheme to a + set of vertices over a single + independent variable u. + """ + bounds = create_bounds() + cverts = [] + if callable(set_len): + set_len(len(u_set)*2) + # calculate f() = r,g,b for each vert + # and find the min and max for r,g,b + for _u in range(len(u_set)): + if verts[_u] is None: + cverts.append(None) + else: + x, y, z = verts[_u] + u, v = u_set[_u], None + c = self(x, y, z, u, v) + if c is not None: + c = list(c) + update_bounds(bounds, c) + cverts.append(c) + if callable(inc_pos): + inc_pos() + # scale and apply gradient + for _u in range(len(u_set)): + if cverts[_u] is not None: + for _c in range(3): + # scale from [f_min, f_max] to [0,1] + cverts[_u][_c] = rinterpolate(bounds[_c][0], bounds[_c][1], + cverts[_u][_c]) + # apply gradient + cverts[_u] = self.gradient(*cverts[_u]) + if callable(inc_pos): + inc_pos() + return cverts + + def apply_to_surface(self, verts, u_set, v_set, set_len=None, inc_pos=None): + """ + Apply this color scheme to a + set of vertices over two + independent variables u and v. 
+ """ + bounds = create_bounds() + cverts = [] + if callable(set_len): + set_len(len(u_set)*len(v_set)*2) + # calculate f() = r,g,b for each vert + # and find the min and max for r,g,b + for _u in range(len(u_set)): + column = [] + for _v in range(len(v_set)): + if verts[_u][_v] is None: + column.append(None) + else: + x, y, z = verts[_u][_v] + u, v = u_set[_u], v_set[_v] + c = self(x, y, z, u, v) + if c is not None: + c = list(c) + update_bounds(bounds, c) + column.append(c) + if callable(inc_pos): + inc_pos() + cverts.append(column) + # scale and apply gradient + for _u in range(len(u_set)): + for _v in range(len(v_set)): + if cverts[_u][_v] is not None: + # scale from [f_min, f_max] to [0,1] + for _c in range(3): + cverts[_u][_v][_c] = rinterpolate(bounds[_c][0], + bounds[_c][1], cverts[_u][_v][_c]) + # apply gradient + cverts[_u][_v] = self.gradient(*cverts[_u][_v]) + if callable(inc_pos): + inc_pos() + return cverts + + def str_base(self): + return ", ".join(str(a) for a in self.args) + + def __repr__(self): + return "%s" % (self.str_base()) + + +x, y, z, t, u, v = symbols('x,y,z,t,u,v') + +default_color_schemes['rainbow'] = ColorScheme(z, y, x) +default_color_schemes['zfade'] = ColorScheme(z, (0.4, 0.4, 0.97), + (0.97, 0.4, 0.4), (None, None, z)) +default_color_schemes['zfade3'] = ColorScheme(z, (None, None, z), + [0.00, (0.2, 0.2, 1.0), + 0.35, (0.2, 0.8, 0.4), + 0.50, (0.3, 0.9, 0.3), + 0.65, (0.4, 0.8, 0.2), + 1.00, (1.0, 0.2, 0.2)]) + +default_color_schemes['zfade4'] = ColorScheme(z, (None, None, z), + [0.0, (0.3, 0.3, 1.0), + 0.30, (0.3, 1.0, 0.3), + 0.55, (0.95, 1.0, 0.2), + 0.65, (1.0, 0.95, 0.2), + 0.85, (1.0, 0.7, 0.2), + 1.0, (1.0, 0.3, 0.2)]) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/pygletplot/managed_window.py b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/pygletplot/managed_window.py new file mode 100644 index 0000000000000000000000000000000000000000..81fa2541b4dd9e13534aabfd2a11bf88c479daf8 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/pygletplot/managed_window.py @@ -0,0 +1,106 @@ +from pyglet.window import Window +from pyglet.clock import Clock + +from threading import Thread, Lock + +gl_lock = Lock() + + +class ManagedWindow(Window): + """ + A pyglet window with an event loop which executes automatically + in a separate thread. Behavior is added by creating a subclass + which overrides setup, update, and/or draw. + """ + fps_limit = 30 + default_win_args = {"width": 600, + "height": 500, + "vsync": False, + "resizable": True} + + def __init__(self, **win_args): + """ + It is best not to override this function in the child + class, unless you need to take additional arguments. + Do any OpenGL initialization calls in setup(). + """ + + # check if this is run from the doctester + if win_args.get('runfromdoctester', False): + return + + self.win_args = dict(self.default_win_args, **win_args) + self.Thread = Thread(target=self.__event_loop__) + self.Thread.start() + + def __event_loop__(self, **win_args): + """ + The event loop thread function. Do not override or call + directly (it is called by __init__). 
+ """ + gl_lock.acquire() + try: + try: + super().__init__(**self.win_args) + self.switch_to() + self.setup() + except Exception as e: + print("Window initialization failed: %s" % (str(e))) + self.has_exit = True + finally: + gl_lock.release() + + clock = Clock() + clock.fps_limit = self.fps_limit + while not self.has_exit: + dt = clock.tick() + gl_lock.acquire() + try: + try: + self.switch_to() + self.dispatch_events() + self.clear() + self.update(dt) + self.draw() + self.flip() + except Exception as e: + print("Uncaught exception in event loop: %s" % str(e)) + self.has_exit = True + finally: + gl_lock.release() + super().close() + + def close(self): + """ + Closes the window. + """ + self.has_exit = True + + def setup(self): + """ + Called once before the event loop begins. + Override this method in a child class. This + is the best place to put things like OpenGL + initialization calls. + """ + pass + + def update(self, dt): + """ + Called before draw during each iteration of + the event loop. dt is the elapsed time in + seconds since the last update. OpenGL rendering + calls are best put in draw() rather than here. + """ + pass + + def draw(self): + """ + Called after update during each iteration of + the event loop. Put OpenGL rendering calls + here. + """ + pass + +if __name__ == '__main__': + ManagedWindow() diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/pygletplot/plot.py b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/pygletplot/plot.py new file mode 100644 index 0000000000000000000000000000000000000000..8c3dd3c8d4ce6c660cc07f93a55029eef98e55a2 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/pygletplot/plot.py @@ -0,0 +1,464 @@ +from threading import RLock + +# it is sufficient to import "pyglet" here once +try: + import pyglet.gl as pgl +except ImportError: + raise ImportError("pyglet is required for plotting.\n " + "visit https://pyglet.org/") + +from sympy.core.numbers import Integer +from sympy.external.gmpy import SYMPY_INTS +from sympy.geometry.entity import GeometryEntity +from sympy.plotting.pygletplot.plot_axes import PlotAxes +from sympy.plotting.pygletplot.plot_mode import PlotMode +from sympy.plotting.pygletplot.plot_object import PlotObject +from sympy.plotting.pygletplot.plot_window import PlotWindow +from sympy.plotting.pygletplot.util import parse_option_string +from sympy.utilities.decorator import doctest_depends_on +from sympy.utilities.iterables import is_sequence + +from time import sleep +from os import getcwd, listdir + +import ctypes + +@doctest_depends_on(modules=('pyglet',)) +class PygletPlot: + """ + Plot Examples + ============= + + See examples/advanced/pyglet_plotting.py for many more examples. 
+ + >>> from sympy.plotting.pygletplot import PygletPlot as Plot + >>> from sympy.abc import x, y, z + + >>> Plot(x*y**3-y*x**3) + [0]: -x**3*y + x*y**3, 'mode=cartesian' + + >>> p = Plot() + >>> p[1] = x*y + >>> p[1].color = z, (0.4,0.4,0.9), (0.9,0.4,0.4) + + >>> p = Plot() + >>> p[1] = x**2+y**2 + >>> p[2] = -x**2-y**2 + + + Variable Intervals + ================== + + The basic format is [var, min, max, steps], but the + syntax is flexible and arguments left out are taken + from the defaults for the current coordinate mode: + + >>> Plot(x**2) # implies [x,-5,5,100] + [0]: x**2, 'mode=cartesian' + >>> Plot(x**2, [], []) # [x,-1,1,40], [y,-1,1,40] + [0]: x**2, 'mode=cartesian' + >>> Plot(x**2-y**2, [100], [100]) # [x,-1,1,100], [y,-1,1,100] + [0]: x**2 - y**2, 'mode=cartesian' + >>> Plot(x**2, [x,-13,13,100]) + [0]: x**2, 'mode=cartesian' + >>> Plot(x**2, [-13,13]) # [x,-13,13,100] + [0]: x**2, 'mode=cartesian' + >>> Plot(x**2, [x,-13,13]) # [x,-13,13,10] + [0]: x**2, 'mode=cartesian' + >>> Plot(1*x, [], [x], mode='cylindrical') + ... # [unbound_theta,0,2*Pi,40], [x,-1,1,20] + [0]: x, 'mode=cartesian' + + + Coordinate Modes + ================ + + Plot supports several curvilinear coordinate modes, and + they independent for each plotted function. You can specify + a coordinate mode explicitly with the 'mode' named argument, + but it can be automatically determined for Cartesian or + parametric plots, and therefore must only be specified for + polar, cylindrical, and spherical modes. + + Specifically, Plot(function arguments) and Plot[n] = + (function arguments) will interpret your arguments as a + Cartesian plot if you provide one function and a parametric + plot if you provide two or three functions. Similarly, the + arguments will be interpreted as a curve if one variable is + used, and a surface if two are used. + + Supported mode names by number of variables: + + 1: parametric, cartesian, polar + 2: parametric, cartesian, cylindrical = polar, spherical + + >>> Plot(1, mode='spherical') + + + Calculator-like Interface + ========================= + + >>> p = Plot(visible=False) + >>> f = x**2 + >>> p[1] = f + >>> p[2] = f.diff(x) + >>> p[3] = f.diff(x).diff(x) + >>> p + [1]: x**2, 'mode=cartesian' + [2]: 2*x, 'mode=cartesian' + [3]: 2, 'mode=cartesian' + >>> p.show() + >>> p.clear() + >>> p + + >>> p[1] = x**2+y**2 + >>> p[1].style = 'solid' + >>> p[2] = -x**2-y**2 + >>> p[2].style = 'wireframe' + >>> p[1].color = z, (0.4,0.4,0.9), (0.9,0.4,0.4) + >>> p[1].style = 'both' + >>> p[2].style = 'both' + >>> p.close() + + + Plot Window Keyboard Controls + ============================= + + Screen Rotation: + X,Y axis Arrow Keys, A,S,D,W, Numpad 4,6,8,2 + Z axis Q,E, Numpad 7,9 + + Model Rotation: + Z axis Z,C, Numpad 1,3 + + Zoom: R,F, PgUp,PgDn, Numpad +,- + + Reset Camera: X, Numpad 5 + + Camera Presets: + XY F1 + XZ F2 + YZ F3 + Perspective F4 + + Sensitivity Modifier: SHIFT + + Axes Toggle: + Visible F5 + Colors F6 + + Close Window: ESCAPE + + ============================= + + """ + + @doctest_depends_on(modules=('pyglet',)) + def __init__(self, *fargs, **win_args): + """ + Positional Arguments + ==================== + + Any given positional arguments are used to + initialize a plot function at index 1. In + other words... + + >>> from sympy.plotting.pygletplot import PygletPlot as Plot + >>> from sympy.abc import x + >>> p = Plot(x**2, visible=False) + + ...is equivalent to... 
+ + >>> p = Plot(visible=False) + >>> p[1] = x**2 + + Note that in earlier versions of the plotting + module, you were able to specify multiple + functions in the initializer. This functionality + has been dropped in favor of better automatic + plot plot_mode detection. + + + Named Arguments + =============== + + axes + An option string of the form + "key1=value1; key2 = value2" which + can use the following options: + + style = ordinate + none OR frame OR box OR ordinate + + stride = 0.25 + val OR (val_x, val_y, val_z) + + overlay = True (draw on top of plot) + True OR False + + colored = False (False uses Black, + True uses colors + R,G,B = X,Y,Z) + True OR False + + label_axes = False (display axis names + at endpoints) + True OR False + + visible = True (show immediately + True OR False + + + The following named arguments are passed as + arguments to window initialization: + + antialiasing = True + True OR False + + ortho = False + True OR False + + invert_mouse_zoom = False + True OR False + + """ + # Register the plot modes + from . import plot_modes # noqa + + self._win_args = win_args + self._window = None + + self._render_lock = RLock() + + self._functions = {} + self._pobjects = [] + self._screenshot = ScreenShot(self) + + axe_options = parse_option_string(win_args.pop('axes', '')) + self.axes = PlotAxes(**axe_options) + self._pobjects.append(self.axes) + + self[0] = fargs + if win_args.get('visible', True): + self.show() + + ## Window Interfaces + + def show(self): + """ + Creates and displays a plot window, or activates it + (gives it focus) if it has already been created. + """ + if self._window and not self._window.has_exit: + self._window.activate() + else: + self._win_args['visible'] = True + self.axes.reset_resources() + + #if hasattr(self, '_doctest_depends_on'): + # self._win_args['runfromdoctester'] = True + + self._window = PlotWindow(self, **self._win_args) + + def close(self): + """ + Closes the plot window. + """ + if self._window: + self._window.close() + + def saveimage(self, outfile=None, format='', size=(600, 500)): + """ + Saves a screen capture of the plot window to an + image file. + + If outfile is given, it can either be a path + or a file object. Otherwise a png image will + be saved to the current working directory. + If the format is omitted, it is determined from + the filename extension. + """ + self._screenshot.save(outfile, format, size) + + ## Function List Interfaces + + def clear(self): + """ + Clears the function list of this plot. + """ + self._render_lock.acquire() + self._functions = {} + self.adjust_all_bounds() + self._render_lock.release() + + def __getitem__(self, i): + """ + Returns the function at position i in the + function list. + """ + return self._functions[i] + + def __setitem__(self, i, args): + """ + Parses and adds a PlotMode to the function + list. + """ + if not (isinstance(i, (SYMPY_INTS, Integer)) and i >= 0): + raise ValueError("Function index must " + "be an integer >= 0.") + + if isinstance(args, PlotObject): + f = args + else: + if (not is_sequence(args)) or isinstance(args, GeometryEntity): + args = [args] + if len(args) == 0: + return # no arguments given + kwargs = {"bounds_callback": self.adjust_all_bounds} + f = PlotMode(*args, **kwargs) + + if f: + self._render_lock.acquire() + self._functions[i] = f + self._render_lock.release() + else: + raise ValueError("Failed to parse '%s'." + % ', '.join(str(a) for a in args)) + + def __delitem__(self, i): + """ + Removes the function in the function list at + position i. 
+ """ + self._render_lock.acquire() + del self._functions[i] + self.adjust_all_bounds() + self._render_lock.release() + + def firstavailableindex(self): + """ + Returns the first unused index in the function list. + """ + i = 0 + self._render_lock.acquire() + while i in self._functions: + i += 1 + self._render_lock.release() + return i + + def append(self, *args): + """ + Parses and adds a PlotMode to the function + list at the first available index. + """ + self.__setitem__(self.firstavailableindex(), args) + + def __len__(self): + """ + Returns the number of functions in the function list. + """ + return len(self._functions) + + def __iter__(self): + """ + Allows iteration of the function list. + """ + return self._functions.itervalues() + + def __repr__(self): + return str(self) + + def __str__(self): + """ + Returns a string containing a new-line separated + list of the functions in the function list. + """ + s = "" + if len(self._functions) == 0: + s += "" + else: + self._render_lock.acquire() + s += "\n".join(["%s[%i]: %s" % ("", i, str(self._functions[i])) + for i in self._functions]) + self._render_lock.release() + return s + + def adjust_all_bounds(self): + self._render_lock.acquire() + self.axes.reset_bounding_box() + for f in self._functions: + self.axes.adjust_bounds(self._functions[f].bounds) + self._render_lock.release() + + def wait_for_calculations(self): + sleep(0) + self._render_lock.acquire() + for f in self._functions: + a = self._functions[f]._get_calculating_verts + b = self._functions[f]._get_calculating_cverts + while a() or b(): + sleep(0) + self._render_lock.release() + +class ScreenShot: + def __init__(self, plot): + self._plot = plot + self.screenshot_requested = False + self.outfile = None + self.format = '' + self.invisibleMode = False + self.flag = 0 + + def __bool__(self): + return self.screenshot_requested + + def _execute_saving(self): + if self.flag < 3: + self.flag += 1 + return + + size_x, size_y = self._plot._window.get_size() + size = size_x*size_y*4*ctypes.sizeof(ctypes.c_ubyte) + image = ctypes.create_string_buffer(size) + pgl.glReadPixels(0, 0, size_x, size_y, pgl.GL_RGBA, pgl.GL_UNSIGNED_BYTE, image) + from PIL import Image + im = Image.frombuffer('RGBA', (size_x, size_y), + image.raw, 'raw', 'RGBA', 0, 1) + im.transpose(Image.FLIP_TOP_BOTTOM).save(self.outfile, self.format) + + self.flag = 0 + self.screenshot_requested = False + if self.invisibleMode: + self._plot._window.close() + + def save(self, outfile=None, format='', size=(600, 500)): + self.outfile = outfile + self.format = format + self.size = size + self.screenshot_requested = True + + if not self._plot._window or self._plot._window.has_exit: + self._plot._win_args['visible'] = False + + self._plot._win_args['width'] = size[0] + self._plot._win_args['height'] = size[1] + + self._plot.axes.reset_resources() + self._plot._window = PlotWindow(self._plot, **self._plot._win_args) + self.invisibleMode = True + + if self.outfile is None: + self.outfile = self._create_unique_path() + print(self.outfile) + + def _create_unique_path(self): + cwd = getcwd() + l = listdir(cwd) + path = '' + i = 0 + while True: + if not 'plot_%s.png' % i in l: + path = cwd + '/plot_%s.png' % i + break + i += 1 + return path diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/pygletplot/plot_axes.py b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/pygletplot/plot_axes.py new file mode 100644 index 0000000000000000000000000000000000000000..ae26fb0b2fa64e7f7318c51ce3fe5afaa276b48e --- 
/dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/pygletplot/plot_axes.py @@ -0,0 +1,251 @@ +import pyglet.gl as pgl +from pyglet import font + +from sympy.core import S +from sympy.plotting.pygletplot.plot_object import PlotObject +from sympy.plotting.pygletplot.util import billboard_matrix, dot_product, \ + get_direction_vectors, strided_range, vec_mag, vec_sub +from sympy.utilities.iterables import is_sequence + + +class PlotAxes(PlotObject): + + def __init__(self, *args, + style='', none=None, frame=None, box=None, ordinate=None, + stride=0.25, + visible='', overlay='', colored='', label_axes='', label_ticks='', + tick_length=0.1, + font_face='Arial', font_size=28, + **kwargs): + # initialize style parameter + style = style.lower() + + # allow alias kwargs to override style kwarg + if none is not None: + style = 'none' + if frame is not None: + style = 'frame' + if box is not None: + style = 'box' + if ordinate is not None: + style = 'ordinate' + + if style in ['', 'ordinate']: + self._render_object = PlotAxesOrdinate(self) + elif style in ['frame', 'box']: + self._render_object = PlotAxesFrame(self) + elif style in ['none']: + self._render_object = None + else: + raise ValueError(("Unrecognized axes style %s.") % (style)) + + # initialize stride parameter + try: + stride = eval(stride) + except TypeError: + pass + if is_sequence(stride): + if len(stride) != 3: + raise ValueError("length should be equal to 3") + self._stride = stride + else: + self._stride = [stride, stride, stride] + self._tick_length = float(tick_length) + + # setup bounding box and ticks + self._origin = [0, 0, 0] + self.reset_bounding_box() + + def flexible_boolean(input, default): + if input in [True, False]: + return input + if input in ('f', 'F', 'false', 'False'): + return False + if input in ('t', 'T', 'true', 'True'): + return True + return default + + # initialize remaining parameters + self.visible = flexible_boolean(visible, True) + self._overlay = flexible_boolean(overlay, True) + self._colored = flexible_boolean(colored, False) + self._label_axes = flexible_boolean(label_axes, False) + self._label_ticks = flexible_boolean(label_ticks, True) + + # setup label font + self.font_face = font_face + self.font_size = font_size + + # this is also used to reinit the + # font on window close/reopen + self.reset_resources() + + def reset_resources(self): + self.label_font = None + + def reset_bounding_box(self): + self._bounding_box = [[None, None], [None, None], [None, None]] + self._axis_ticks = [[], [], []] + + def draw(self): + if self._render_object: + pgl.glPushAttrib(pgl.GL_ENABLE_BIT | pgl.GL_POLYGON_BIT | pgl.GL_DEPTH_BUFFER_BIT) + if self._overlay: + pgl.glDisable(pgl.GL_DEPTH_TEST) + self._render_object.draw() + pgl.glPopAttrib() + + def adjust_bounds(self, child_bounds): + b = self._bounding_box + c = child_bounds + for i in range(3): + if abs(c[i][0]) is S.Infinity or abs(c[i][1]) is S.Infinity: + continue + b[i][0] = c[i][0] if b[i][0] is None else min([b[i][0], c[i][0]]) + b[i][1] = c[i][1] if b[i][1] is None else max([b[i][1], c[i][1]]) + self._bounding_box = b + self._recalculate_axis_ticks(i) + + def _recalculate_axis_ticks(self, axis): + b = self._bounding_box + if b[axis][0] is None or b[axis][1] is None: + self._axis_ticks[axis] = [] + else: + self._axis_ticks[axis] = strided_range(b[axis][0], b[axis][1], + self._stride[axis]) + + def toggle_visible(self): + self.visible = not self.visible + + def toggle_colors(self): + self._colored = not self._colored + + +class
PlotAxesBase(PlotObject): + + def __init__(self, parent_axes): + self._p = parent_axes + + def draw(self): + color = [([0.2, 0.1, 0.3], [0.2, 0.1, 0.3], [0.2, 0.1, 0.3]), + ([0.9, 0.3, 0.5], [0.5, 1.0, 0.5], [0.3, 0.3, 0.9])][self._p._colored] + self.draw_background(color) + self.draw_axis(2, color[2]) + self.draw_axis(1, color[1]) + self.draw_axis(0, color[0]) + + def draw_background(self, color): + pass # optional + + def draw_axis(self, axis, color): + raise NotImplementedError() + + def draw_text(self, text, position, color, scale=1.0): + if len(color) == 3: + color = (color[0], color[1], color[2], 1.0) + + if self._p.label_font is None: + self._p.label_font = font.load(self._p.font_face, + self._p.font_size, + bold=True, italic=False) + + label = font.Text(self._p.label_font, text, + color=color, + valign=font.Text.BASELINE, + halign=font.Text.CENTER) + + pgl.glPushMatrix() + pgl.glTranslatef(*position) + billboard_matrix() + scale_factor = 0.005 * scale + pgl.glScalef(scale_factor, scale_factor, scale_factor) + pgl.glColor4f(0, 0, 0, 0) + label.draw() + pgl.glPopMatrix() + + def draw_line(self, v, color): + o = self._p._origin + pgl.glBegin(pgl.GL_LINES) + pgl.glColor3f(*color) + pgl.glVertex3f(v[0][0] + o[0], v[0][1] + o[1], v[0][2] + o[2]) + pgl.glVertex3f(v[1][0] + o[0], v[1][1] + o[1], v[1][2] + o[2]) + pgl.glEnd() + + +class PlotAxesOrdinate(PlotAxesBase): + + def __init__(self, parent_axes): + super().__init__(parent_axes) + + def draw_axis(self, axis, color): + ticks = self._p._axis_ticks[axis] + radius = self._p._tick_length / 2.0 + if len(ticks) < 2: + return + + # calculate the vector for this axis + axis_lines = [[0, 0, 0], [0, 0, 0]] + axis_lines[0][axis], axis_lines[1][axis] = ticks[0], ticks[-1] + axis_vector = vec_sub(axis_lines[1], axis_lines[0]) + + # calculate angle to the z direction vector + pos_z = get_direction_vectors()[2] + d = abs(dot_product(axis_vector, pos_z)) + d = d / vec_mag(axis_vector) + + # don't draw labels if we're looking down the axis + labels_visible = abs(d - 1.0) > 0.02 + + # draw the ticks and labels + for tick in ticks: + self.draw_tick_line(axis, color, radius, tick, labels_visible) + + # draw the axis line and labels + self.draw_axis_line(axis, color, ticks[0], ticks[-1], labels_visible) + + def draw_axis_line(self, axis, color, a_min, a_max, labels_visible): + axis_line = [[0, 0, 0], [0, 0, 0]] + axis_line[0][axis], axis_line[1][axis] = a_min, a_max + self.draw_line(axis_line, color) + if labels_visible: + self.draw_axis_line_labels(axis, color, axis_line) + + def draw_axis_line_labels(self, axis, color, axis_line): + if not self._p._label_axes: + return + axis_labels = [axis_line[0][::], axis_line[1][::]] + axis_labels[0][axis] -= 0.3 + axis_labels[1][axis] += 0.3 + a_str = ['X', 'Y', 'Z'][axis] + self.draw_text("-" + a_str, axis_labels[0], color) + self.draw_text("+" + a_str, axis_labels[1], color) + + def draw_tick_line(self, axis, color, radius, tick, labels_visible): + tick_axis = {0: 1, 1: 0, 2: 1}[axis] + tick_line = [[0, 0, 0], [0, 0, 0]] + tick_line[0][axis] = tick_line[1][axis] = tick + tick_line[0][tick_axis], tick_line[1][tick_axis] = -radius, radius + self.draw_line(tick_line, color) + if labels_visible: + self.draw_tick_line_label(axis, color, radius, tick) + + def draw_tick_line_label(self, axis, color, radius, tick): + if not self._p._label_axes: + return + tick_label_vector = [0, 0, 0] + tick_label_vector[axis] = tick + tick_label_vector[{0: 1, 1: 0, 2: 1}[axis]] = [-1, 1, 1][ + axis] * radius * 3.5 + 
self.draw_text(str(tick), tick_label_vector, color, scale=0.5) + + +class PlotAxesFrame(PlotAxesBase): + + def __init__(self, parent_axes): + super().__init__(parent_axes) + + def draw_background(self, color): + pass + + def draw_axis(self, axis, color): + raise NotImplementedError() diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/pygletplot/plot_camera.py b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/pygletplot/plot_camera.py new file mode 100644 index 0000000000000000000000000000000000000000..43598debac252ffd22beb8690fef30745259c634 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/pygletplot/plot_camera.py @@ -0,0 +1,124 @@ +import pyglet.gl as pgl +from sympy.plotting.pygletplot.plot_rotation import get_spherical_rotatation +from sympy.plotting.pygletplot.util import get_model_matrix, model_to_screen, \ + screen_to_model, vec_subs + + +class PlotCamera: + + min_dist = 0.05 + max_dist = 500.0 + + min_ortho_dist = 100.0 + max_ortho_dist = 10000.0 + + _default_dist = 6.0 + _default_ortho_dist = 600.0 + + rot_presets = { + 'xy': (0, 0, 0), + 'xz': (-90, 0, 0), + 'yz': (0, 90, 0), + 'perspective': (-45, 0, -45) + } + + def __init__(self, window, ortho=False): + self.window = window + self.axes = self.window.plot.axes + self.ortho = ortho + self.reset() + + def init_rot_matrix(self): + pgl.glPushMatrix() + pgl.glLoadIdentity() + self._rot = get_model_matrix() + pgl.glPopMatrix() + + def set_rot_preset(self, preset_name): + self.init_rot_matrix() + if preset_name not in self.rot_presets: + raise ValueError( + "%s is not a valid rotation preset." % preset_name) + r = self.rot_presets[preset_name] + self.euler_rotate(r[0], 1, 0, 0) + self.euler_rotate(r[1], 0, 1, 0) + self.euler_rotate(r[2], 0, 0, 1) + + def reset(self): + self._dist = 0.0 + self._x, self._y = 0.0, 0.0 + self._rot = None + if self.ortho: + self._dist = self._default_ortho_dist + else: + self._dist = self._default_dist + self.init_rot_matrix() + + def mult_rot_matrix(self, rot): + pgl.glPushMatrix() + pgl.glLoadMatrixf(rot) + pgl.glMultMatrixf(self._rot) + self._rot = get_model_matrix() + pgl.glPopMatrix() + + def setup_projection(self): + pgl.glMatrixMode(pgl.GL_PROJECTION) + pgl.glLoadIdentity() + if self.ortho: + # yep, this is pseudo ortho (don't tell anyone) + pgl.gluPerspective( + 0.3, float(self.window.width)/float(self.window.height), + self.min_ortho_dist - 0.01, self.max_ortho_dist + 0.01) + else: + pgl.gluPerspective( + 30.0, float(self.window.width)/float(self.window.height), + self.min_dist - 0.01, self.max_dist + 0.01) + pgl.glMatrixMode(pgl.GL_MODELVIEW) + + def _get_scale(self): + return 1.0, 1.0, 1.0 + + def apply_transformation(self): + pgl.glLoadIdentity() + pgl.glTranslatef(self._x, self._y, -self._dist) + if self._rot is not None: + pgl.glMultMatrixf(self._rot) + pgl.glScalef(*self._get_scale()) + + def spherical_rotate(self, p1, p2, sensitivity=1.0): + mat = get_spherical_rotatation(p1, p2, self.window.width, + self.window.height, sensitivity) + if mat is not None: + self.mult_rot_matrix(mat) + + def euler_rotate(self, angle, x, y, z): + pgl.glPushMatrix() + pgl.glLoadMatrixf(self._rot) + pgl.glRotatef(angle, x, y, z) + self._rot = get_model_matrix() + pgl.glPopMatrix() + + def zoom_relative(self, clicks, sensitivity): + + if self.ortho: + dist_d = clicks * sensitivity * 50.0 + min_dist = self.min_ortho_dist + max_dist = self.max_ortho_dist + else: + dist_d = clicks * sensitivity + min_dist = self.min_dist + max_dist = self.max_dist + + 
new_dist = (self._dist - dist_d) + if (clicks < 0 and new_dist < max_dist) or new_dist > min_dist: + self._dist = new_dist + + def mouse_translate(self, x, y, dx, dy): + pgl.glPushMatrix() + pgl.glLoadIdentity() + pgl.glTranslatef(0, 0, -self._dist) + z = model_to_screen(0, 0, 0)[2] + d = vec_subs(screen_to_model(x, y, z), screen_to_model(x - dx, y - dy, z)) + pgl.glPopMatrix() + self._x += d[0] + self._y += d[1] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/pygletplot/plot_controller.py b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/pygletplot/plot_controller.py new file mode 100644 index 0000000000000000000000000000000000000000..aa7e01e6fd17fddf07b733442208a0a4c9d87d5b --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/pygletplot/plot_controller.py @@ -0,0 +1,218 @@ +from pyglet.window import key +from pyglet.window.mouse import LEFT, RIGHT, MIDDLE +from sympy.plotting.pygletplot.util import get_direction_vectors, get_basis_vectors + + +class PlotController: + + normal_mouse_sensitivity = 4.0 + modified_mouse_sensitivity = 1.0 + + normal_key_sensitivity = 160.0 + modified_key_sensitivity = 40.0 + + keymap = { + key.LEFT: 'left', + key.A: 'left', + key.NUM_4: 'left', + + key.RIGHT: 'right', + key.D: 'right', + key.NUM_6: 'right', + + key.UP: 'up', + key.W: 'up', + key.NUM_8: 'up', + + key.DOWN: 'down', + key.S: 'down', + key.NUM_2: 'down', + + key.Z: 'rotate_z_neg', + key.NUM_1: 'rotate_z_neg', + + key.C: 'rotate_z_pos', + key.NUM_3: 'rotate_z_pos', + + key.Q: 'spin_left', + key.NUM_7: 'spin_left', + key.E: 'spin_right', + key.NUM_9: 'spin_right', + + key.X: 'reset_camera', + key.NUM_5: 'reset_camera', + + key.NUM_ADD: 'zoom_in', + key.PAGEUP: 'zoom_in', + key.R: 'zoom_in', + + key.NUM_SUBTRACT: 'zoom_out', + key.PAGEDOWN: 'zoom_out', + key.F: 'zoom_out', + + key.RSHIFT: 'modify_sensitivity', + key.LSHIFT: 'modify_sensitivity', + + key.F1: 'rot_preset_xy', + key.F2: 'rot_preset_xz', + key.F3: 'rot_preset_yz', + key.F4: 'rot_preset_perspective', + + key.F5: 'toggle_axes', + key.F6: 'toggle_axe_colors', + + key.F8: 'save_image' + } + + def __init__(self, window, *, invert_mouse_zoom=False, **kwargs): + self.invert_mouse_zoom = invert_mouse_zoom + self.window = window + self.camera = window.camera + self.action = { + # Rotation around the view Y (up) vector + 'left': False, + 'right': False, + # Rotation around the view X vector + 'up': False, + 'down': False, + # Rotation around the view Z vector + 'spin_left': False, + 'spin_right': False, + # Rotation around the model Z vector + 'rotate_z_neg': False, + 'rotate_z_pos': False, + # Reset to the default rotation + 'reset_camera': False, + # Performs camera z-translation + 'zoom_in': False, + 'zoom_out': False, + # Use alternative sensitivity (speed) + 'modify_sensitivity': False, + # Rotation presets + 'rot_preset_xy': False, + 'rot_preset_xz': False, + 'rot_preset_yz': False, + 'rot_preset_perspective': False, + # axes + 'toggle_axes': False, + 'toggle_axe_colors': False, + # screenshot + 'save_image': False + } + + def update(self, dt): + z = 0 + if self.action['zoom_out']: + z -= 1 + if self.action['zoom_in']: + z += 1 + if z != 0: + self.camera.zoom_relative(z/10.0, self.get_key_sensitivity()/10.0) + + dx, dy, dz = 0, 0, 0 + if self.action['left']: + dx -= 1 + if self.action['right']: + dx += 1 + if self.action['up']: + dy -= 1 + if self.action['down']: + dy += 1 + if self.action['spin_left']: + dz += 1 + if self.action['spin_right']: + dz -= 1 + + if not 
self.is_2D(): + if dx != 0: + self.camera.euler_rotate(dx*dt*self.get_key_sensitivity(), + *(get_direction_vectors()[1])) + if dy != 0: + self.camera.euler_rotate(dy*dt*self.get_key_sensitivity(), + *(get_direction_vectors()[0])) + if dz != 0: + self.camera.euler_rotate(dz*dt*self.get_key_sensitivity(), + *(get_direction_vectors()[2])) + else: + self.camera.mouse_translate(0, 0, dx*dt*self.get_key_sensitivity(), + -dy*dt*self.get_key_sensitivity()) + + rz = 0 + if self.action['rotate_z_neg'] and not self.is_2D(): + rz -= 1 + if self.action['rotate_z_pos'] and not self.is_2D(): + rz += 1 + + if rz != 0: + self.camera.euler_rotate(rz*dt*self.get_key_sensitivity(), + *(get_basis_vectors()[2])) + + if self.action['reset_camera']: + self.camera.reset() + + if self.action['rot_preset_xy']: + self.camera.set_rot_preset('xy') + if self.action['rot_preset_xz']: + self.camera.set_rot_preset('xz') + if self.action['rot_preset_yz']: + self.camera.set_rot_preset('yz') + if self.action['rot_preset_perspective']: + self.camera.set_rot_preset('perspective') + + if self.action['toggle_axes']: + self.action['toggle_axes'] = False + self.camera.axes.toggle_visible() + + if self.action['toggle_axe_colors']: + self.action['toggle_axe_colors'] = False + self.camera.axes.toggle_colors() + + if self.action['save_image']: + self.action['save_image'] = False + self.window.plot.saveimage() + + return True + + def get_mouse_sensitivity(self): + if self.action['modify_sensitivity']: + return self.modified_mouse_sensitivity + else: + return self.normal_mouse_sensitivity + + def get_key_sensitivity(self): + if self.action['modify_sensitivity']: + return self.modified_key_sensitivity + else: + return self.normal_key_sensitivity + + def on_key_press(self, symbol, modifiers): + if symbol in self.keymap: + self.action[self.keymap[symbol]] = True + + def on_key_release(self, symbol, modifiers): + if symbol in self.keymap: + self.action[self.keymap[symbol]] = False + + def on_mouse_drag(self, x, y, dx, dy, buttons, modifiers): + if buttons & LEFT: + if self.is_2D(): + self.camera.mouse_translate(x, y, dx, dy) + else: + self.camera.spherical_rotate((x - dx, y - dy), (x, y), + self.get_mouse_sensitivity()) + if buttons & MIDDLE: + self.camera.zoom_relative([1, -1][self.invert_mouse_zoom]*dy, + self.get_mouse_sensitivity()/20.0) + if buttons & RIGHT: + self.camera.mouse_translate(x, y, dx, dy) + + def on_mouse_scroll(self, x, y, dx, dy): + self.camera.zoom_relative([1, -1][self.invert_mouse_zoom]*dy, + self.get_mouse_sensitivity()) + + def is_2D(self): + functions = self.window.plot._functions + for i in functions: + if len(functions[i].i_vars) > 1 or len(functions[i].d_vars) > 2: + return False + return True diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/pygletplot/plot_curve.py b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/pygletplot/plot_curve.py new file mode 100644 index 0000000000000000000000000000000000000000..6b97dac843f58c76694d424f0b0b7e3499ba5202 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/pygletplot/plot_curve.py @@ -0,0 +1,82 @@ +import pyglet.gl as pgl +from sympy.core import S +from sympy.plotting.pygletplot.plot_mode_base import PlotModeBase + + +class PlotCurve(PlotModeBase): + + style_override = 'wireframe' + + def _on_calculate_verts(self): + self.t_interval = self.intervals[0] + self.t_set = list(self.t_interval.frange()) + self.bounds = [[S.Infinity, S.NegativeInfinity, 0], + [S.Infinity, S.NegativeInfinity, 0], + [S.Infinity, 
S.NegativeInfinity, 0]] + evaluate = self._get_evaluator() + + self._calculating_verts_pos = 0.0 + self._calculating_verts_len = float(self.t_interval.v_len) + + self.verts = [] + b = self.bounds + for t in self.t_set: + try: + _e = evaluate(t) # calculate vertex + except (NameError, ZeroDivisionError): + _e = None + if _e is not None: # update bounding box + for axis in range(3): + b[axis][0] = min([b[axis][0], _e[axis]]) + b[axis][1] = max([b[axis][1], _e[axis]]) + self.verts.append(_e) + self._calculating_verts_pos += 1.0 + + for axis in range(3): + b[axis][2] = b[axis][1] - b[axis][0] + if b[axis][2] == 0.0: + b[axis][2] = 1.0 + + self.push_wireframe(self.draw_verts(False)) + + def _on_calculate_cverts(self): + if not self.verts or not self.color: + return + + def set_work_len(n): + self._calculating_cverts_len = float(n) + + def inc_work_pos(): + self._calculating_cverts_pos += 1.0 + set_work_len(1) + self._calculating_cverts_pos = 0 + self.cverts = self.color.apply_to_curve(self.verts, + self.t_set, + set_len=set_work_len, + inc_pos=inc_work_pos) + self.push_wireframe(self.draw_verts(True)) + + def calculate_one_cvert(self, t): + vert = self.verts[t] + return self.color(vert[0], vert[1], vert[2], + self.t_set[t], None) + + def draw_verts(self, use_cverts): + def f(): + pgl.glBegin(pgl.GL_LINE_STRIP) + for t in range(len(self.t_set)): + p = self.verts[t] + if p is None: + pgl.glEnd() + pgl.glBegin(pgl.GL_LINE_STRIP) + continue + if use_cverts: + c = self.cverts[t] + if c is None: + c = (0, 0, 0) + pgl.glColor3f(*c) + else: + pgl.glColor3f(*self.default_wireframe_color) + pgl.glVertex3f(*p) + pgl.glEnd() + return f diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/pygletplot/plot_interval.py b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/pygletplot/plot_interval.py new file mode 100644 index 0000000000000000000000000000000000000000..085ab096915bbc4a3761b71736b4dd14f1ff779f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/pygletplot/plot_interval.py @@ -0,0 +1,181 @@ +from sympy.core.singleton import S +from sympy.core.symbol import Symbol +from sympy.core.sympify import sympify +from sympy.core.numbers import Integer + + +class PlotInterval: + """ + """ + _v, _v_min, _v_max, _v_steps = None, None, None, None + + def require_all_args(f): + def check(self, *args, **kwargs): + for g in [self._v, self._v_min, self._v_max, self._v_steps]: + if g is None: + raise ValueError("PlotInterval is incomplete.") + return f(self, *args, **kwargs) + return check + + def __init__(self, *args): + if len(args) == 1: + if isinstance(args[0], PlotInterval): + self.fill_from(args[0]) + return + elif isinstance(args[0], str): + try: + args = eval(args[0]) + except TypeError: + s_eval_error = "Could not interpret string %s." + raise ValueError(s_eval_error % (args[0])) + elif isinstance(args[0], (tuple, list)): + args = args[0] + else: + raise ValueError("Not an interval.") + if not isinstance(args, (tuple, list)) or len(args) > 4: + f_error = "PlotInterval must be a tuple or list of length 4 or less." 
+ raise ValueError(f_error) + + args = list(args) + if len(args) > 0 and (args[0] is None or isinstance(args[0], Symbol)): + self.v = args.pop(0) + if len(args) in [2, 3]: + self.v_min = args.pop(0) + self.v_max = args.pop(0) + if len(args) == 1: + self.v_steps = args.pop(0) + elif len(args) == 1: + self.v_steps = args.pop(0) + + def get_v(self): + return self._v + + def set_v(self, v): + if v is None: + self._v = None + return + if not isinstance(v, Symbol): + raise ValueError("v must be a SymPy Symbol.") + self._v = v + + def get_v_min(self): + return self._v_min + + def set_v_min(self, v_min): + if v_min is None: + self._v_min = None + return + try: + self._v_min = sympify(v_min) + float(self._v_min.evalf()) + except TypeError: + raise ValueError("v_min could not be interpreted as a number.") + + def get_v_max(self): + return self._v_max + + def set_v_max(self, v_max): + if v_max is None: + self._v_max = None + return + try: + self._v_max = sympify(v_max) + float(self._v_max.evalf()) + except TypeError: + raise ValueError("v_max could not be interpreted as a number.") + + def get_v_steps(self): + return self._v_steps + + def set_v_steps(self, v_steps): + if v_steps is None: + self._v_steps = None + return + if isinstance(v_steps, int): + v_steps = Integer(v_steps) + elif not isinstance(v_steps, Integer): + raise ValueError("v_steps must be an int or SymPy Integer.") + if v_steps <= S.Zero: + raise ValueError("v_steps must be positive.") + self._v_steps = v_steps + + @require_all_args + def get_v_len(self): + return self.v_steps + 1 + + v = property(get_v, set_v) + v_min = property(get_v_min, set_v_min) + v_max = property(get_v_max, set_v_max) + v_steps = property(get_v_steps, set_v_steps) + v_len = property(get_v_len) + + def fill_from(self, b): + if b.v is not None: + self.v = b.v + if b.v_min is not None: + self.v_min = b.v_min + if b.v_max is not None: + self.v_max = b.v_max + if b.v_steps is not None: + self.v_steps = b.v_steps + + @staticmethod + def try_parse(*args): + """ + Returns a PlotInterval if args can be interpreted + as such, otherwise None. + """ + if len(args) == 1 and isinstance(args[0], PlotInterval): + return args[0] + try: + return PlotInterval(*args) + except ValueError: + return None + + def _str_base(self): + return ",".join([str(self.v), str(self.v_min), + str(self.v_max), str(self.v_steps)]) + + def __repr__(self): + """ + A string representing the interval in class constructor form. + """ + return "PlotInterval(%s)" % (self._str_base()) + + def __str__(self): + """ + A string representing the interval in list form. + """ + return "[%s]" % (self._str_base()) + + @require_all_args + def assert_complete(self): + pass + + @require_all_args + def vrange(self): + """ + Yields v_steps+1 SymPy numbers ranging from + v_min to v_max. + """ + d = (self.v_max - self.v_min) / self.v_steps + for i in range(self.v_steps + 1): + a = self.v_min + (d * Integer(i)) + yield a + + @require_all_args + def vrange2(self): + """ + Yields v_steps pairs of SymPy numbers ranging from + (v_min, v_min + step) to (v_max - step, v_max). 
+ """ + d = (self.v_max - self.v_min) / self.v_steps + a = self.v_min + (d * S.Zero) + for i in range(self.v_steps): + b = self.v_min + (d * Integer(i + 1)) + yield a, b + a = b + + def frange(self): + for i in self.vrange(): + yield float(i.evalf()) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/pygletplot/plot_mode.py b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/pygletplot/plot_mode.py new file mode 100644 index 0000000000000000000000000000000000000000..f4ee00db9177b98b3259438949836fe5b69416c2 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/pygletplot/plot_mode.py @@ -0,0 +1,400 @@ +from .plot_interval import PlotInterval +from .plot_object import PlotObject +from .util import parse_option_string +from sympy.core.symbol import Symbol +from sympy.core.sympify import sympify +from sympy.geometry.entity import GeometryEntity +from sympy.utilities.iterables import is_sequence + + +class PlotMode(PlotObject): + """ + Grandparent class for plotting + modes. Serves as interface for + registration, lookup, and init + of modes. + + To create a new plot mode, + inherit from PlotModeBase + or one of its children, such + as PlotSurface or PlotCurve. + """ + + ## Class-level attributes + ## used to register and lookup + ## plot modes. See PlotModeBase + ## for descriptions and usage. + + i_vars, d_vars = '', '' + intervals = [] + aliases = [] + is_default = False + + ## Draw is the only method here which + ## is meant to be overridden in child + ## classes, and PlotModeBase provides + ## a base implementation. + def draw(self): + raise NotImplementedError() + + ## Everything else in this file has to + ## do with registration and retrieval + ## of plot modes. This is where I've + ## hidden much of the ugliness of automatic + ## plot mode divination... + + ## Plot mode registry data structures + _mode_alias_list = [] + _mode_map = { + 1: {1: {}, 2: {}}, + 2: {1: {}, 2: {}}, + 3: {1: {}, 2: {}}, + } # [d][i][alias_str]: class + _mode_default_map = { + 1: {}, + 2: {}, + 3: {}, + } # [d][i]: class + _i_var_max, _d_var_max = 2, 3 + + def __new__(cls, *args, **kwargs): + """ + This is the function which interprets + arguments given to Plot.__init__ and + Plot.__setattr__. Returns an initialized + instance of the appropriate child class. + """ + + newargs, newkwargs = PlotMode._extract_options(args, kwargs) + mode_arg = newkwargs.get('mode', '') + + # Interpret the arguments + d_vars, intervals = PlotMode._interpret_args(newargs) + i_vars = PlotMode._find_i_vars(d_vars, intervals) + i, d = max([len(i_vars), len(intervals)]), len(d_vars) + + # Find the appropriate mode + subcls = PlotMode._get_mode(mode_arg, i, d) + + # Create the object + o = object.__new__(subcls) + + # Do some setup for the mode instance + o.d_vars = d_vars + o._fill_i_vars(i_vars) + o._fill_intervals(intervals) + o.options = newkwargs + + return o + + @staticmethod + def _get_mode(mode_arg, i_var_count, d_var_count): + """ + Tries to return an appropriate mode class. + Intended to be called only by __new__. + + mode_arg + Can be a string or a class. If it is a + PlotMode subclass, it is simply returned. + If it is a string, it can an alias for + a mode or an empty string. In the latter + case, we try to find a default mode for + the i_var_count and d_var_count. + + i_var_count + The number of independent variables + needed to evaluate the d_vars. + + d_var_count + The number of dependent variables; + usually the number of functions to + be evaluated in plotting. 
+ + For example, a Cartesian function y = f(x) has + one i_var (x) and one d_var (y). A parametric + form x,y,z = f(u,v), f(u,v), f(u,v) has two + two i_vars (u,v) and three d_vars (x,y,z). + """ + # if the mode_arg is simply a PlotMode class, + # check that the mode supports the numbers + # of independent and dependent vars, then + # return it + try: + m = None + if issubclass(mode_arg, PlotMode): + m = mode_arg + except TypeError: + pass + if m: + if not m._was_initialized: + raise ValueError(("To use unregistered plot mode %s " + "you must first call %s._init_mode().") + % (m.__name__, m.__name__)) + if d_var_count != m.d_var_count: + raise ValueError(("%s can only plot functions " + "with %i dependent variables.") + % (m.__name__, + m.d_var_count)) + if i_var_count > m.i_var_count: + raise ValueError(("%s cannot plot functions " + "with more than %i independent " + "variables.") + % (m.__name__, + m.i_var_count)) + return m + # If it is a string, there are two possibilities. + if isinstance(mode_arg, str): + i, d = i_var_count, d_var_count + if i > PlotMode._i_var_max: + raise ValueError(var_count_error(True, True)) + if d > PlotMode._d_var_max: + raise ValueError(var_count_error(False, True)) + # If the string is '', try to find a suitable + # default mode + if not mode_arg: + return PlotMode._get_default_mode(i, d) + # Otherwise, interpret the string as a mode + # alias (e.g. 'cartesian', 'parametric', etc) + else: + return PlotMode._get_aliased_mode(mode_arg, i, d) + else: + raise ValueError("PlotMode argument must be " + "a class or a string") + + @staticmethod + def _get_default_mode(i, d, i_vars=-1): + if i_vars == -1: + i_vars = i + try: + return PlotMode._mode_default_map[d][i] + except KeyError: + # Keep looking for modes in higher i var counts + # which support the given d var count until we + # reach the max i_var count. + if i < PlotMode._i_var_max: + return PlotMode._get_default_mode(i + 1, d, i_vars) + else: + raise ValueError(("Couldn't find a default mode " + "for %i independent and %i " + "dependent variables.") % (i_vars, d)) + + @staticmethod + def _get_aliased_mode(alias, i, d, i_vars=-1): + if i_vars == -1: + i_vars = i + if alias not in PlotMode._mode_alias_list: + raise ValueError(("Couldn't find a mode called" + " %s. Known modes: %s.") + % (alias, ", ".join(PlotMode._mode_alias_list))) + try: + return PlotMode._mode_map[d][i][alias] + except TypeError: + # Keep looking for modes in higher i var counts + # which support the given d var count and alias + # until we reach the max i_var count. + if i < PlotMode._i_var_max: + return PlotMode._get_aliased_mode(alias, i + 1, d, i_vars) + else: + raise ValueError(("Couldn't find a %s mode " + "for %i independent and %i " + "dependent variables.") + % (alias, i_vars, d)) + + @classmethod + def _register(cls): + """ + Called once for each user-usable plot mode. + For Cartesian2D, it is invoked after the + class definition: Cartesian2D._register() + """ + name = cls.__name__ + cls._init_mode() + + try: + i, d = cls.i_var_count, cls.d_var_count + # Add the mode to _mode_map under all + # given aliases + for a in cls.aliases: + if a not in PlotMode._mode_alias_list: + # Also track valid aliases, so + # we can quickly know when given + # an invalid one in _get_mode. + PlotMode._mode_alias_list.append(a) + PlotMode._mode_map[d][i][a] = cls + if cls.is_default: + # If this mode was marked as the + # default for this d,i combination, + # also set that. 
+ PlotMode._mode_default_map[d][i] = cls + + except Exception as e: + raise RuntimeError(("Failed to register " + "plot mode %s. Reason: %s") + % (name, (str(e)))) + + @classmethod + def _init_mode(cls): + """ + Initializes the plot mode based on + the 'mode-specific parameters' above. + Only intended to be called by + PlotMode._register(). To use a mode without + registering it, you can directly call + ModeSubclass._init_mode(). + """ + def symbols_list(symbol_str): + return [Symbol(s) for s in symbol_str] + + # Convert the vars strs into + # lists of symbols. + cls.i_vars = symbols_list(cls.i_vars) + cls.d_vars = symbols_list(cls.d_vars) + + # Var count is used often, calculate + # it once here + cls.i_var_count = len(cls.i_vars) + cls.d_var_count = len(cls.d_vars) + + if cls.i_var_count > PlotMode._i_var_max: + raise ValueError(var_count_error(True, False)) + if cls.d_var_count > PlotMode._d_var_max: + raise ValueError(var_count_error(False, False)) + + # Try to use first alias as primary_alias + if len(cls.aliases) > 0: + cls.primary_alias = cls.aliases[0] + else: + cls.primary_alias = cls.__name__ + + di = cls.intervals + if len(di) != cls.i_var_count: + raise ValueError("Plot mode must provide a " + "default interval for each i_var.") + for i in range(cls.i_var_count): + # default intervals must be given [min,max,steps] + # (no var, but they must be in the same order as i_vars) + if len(di[i]) != 3: + raise ValueError("length should be equal to 3") + + # Initialize an incomplete interval, + # to later be filled with a var when + # the mode is instantiated. + di[i] = PlotInterval(None, *di[i]) + + # To prevent people from using modes + # without these required fields set up. + cls._was_initialized = True + + _was_initialized = False + + ## Initializer Helper Methods + + @staticmethod + def _find_i_vars(functions, intervals): + i_vars = [] + + # First, collect i_vars in the + # order they are given in any + # intervals. + for i in intervals: + if i.v is None: + continue + elif i.v in i_vars: + raise ValueError(("Multiple intervals given " + "for %s.") % (str(i.v))) + i_vars.append(i.v) + + # Then, find any remaining + # i_vars in given functions + # (aka d_vars) + for f in functions: + for a in f.free_symbols: + if a not in i_vars: + i_vars.append(a) + + return i_vars + + def _fill_i_vars(self, i_vars): + # copy default i_vars + self.i_vars = [Symbol(str(i)) for i in self.i_vars] + # replace with given i_vars + for i in range(len(i_vars)): + self.i_vars[i] = i_vars[i] + + def _fill_intervals(self, intervals): + # copy default intervals + self.intervals = [PlotInterval(i) for i in self.intervals] + # track i_vars used so far + v_used = [] + # fill copy of default + # intervals with given info + for i in range(len(intervals)): + self.intervals[i].fill_from(intervals[i]) + if self.intervals[i].v is not None: + v_used.append(self.intervals[i].v) + # Find any orphan intervals and + # assign them i_vars + for i in range(len(self.intervals)): + if self.intervals[i].v is None: + u = [v for v in self.i_vars if v not in v_used] + if len(u) == 0: + raise ValueError("length should not be equal to 0") + self.intervals[i].v = u[0] + v_used.append(u[0]) + + @staticmethod + def _interpret_args(args): + interval_wrong_order = "PlotInterval %s was given before any function(s)." + interpret_error = "Could not interpret %s as a function or interval." 
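The registration machinery above keys modes by dependent-variable count d and independent-variable count i, and _get_default_mode falls back to higher i counts when no exact default exists. A minimal standalone sketch of that lookup (toy dictionary; the real registry is only populated once the pyglet-backed modes import):

# toy registry mirroring _mode_default_map[d][i]; the entries correspond to the
# modes registered with is_default = True in plot_modes.py
default_map = {
    1: {1: 'Cartesian2D', 2: 'Cartesian3D'},
    2: {1: 'ParametricCurve2D'},
    3: {1: 'ParametricCurve3D', 2: 'ParametricSurface'},
}

def get_default_mode(i, d, i_max=2):
    # walk upward through i_var counts, as PlotMode._get_default_mode does
    for j in range(i, i_max + 1):
        if j in default_map.get(d, {}):
            return default_map[d][j]
    raise ValueError("no default mode for %i independent / %i dependent variables" % (i, d))

print(get_default_mode(1, 1))  # Cartesian2D: y = f(x)
print(get_default_mode(2, 1))  # Cartesian3D: z = f(x, y)
print(get_default_mode(2, 3))  # ParametricSurface: x, y, z = f(u, v)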
+ + functions, intervals = [], [] + if isinstance(args[0], GeometryEntity): + for coords in list(args[0].arbitrary_point()): + functions.append(coords) + intervals.append(PlotInterval.try_parse(args[0].plot_interval())) + else: + for a in args: + i = PlotInterval.try_parse(a) + if i is not None: + if len(functions) == 0: + raise ValueError(interval_wrong_order % (str(i))) + else: + intervals.append(i) + else: + if is_sequence(a, include=str): + raise ValueError(interpret_error % (str(a))) + try: + f = sympify(a) + functions.append(f) + except TypeError: + raise ValueError(interpret_error % str(a)) + + return functions, intervals + + @staticmethod + def _extract_options(args, kwargs): + newkwargs, newargs = {}, [] + for a in args: + if isinstance(a, str): + newkwargs = dict(newkwargs, **parse_option_string(a)) + else: + newargs.append(a) + newkwargs = dict(newkwargs, **kwargs) + return newargs, newkwargs + + +def var_count_error(is_independent, is_plotting): + """ + Used to format an error message which differs + slightly in 4 places. + """ + if is_plotting: + v = "Plotting" + else: + v = "Registering plot modes" + if is_independent: + n, s = PlotMode._i_var_max, "independent" + else: + n, s = PlotMode._d_var_max, "dependent" + return ("%s with more than %i %s variables " + "is not supported.") % (v, n, s) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/pygletplot/plot_mode_base.py b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/pygletplot/plot_mode_base.py new file mode 100644 index 0000000000000000000000000000000000000000..2c6503650afda122e271bdecb2365c8fa20f2376 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/pygletplot/plot_mode_base.py @@ -0,0 +1,378 @@ +import pyglet.gl as pgl +from sympy.core import S +from sympy.plotting.pygletplot.color_scheme import ColorScheme +from sympy.plotting.pygletplot.plot_mode import PlotMode +from sympy.utilities.iterables import is_sequence +from time import sleep +from threading import Thread, Event, RLock +import warnings + + +class PlotModeBase(PlotMode): + """ + Intended parent class for plotting + modes. Provides base functionality + in conjunction with its parent, + PlotMode. + """ + + ## + ## Class-Level Attributes + ## + + """ + The following attributes are meant + to be set at the class level, and serve + as parameters to the plot mode registry + (in PlotMode). See plot_modes.py for + concrete examples. + """ + + """ + i_vars + 'x' for Cartesian2D + 'xy' for Cartesian3D + etc. + + d_vars + 'y' for Cartesian2D + 'r' for Polar + etc. + """ + i_vars, d_vars = '', '' + + """ + intervals + Default intervals for each i_var, and in the + same order. Specified [min, max, steps]. + No variable can be given (it is bound later). + """ + intervals = [] + + """ + aliases + A list of strings which can be used to + access this mode. + 'cartesian' for Cartesian2D and Cartesian3D + 'polar' for Polar + 'cylindrical', 'polar' for Cylindrical + + Note that _init_mode chooses the first alias + in the list as the mode's primary_alias, which + will be displayed to the end user in certain + contexts. + """ + aliases = [] + + """ + is_default + Whether to set this mode as the default + for arguments passed to PlotMode() containing + the same number of d_vars as this mode and + at most the same number of i_vars. + """ + is_default = False + + """ + All of the above attributes are defined in PlotMode. + The following ones are specific to PlotModeBase. + """ + + """ + A list of the render styles. 
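A concrete mode fills in the class-level parameters described above the way Cartesian2D (defined later in plot_modes.py) does. A minimal illustrative subclass, not registered and not part of the vendored sources:

class ExampleCartesian2D:            # the real mode subclasses PlotCurve
    i_vars, d_vars = 'x', 'y'        # one independent and one dependent variable
    intervals = [[-5, 5, 100]]       # default [min, max, steps] for x
    aliases = ['cartesian']
    is_default = True                # default mode for 1 i_var / 1 d_var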
Do not modify. + """ + styles = {'wireframe': 1, 'solid': 2, 'both': 3} + + """ + style_override + Always use this style if not blank. + """ + style_override = '' + + """ + default_wireframe_color + default_solid_color + Can be used when color is None or being calculated. + Used by PlotCurve and PlotSurface, but not anywhere + in PlotModeBase. + """ + + default_wireframe_color = (0.85, 0.85, 0.85) + default_solid_color = (0.6, 0.6, 0.9) + default_rot_preset = 'xy' + + ## + ## Instance-Level Attributes + ## + + ## 'Abstract' member functions + def _get_evaluator(self): + if self.use_lambda_eval: + try: + e = self._get_lambda_evaluator() + return e + except Exception: + warnings.warn("\nWarning: creating lambda evaluator failed. " + "Falling back on SymPy subs evaluator.") + return self._get_sympy_evaluator() + + def _get_sympy_evaluator(self): + raise NotImplementedError() + + def _get_lambda_evaluator(self): + raise NotImplementedError() + + def _on_calculate_verts(self): + raise NotImplementedError() + + def _on_calculate_cverts(self): + raise NotImplementedError() + + ## Base member functions + def __init__(self, *args, bounds_callback=None, **kwargs): + self.verts = [] + self.cverts = [] + self.bounds = [[S.Infinity, S.NegativeInfinity, 0], + [S.Infinity, S.NegativeInfinity, 0], + [S.Infinity, S.NegativeInfinity, 0]] + self.cbounds = [[S.Infinity, S.NegativeInfinity, 0], + [S.Infinity, S.NegativeInfinity, 0], + [S.Infinity, S.NegativeInfinity, 0]] + + self._draw_lock = RLock() + + self._calculating_verts = Event() + self._calculating_cverts = Event() + self._calculating_verts_pos = 0.0 + self._calculating_verts_len = 0.0 + self._calculating_cverts_pos = 0.0 + self._calculating_cverts_len = 0.0 + + self._max_render_stack_size = 3 + self._draw_wireframe = [-1] + self._draw_solid = [-1] + + self._style = None + self._color = None + + self.predraw = [] + self.postdraw = [] + + self.use_lambda_eval = self.options.pop('use_sympy_eval', None) is None + self.style = self.options.pop('style', '') + self.color = self.options.pop('color', 'rainbow') + self.bounds_callback = bounds_callback + + self._on_calculate() + + def synchronized(f): + def w(self, *args, **kwargs): + self._draw_lock.acquire() + try: + r = f(self, *args, **kwargs) + return r + finally: + self._draw_lock.release() + return w + + @synchronized + def push_wireframe(self, function): + """ + Push a function which performs gl commands + used to build a display list. (The list is + built outside of the function) + """ + assert callable(function) + self._draw_wireframe.append(function) + if len(self._draw_wireframe) > self._max_render_stack_size: + del self._draw_wireframe[1] # leave marker element + + @synchronized + def push_solid(self, function): + """ + Push a function which performs gl commands + used to build a display list. 
(The list is + built outside of the function) + """ + assert callable(function) + self._draw_solid.append(function) + if len(self._draw_solid) > self._max_render_stack_size: + del self._draw_solid[1] # leave marker element + + def _create_display_list(self, function): + dl = pgl.glGenLists(1) + pgl.glNewList(dl, pgl.GL_COMPILE) + function() + pgl.glEndList() + return dl + + def _render_stack_top(self, render_stack): + top = render_stack[-1] + if top == -1: + return -1 # nothing to display + elif callable(top): + dl = self._create_display_list(top) + render_stack[-1] = (dl, top) + return dl # display newly added list + elif len(top) == 2: + if pgl.GL_TRUE == pgl.glIsList(top[0]): + return top[0] # display stored list + dl = self._create_display_list(top[1]) + render_stack[-1] = (dl, top[1]) + return dl # display regenerated list + + def _draw_solid_display_list(self, dl): + pgl.glPushAttrib(pgl.GL_ENABLE_BIT | pgl.GL_POLYGON_BIT) + pgl.glPolygonMode(pgl.GL_FRONT_AND_BACK, pgl.GL_FILL) + pgl.glCallList(dl) + pgl.glPopAttrib() + + def _draw_wireframe_display_list(self, dl): + pgl.glPushAttrib(pgl.GL_ENABLE_BIT | pgl.GL_POLYGON_BIT) + pgl.glPolygonMode(pgl.GL_FRONT_AND_BACK, pgl.GL_LINE) + pgl.glEnable(pgl.GL_POLYGON_OFFSET_LINE) + pgl.glPolygonOffset(-0.005, -50.0) + pgl.glCallList(dl) + pgl.glPopAttrib() + + @synchronized + def draw(self): + for f in self.predraw: + if callable(f): + f() + if self.style_override: + style = self.styles[self.style_override] + else: + style = self.styles[self._style] + # Draw solid component if style includes solid + if style & 2: + dl = self._render_stack_top(self._draw_solid) + if dl > 0 and pgl.GL_TRUE == pgl.glIsList(dl): + self._draw_solid_display_list(dl) + # Draw wireframe component if style includes wireframe + if style & 1: + dl = self._render_stack_top(self._draw_wireframe) + if dl > 0 and pgl.GL_TRUE == pgl.glIsList(dl): + self._draw_wireframe_display_list(dl) + for f in self.postdraw: + if callable(f): + f() + + def _on_change_color(self, color): + Thread(target=self._calculate_cverts).start() + + def _on_calculate(self): + Thread(target=self._calculate_all).start() + + def _calculate_all(self): + self._calculate_verts() + self._calculate_cverts() + + def _calculate_verts(self): + if self._calculating_verts.is_set(): + return + self._calculating_verts.set() + try: + self._on_calculate_verts() + finally: + self._calculating_verts.clear() + if callable(self.bounds_callback): + self.bounds_callback() + + def _calculate_cverts(self): + if self._calculating_verts.is_set(): + return + while self._calculating_cverts.is_set(): + sleep(0) # wait for previous calculation + self._calculating_cverts.set() + try: + self._on_calculate_cverts() + finally: + self._calculating_cverts.clear() + + def _get_calculating_verts(self): + return self._calculating_verts.is_set() + + def _get_calculating_verts_pos(self): + return self._calculating_verts_pos + + def _get_calculating_verts_len(self): + return self._calculating_verts_len + + def _get_calculating_cverts(self): + return self._calculating_cverts.is_set() + + def _get_calculating_cverts_pos(self): + return self._calculating_cverts_pos + + def _get_calculating_cverts_len(self): + return self._calculating_cverts_len + + ## Property handlers + def _get_style(self): + return self._style + + @synchronized + def _set_style(self, v): + if v is None: + return + if v == '': + step_max = 0 + for i in self.intervals: + if i.v_steps is None: + continue + step_max = max([step_max, int(i.v_steps)]) + v = ['both', 
'solid'][step_max > 40] + if v not in self.styles: + raise ValueError("v should be there in self.styles") + if v == self._style: + return + self._style = v + + def _get_color(self): + return self._color + + @synchronized + def _set_color(self, v): + try: + if v is not None: + if is_sequence(v): + v = ColorScheme(*v) + else: + v = ColorScheme(v) + if repr(v) == repr(self._color): + return + self._on_change_color(v) + self._color = v + except Exception as e: + raise RuntimeError("Color change failed. " + "Reason: %s" % (str(e))) + + style = property(_get_style, _set_style) + color = property(_get_color, _set_color) + + calculating_verts = property(_get_calculating_verts) + calculating_verts_pos = property(_get_calculating_verts_pos) + calculating_verts_len = property(_get_calculating_verts_len) + + calculating_cverts = property(_get_calculating_cverts) + calculating_cverts_pos = property(_get_calculating_cverts_pos) + calculating_cverts_len = property(_get_calculating_cverts_len) + + ## String representations + + def __str__(self): + f = ", ".join(str(d) for d in self.d_vars) + o = "'mode=%s'" % (self.primary_alias) + return ", ".join([f, o]) + + def __repr__(self): + f = ", ".join(str(d) for d in self.d_vars) + i = ", ".join(str(i) for i in self.intervals) + d = [('mode', self.primary_alias), + ('color', str(self.color)), + ('style', str(self.style))] + + o = "'%s'" % ("; ".join("%s=%s" % (k, v) + for k, v in d if v != 'None')) + return ", ".join([f, i, o]) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/pygletplot/plot_modes.py b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/pygletplot/plot_modes.py new file mode 100644 index 0000000000000000000000000000000000000000..e78e0b4ce291b071f684fa3ffc02f456dffe0023 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/pygletplot/plot_modes.py @@ -0,0 +1,209 @@ +from sympy.utilities.lambdify import lambdify +from sympy.core.numbers import pi +from sympy.functions import sin, cos +from sympy.plotting.pygletplot.plot_curve import PlotCurve +from sympy.plotting.pygletplot.plot_surface import PlotSurface + +from math import sin as p_sin +from math import cos as p_cos + + +def float_vec3(f): + def inner(*args): + v = f(*args) + return float(v[0]), float(v[1]), float(v[2]) + return inner + + +class Cartesian2D(PlotCurve): + i_vars, d_vars = 'x', 'y' + intervals = [[-5, 5, 100]] + aliases = ['cartesian'] + is_default = True + + def _get_sympy_evaluator(self): + fy = self.d_vars[0] + x = self.t_interval.v + + @float_vec3 + def e(_x): + return (_x, fy.subs(x, _x), 0.0) + return e + + def _get_lambda_evaluator(self): + fy = self.d_vars[0] + x = self.t_interval.v + return lambdify([x], [x, fy, 0.0]) + + +class Cartesian3D(PlotSurface): + i_vars, d_vars = 'xy', 'z' + intervals = [[-1, 1, 40], [-1, 1, 40]] + aliases = ['cartesian', 'monge'] + is_default = True + + def _get_sympy_evaluator(self): + fz = self.d_vars[0] + x = self.u_interval.v + y = self.v_interval.v + + @float_vec3 + def e(_x, _y): + return (_x, _y, fz.subs(x, _x).subs(y, _y)) + return e + + def _get_lambda_evaluator(self): + fz = self.d_vars[0] + x = self.u_interval.v + y = self.v_interval.v + return lambdify([x, y], [x, y, fz]) + + +class ParametricCurve2D(PlotCurve): + i_vars, d_vars = 't', 'xy' + intervals = [[0, 2*pi, 100]] + aliases = ['parametric'] + is_default = True + + def _get_sympy_evaluator(self): + fx, fy = self.d_vars + t = self.t_interval.v + + @float_vec3 + def e(_t): + return (fx.subs(t, _t), fy.subs(t, _t), 0.0) + 
return e + + def _get_lambda_evaluator(self): + fx, fy = self.d_vars + t = self.t_interval.v + return lambdify([t], [fx, fy, 0.0]) + + +class ParametricCurve3D(PlotCurve): + i_vars, d_vars = 't', 'xyz' + intervals = [[0, 2*pi, 100]] + aliases = ['parametric'] + is_default = True + + def _get_sympy_evaluator(self): + fx, fy, fz = self.d_vars + t = self.t_interval.v + + @float_vec3 + def e(_t): + return (fx.subs(t, _t), fy.subs(t, _t), fz.subs(t, _t)) + return e + + def _get_lambda_evaluator(self): + fx, fy, fz = self.d_vars + t = self.t_interval.v + return lambdify([t], [fx, fy, fz]) + + +class ParametricSurface(PlotSurface): + i_vars, d_vars = 'uv', 'xyz' + intervals = [[-1, 1, 40], [-1, 1, 40]] + aliases = ['parametric'] + is_default = True + + def _get_sympy_evaluator(self): + fx, fy, fz = self.d_vars + u = self.u_interval.v + v = self.v_interval.v + + @float_vec3 + def e(_u, _v): + return (fx.subs(u, _u).subs(v, _v), + fy.subs(u, _u).subs(v, _v), + fz.subs(u, _u).subs(v, _v)) + return e + + def _get_lambda_evaluator(self): + fx, fy, fz = self.d_vars + u = self.u_interval.v + v = self.v_interval.v + return lambdify([u, v], [fx, fy, fz]) + + +class Polar(PlotCurve): + i_vars, d_vars = 't', 'r' + intervals = [[0, 2*pi, 100]] + aliases = ['polar'] + is_default = False + + def _get_sympy_evaluator(self): + fr = self.d_vars[0] + t = self.t_interval.v + + def e(_t): + _r = float(fr.subs(t, _t)) + return (_r*p_cos(_t), _r*p_sin(_t), 0.0) + return e + + def _get_lambda_evaluator(self): + fr = self.d_vars[0] + t = self.t_interval.v + fx, fy = fr*cos(t), fr*sin(t) + return lambdify([t], [fx, fy, 0.0]) + + +class Cylindrical(PlotSurface): + i_vars, d_vars = 'th', 'r' + intervals = [[0, 2*pi, 40], [-1, 1, 20]] + aliases = ['cylindrical', 'polar'] + is_default = False + + def _get_sympy_evaluator(self): + fr = self.d_vars[0] + t = self.u_interval.v + h = self.v_interval.v + + def e(_t, _h): + _r = float(fr.subs(t, _t).subs(h, _h)) + return (_r*p_cos(_t), _r*p_sin(_t), _h) + return e + + def _get_lambda_evaluator(self): + fr = self.d_vars[0] + t = self.u_interval.v + h = self.v_interval.v + fx, fy = fr*cos(t), fr*sin(t) + return lambdify([t, h], [fx, fy, h]) + + +class Spherical(PlotSurface): + i_vars, d_vars = 'tp', 'r' + intervals = [[0, 2*pi, 40], [0, pi, 20]] + aliases = ['spherical'] + is_default = False + + def _get_sympy_evaluator(self): + fr = self.d_vars[0] + t = self.u_interval.v + p = self.v_interval.v + + def e(_t, _p): + _r = float(fr.subs(t, _t).subs(p, _p)) + return (_r*p_cos(_t)*p_sin(_p), + _r*p_sin(_t)*p_sin(_p), + _r*p_cos(_p)) + return e + + def _get_lambda_evaluator(self): + fr = self.d_vars[0] + t = self.u_interval.v + p = self.v_interval.v + fx = fr * cos(t) * sin(p) + fy = fr * sin(t) * sin(p) + fz = fr * cos(p) + return lambdify([t, p], [fx, fy, fz]) + +Cartesian2D._register() +Cartesian3D._register() +ParametricCurve2D._register() +ParametricCurve3D._register() +ParametricSurface._register() +Polar._register() +Cylindrical._register() +Spherical._register() diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/pygletplot/plot_object.py b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/pygletplot/plot_object.py new file mode 100644 index 0000000000000000000000000000000000000000..e51040fb8b1a52c49d849b96692f6c0dba329d75 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/pygletplot/plot_object.py @@ -0,0 +1,17 @@ +class PlotObject: + """ + Base class for objects which can be displayed in + a Plot. 
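The Polar, Cylindrical and Spherical modes above all turn a radial function into Cartesian (x, y, z) points. A standalone spot check of the spherical mapping with fr = 1 (pure math, no pyglet required):

from math import sin, cos, pi

def spherical_point(r, t, p):
    # same mapping as Spherical._get_sympy_evaluator: azimuth t, polar angle p
    return (r*cos(t)*sin(p), r*sin(t)*sin(p), r*cos(p))

print(spherical_point(1.0, 0.0, pi/2))   # ~(1.0, 0.0, 0.0)
print(spherical_point(1.0, pi/2, pi/2))  # ~(0.0, 1.0, 0.0)
print(spherical_point(1.0, 0.0, 0.0))    # ~(0.0, 0.0, 1.0)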
+ """ + visible = True + + def _draw(self): + if self.visible: + self.draw() + + def draw(self): + """ + OpenGL rendering code for the plot object. + Override in base class. + """ + pass diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/pygletplot/plot_rotation.py b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/pygletplot/plot_rotation.py new file mode 100644 index 0000000000000000000000000000000000000000..11ede2d1c3e74e5470cf601348e494c35720b9a8 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/pygletplot/plot_rotation.py @@ -0,0 +1,68 @@ +try: + from ctypes import c_float +except ImportError: + pass + +import pyglet.gl as pgl +from math import sqrt as _sqrt, acos as _acos, pi + + +def cross(a, b): + return (a[1] * b[2] - a[2] * b[1], + a[2] * b[0] - a[0] * b[2], + a[0] * b[1] - a[1] * b[0]) + + +def dot(a, b): + return a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + + +def mag(a): + return _sqrt(a[0]**2 + a[1]**2 + a[2]**2) + + +def norm(a): + m = mag(a) + return (a[0] / m, a[1] / m, a[2] / m) + + +def get_sphere_mapping(x, y, width, height): + x = min([max([x, 0]), width]) + y = min([max([y, 0]), height]) + + sr = _sqrt((width/2)**2 + (height/2)**2) + sx = ((x - width / 2) / sr) + sy = ((y - height / 2) / sr) + + sz = 1.0 - sx**2 - sy**2 + + if sz > 0.0: + sz = _sqrt(sz) + return (sx, sy, sz) + else: + sz = 0 + return norm((sx, sy, sz)) + +rad2deg = 180.0 / pi + + +def get_spherical_rotatation(p1, p2, width, height, theta_multiplier): + v1 = get_sphere_mapping(p1[0], p1[1], width, height) + v2 = get_sphere_mapping(p2[0], p2[1], width, height) + + d = min(max([dot(v1, v2), -1]), 1) + + if abs(d - 1.0) < 0.000001: + return None + + raxis = norm( cross(v1, v2) ) + rtheta = theta_multiplier * rad2deg * _acos(d) + + pgl.glPushMatrix() + pgl.glLoadIdentity() + pgl.glRotatef(rtheta, *raxis) + mat = (c_float*16)() + pgl.glGetFloatv(pgl.GL_MODELVIEW_MATRIX, mat) + pgl.glPopMatrix() + + return mat diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/pygletplot/plot_surface.py b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/pygletplot/plot_surface.py new file mode 100644 index 0000000000000000000000000000000000000000..ed421eebb441d193f4d9b763f56e146c11e5a42c --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/pygletplot/plot_surface.py @@ -0,0 +1,102 @@ +import pyglet.gl as pgl + +from sympy.core import S +from sympy.plotting.pygletplot.plot_mode_base import PlotModeBase + + +class PlotSurface(PlotModeBase): + + default_rot_preset = 'perspective' + + def _on_calculate_verts(self): + self.u_interval = self.intervals[0] + self.u_set = list(self.u_interval.frange()) + self.v_interval = self.intervals[1] + self.v_set = list(self.v_interval.frange()) + self.bounds = [[S.Infinity, S.NegativeInfinity, 0], + [S.Infinity, S.NegativeInfinity, 0], + [S.Infinity, S.NegativeInfinity, 0]] + evaluate = self._get_evaluator() + + self._calculating_verts_pos = 0.0 + self._calculating_verts_len = float( + self.u_interval.v_len*self.v_interval.v_len) + + verts = [] + b = self.bounds + for u in self.u_set: + column = [] + for v in self.v_set: + try: + _e = evaluate(u, v) # calculate vertex + except ZeroDivisionError: + _e = None + if _e is not None: # update bounding box + for axis in range(3): + b[axis][0] = min([b[axis][0], _e[axis]]) + b[axis][1] = max([b[axis][1], _e[axis]]) + column.append(_e) + self._calculating_verts_pos += 1.0 + + verts.append(column) + for axis in range(3): + b[axis][2] = 
b[axis][1] - b[axis][0] + if b[axis][2] == 0.0: + b[axis][2] = 1.0 + + self.verts = verts + self.push_wireframe(self.draw_verts(False, False)) + self.push_solid(self.draw_verts(False, True)) + + def _on_calculate_cverts(self): + if not self.verts or not self.color: + return + + def set_work_len(n): + self._calculating_cverts_len = float(n) + + def inc_work_pos(): + self._calculating_cverts_pos += 1.0 + set_work_len(1) + self._calculating_cverts_pos = 0 + self.cverts = self.color.apply_to_surface(self.verts, + self.u_set, + self.v_set, + set_len=set_work_len, + inc_pos=inc_work_pos) + self.push_solid(self.draw_verts(True, True)) + + def calculate_one_cvert(self, u, v): + vert = self.verts[u][v] + return self.color(vert[0], vert[1], vert[2], + self.u_set[u], self.v_set[v]) + + def draw_verts(self, use_cverts, use_solid_color): + def f(): + for u in range(1, len(self.u_set)): + pgl.glBegin(pgl.GL_QUAD_STRIP) + for v in range(len(self.v_set)): + pa = self.verts[u - 1][v] + pb = self.verts[u][v] + if pa is None or pb is None: + pgl.glEnd() + pgl.glBegin(pgl.GL_QUAD_STRIP) + continue + if use_cverts: + ca = self.cverts[u - 1][v] + cb = self.cverts[u][v] + if ca is None: + ca = (0, 0, 0) + if cb is None: + cb = (0, 0, 0) + else: + if use_solid_color: + ca = cb = self.default_solid_color + else: + ca = cb = self.default_wireframe_color + pgl.glColor3f(*ca) + pgl.glVertex3f(*pa) + pgl.glColor3f(*cb) + pgl.glVertex3f(*pb) + pgl.glEnd() + return f diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/pygletplot/plot_window.py b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/pygletplot/plot_window.py new file mode 100644 index 0000000000000000000000000000000000000000..d9df4cc453acb05d7c2d871e9e8efeb36905de5d --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/pygletplot/plot_window.py @@ -0,0 +1,144 @@ +from time import perf_counter + + +import pyglet.gl as pgl + +from sympy.plotting.pygletplot.managed_window import ManagedWindow +from sympy.plotting.pygletplot.plot_camera import PlotCamera +from sympy.plotting.pygletplot.plot_controller import PlotController + + +class PlotWindow(ManagedWindow): + + def __init__(self, plot, antialiasing=True, ortho=False, + invert_mouse_zoom=False, linewidth=1.5, caption="SymPy Plot", + **kwargs): + """ + Named Arguments + =============== + + antialiasing = True + True OR False + ortho = False + True OR False + invert_mouse_zoom = False + True OR False + """ + self.plot = plot + + self.camera = None + self._calculating = False + + self.antialiasing = antialiasing + self.ortho = ortho + self.invert_mouse_zoom = invert_mouse_zoom + self.linewidth = linewidth + self.title = caption + self.last_caption_update = 0 + self.caption_update_interval = 0.2 + self.drawing_first_object = True + + super().__init__(**kwargs) + + def setup(self): + self.camera = PlotCamera(self, ortho=self.ortho) + self.controller = PlotController(self, + invert_mouse_zoom=self.invert_mouse_zoom) + self.push_handlers(self.controller) + + pgl.glClearColor(1.0, 1.0, 1.0, 0.0) + pgl.glClearDepth(1.0) + + pgl.glDepthFunc(pgl.GL_LESS) + pgl.glEnable(pgl.GL_DEPTH_TEST) + + pgl.glEnable(pgl.GL_LINE_SMOOTH) + pgl.glShadeModel(pgl.GL_SMOOTH) + pgl.glLineWidth(self.linewidth) + + pgl.glEnable(pgl.GL_BLEND) + pgl.glBlendFunc(pgl.GL_SRC_ALPHA, pgl.GL_ONE_MINUS_SRC_ALPHA) + + if self.antialiasing: + pgl.glHint(pgl.GL_LINE_SMOOTH_HINT, pgl.GL_NICEST) + pgl.glHint(pgl.GL_POLYGON_SMOOTH_HINT, pgl.GL_NICEST) + + self.camera.setup_projection() + + def 
on_resize(self, w, h): + super().on_resize(w, h) + if self.camera is not None: + self.camera.setup_projection() + + def update(self, dt): + self.controller.update(dt) + + def draw(self): + self.plot._render_lock.acquire() + self.camera.apply_transformation() + + calc_verts_pos, calc_verts_len = 0, 0 + calc_cverts_pos, calc_cverts_len = 0, 0 + + should_update_caption = (perf_counter() - self.last_caption_update > + self.caption_update_interval) + + if len(self.plot._functions.values()) == 0: + self.drawing_first_object = True + + iterfunctions = iter(self.plot._functions.values()) + + for r in iterfunctions: + if self.drawing_first_object: + self.camera.set_rot_preset(r.default_rot_preset) + self.drawing_first_object = False + + pgl.glPushMatrix() + r._draw() + pgl.glPopMatrix() + + # might as well do this while we are + # iterating and have the lock rather + # than locking and iterating twice + # per frame: + + if should_update_caption: + try: + if r.calculating_verts: + calc_verts_pos += r.calculating_verts_pos + calc_verts_len += r.calculating_verts_len + if r.calculating_cverts: + calc_cverts_pos += r.calculating_cverts_pos + calc_cverts_len += r.calculating_cverts_len + except ValueError: + pass + + for r in self.plot._pobjects: + pgl.glPushMatrix() + r._draw() + pgl.glPopMatrix() + + if should_update_caption: + self.update_caption(calc_verts_pos, calc_verts_len, + calc_cverts_pos, calc_cverts_len) + self.last_caption_update = perf_counter() + + if self.plot._screenshot: + self.plot._screenshot._execute_saving() + + self.plot._render_lock.release() + + def update_caption(self, calc_verts_pos, calc_verts_len, + calc_cverts_pos, calc_cverts_len): + caption = self.title + if calc_verts_len or calc_cverts_len: + caption += " (calculating" + if calc_verts_len > 0: + p = (calc_verts_pos / calc_verts_len) * 100 + caption += " vertices %i%%" % (p) + if calc_cverts_len > 0: + p = (calc_cverts_pos / calc_cverts_len) * 100 + caption += " colors %i%%" % (p) + caption += ")" + if self.caption != caption: + self.set_caption(caption) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/pygletplot/util.py b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/pygletplot/util.py new file mode 100644 index 0000000000000000000000000000000000000000..43b882ca18274dcdb273cf35680016453db3c698 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/pygletplot/util.py @@ -0,0 +1,188 @@ +try: + from ctypes import c_float, c_int, c_double +except ImportError: + pass + +import pyglet.gl as pgl +from sympy.core import S + + +def get_model_matrix(array_type=c_float, glGetMethod=pgl.glGetFloatv): + """ + Returns the current modelview matrix. + """ + m = (array_type*16)() + glGetMethod(pgl.GL_MODELVIEW_MATRIX, m) + return m + + +def get_projection_matrix(array_type=c_float, glGetMethod=pgl.glGetFloatv): + """ + Returns the current modelview matrix. + """ + m = (array_type*16)() + glGetMethod(pgl.GL_PROJECTION_MATRIX, m) + return m + + +def get_viewport(): + """ + Returns the current viewport. 
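The direction-vector helpers defined next slice the flat 16-element, column-major OpenGL modelview matrix into triples. A standalone illustration using an identity matrix instead of live GL state (the ctypes array stands in for what glGetFloatv would fill):

from ctypes import c_float

m = (c_float * 16)(1, 0, 0, 0,
                   0, 1, 0, 0,
                   0, 0, 1, 0,
                   0, 0, 0, 1)
rows = ((m[0], m[4], m[8]),
        (m[1], m[5], m[9]),
        (m[2], m[6], m[10]))   # what get_direction_vectors returns
cols = ((m[0], m[1], m[2]),
        (m[4], m[5], m[6]),
        (m[8], m[9], m[10]))   # what get_view_direction_vectors returns
print(rows)  # ((1.0, 0.0, 0.0), (0.0, 1.0, 0.0), (0.0, 0.0, 1.0))
print(cols)  # identical for the identity matrix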
+ """ + m = (c_int*4)() + pgl.glGetIntegerv(pgl.GL_VIEWPORT, m) + return m + + +def get_direction_vectors(): + m = get_model_matrix() + return ((m[0], m[4], m[8]), + (m[1], m[5], m[9]), + (m[2], m[6], m[10])) + + +def get_view_direction_vectors(): + m = get_model_matrix() + return ((m[0], m[1], m[2]), + (m[4], m[5], m[6]), + (m[8], m[9], m[10])) + + +def get_basis_vectors(): + return ((1, 0, 0), (0, 1, 0), (0, 0, 1)) + + +def screen_to_model(x, y, z): + m = get_model_matrix(c_double, pgl.glGetDoublev) + p = get_projection_matrix(c_double, pgl.glGetDoublev) + w = get_viewport() + mx, my, mz = c_double(), c_double(), c_double() + pgl.gluUnProject(x, y, z, m, p, w, mx, my, mz) + return float(mx.value), float(my.value), float(mz.value) + + +def model_to_screen(x, y, z): + m = get_model_matrix(c_double, pgl.glGetDoublev) + p = get_projection_matrix(c_double, pgl.glGetDoublev) + w = get_viewport() + mx, my, mz = c_double(), c_double(), c_double() + pgl.gluProject(x, y, z, m, p, w, mx, my, mz) + return float(mx.value), float(my.value), float(mz.value) + + +def vec_subs(a, b): + return tuple(a[i] - b[i] for i in range(len(a))) + + +def billboard_matrix(): + """ + Removes rotational components of + current matrix so that primitives + are always drawn facing the viewer. + + |1|0|0|x| + |0|1|0|x| + |0|0|1|x| (x means left unchanged) + |x|x|x|x| + """ + m = get_model_matrix() + # XXX: for i in range(11): m[i] = i ? + m[0] = 1 + m[1] = 0 + m[2] = 0 + m[4] = 0 + m[5] = 1 + m[6] = 0 + m[8] = 0 + m[9] = 0 + m[10] = 1 + pgl.glLoadMatrixf(m) + + +def create_bounds(): + return [[S.Infinity, S.NegativeInfinity, 0], + [S.Infinity, S.NegativeInfinity, 0], + [S.Infinity, S.NegativeInfinity, 0]] + + +def update_bounds(b, v): + if v is None: + return + for axis in range(3): + b[axis][0] = min([b[axis][0], v[axis]]) + b[axis][1] = max([b[axis][1], v[axis]]) + + +def interpolate(a_min, a_max, a_ratio): + return a_min + a_ratio * (a_max - a_min) + + +def rinterpolate(a_min, a_max, a_value): + a_range = a_max - a_min + if a_max == a_min: + a_range = 1.0 + return (a_value - a_min) / float(a_range) + + +def interpolate_color(color1, color2, ratio): + return tuple(interpolate(color1[i], color2[i], ratio) for i in range(3)) + + +def scale_value(v, v_min, v_len): + return (v - v_min) / v_len + + +def scale_value_list(flist): + v_min, v_max = min(flist), max(flist) + v_len = v_max - v_min + return [scale_value(f, v_min, v_len) for f in flist] + + +def strided_range(r_min, r_max, stride, max_steps=50): + o_min, o_max = r_min, r_max + if abs(r_min - r_max) < 0.001: + return [] + try: + range(int(r_min - r_max)) + except (TypeError, OverflowError): + return [] + if r_min > r_max: + raise ValueError("r_min cannot be greater than r_max") + r_min_s = (r_min % stride) + r_max_s = stride - (r_max % stride) + if abs(r_max_s - stride) < 0.001: + r_max_s = 0.0 + r_min -= r_min_s + r_max += r_max_s + r_steps = int((r_max - r_min)/stride) + if max_steps and r_steps > max_steps: + return strided_range(o_min, o_max, stride*2) + return [r_min] + [r_min + e*stride for e in range(1, r_steps + 1)] + [r_max] + + +def parse_option_string(s): + if not isinstance(s, str): + return None + options = {} + for token in s.split(';'): + pieces = token.split('=') + if len(pieces) == 1: + option, value = pieces[0], "" + elif len(pieces) == 2: + option, value = pieces + else: + raise ValueError("Plot option string '%s' is malformed." 
% (s)) + options[option.strip()] = value.strip() + return options + + +def dot_product(v1, v2): + return sum(v1[i]*v2[i] for i in range(3)) + + +def vec_sub(v1, v2): + return tuple(v1[i] - v2[i] for i in range(3)) + + +def vec_mag(v): + return sum(v[i]**2 for i in range(3))**(0.5) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/tests/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/tests/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..96415d4a49397e5ffb87d6dd66950b657b113109 Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/tests/__pycache__/__init__.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/tests/__pycache__/test_experimental_lambdify.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/tests/__pycache__/test_experimental_lambdify.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7ee5a114d38659635a9edd109c35e10932360685 Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/tests/__pycache__/test_experimental_lambdify.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/tests/__pycache__/test_plot.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/tests/__pycache__/test_plot.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..439c2acf483fd51324b68bbc266b6c2365008e36 Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/tests/__pycache__/test_plot.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/tests/__pycache__/test_plot_implicit.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/tests/__pycache__/test_plot_implicit.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..57f5fa8078c1679b521571b7b1392388c08eafaa Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/tests/__pycache__/test_plot_implicit.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/tests/__pycache__/test_textplot.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/tests/__pycache__/test_textplot.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..382b75d9f2d416d2ca54aebd8422125bb88a9e63 Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/tests/__pycache__/test_textplot.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/tests/__pycache__/test_utils.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/tests/__pycache__/test_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..48c951471e65a1d29a0f572ec195fdae4db3b06f Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/plotting/tests/__pycache__/test_utils.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9909b6394a9c480ecdf8020bcbff5a2020eb53d9 Binary files /dev/null and 
b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/__pycache__/__init__.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/__pycache__/matrices.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/__pycache__/matrices.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a9963e07c93b8f1232d36e0e43ddd8ff53a20901 Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/__pycache__/matrices.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/__pycache__/pytest.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/__pycache__/pytest.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0706f62ccfc23ee971713ca3e7d81a69a9df0372 Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/__pycache__/pytest.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/__pycache__/quality_unicode.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/__pycache__/quality_unicode.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..742c59832e1cf911a08770356723916c7929409d Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/__pycache__/quality_unicode.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/__pycache__/randtest.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/__pycache__/randtest.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..96c0093b32c1d56a7776266d6994fd42a31f3cda Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/__pycache__/randtest.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/__pycache__/runtests.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/__pycache__/runtests.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a5444db1216a7a9b4e68a646f278f76abe016de9 Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/__pycache__/runtests.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/__pycache__/runtests_pytest.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/__pycache__/runtests_pytest.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..151c0d5e349712a45a8775412acb91ae28377570 Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/__pycache__/runtests_pytest.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/__pycache__/tmpfiles.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/__pycache__/tmpfiles.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1a946599e28f3fef2bb3eb5a7e800d4018ada6e1 Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/__pycache__/tmpfiles.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/tests/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/tests/__init__.py new file mode 100644 index 
0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/tests/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/tests/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..11728aae37a5850b6f9aad1d6498604ad1a2be3b Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/tests/__pycache__/__init__.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/tests/__pycache__/diagnose_imports.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/tests/__pycache__/diagnose_imports.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..69254ff97b6bfd9a641331ce5b2bd2f7c2e34269 Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/tests/__pycache__/diagnose_imports.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/tests/__pycache__/test_code_quality.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/tests/__pycache__/test_code_quality.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a4a0de4659be89378a059050e8e8a82182532c39 Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/tests/__pycache__/test_code_quality.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/tests/__pycache__/test_deprecated.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/tests/__pycache__/test_deprecated.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..45a5d39b880f917a78be8680b80405797479618d Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/tests/__pycache__/test_deprecated.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/tests/__pycache__/test_module_imports.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/tests/__pycache__/test_module_imports.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c51b1f107d2334a68582269d6fdec98fbeb3903e Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/tests/__pycache__/test_module_imports.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/tests/__pycache__/test_pytest.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/tests/__pycache__/test_pytest.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1d9020560613dae2776c4213e02c874d9e76b0e8 Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/tests/__pycache__/test_pytest.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/tests/__pycache__/test_runtests_pytest.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/tests/__pycache__/test_runtests_pytest.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4c718c5c0d9e021e9075a34adb9ad2b3563d4683 Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/tests/__pycache__/test_runtests_pytest.cpython-312.pyc differ diff --git 
a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/tests/diagnose_imports.py b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/tests/diagnose_imports.py new file mode 100644 index 0000000000000000000000000000000000000000..a31b66c66690c082800ae36eee37dad6927e0b37 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/tests/diagnose_imports.py @@ -0,0 +1,216 @@ +#!/usr/bin/env python + +""" +Import diagnostics. Run bin/diagnose_imports.py --help for details. +""" + +from __future__ import annotations + +if __name__ == "__main__": + + import sys + import inspect + import builtins + + import optparse + + from os.path import abspath, dirname, join, normpath + this_file = abspath(__file__) + sympy_dir = join(dirname(this_file), '..', '..', '..') + sympy_dir = normpath(sympy_dir) + sys.path.insert(0, sympy_dir) + + option_parser = optparse.OptionParser( + usage= + "Usage: %prog option [options]\n" + "\n" + "Import analysis for imports between SymPy modules.") + option_group = optparse.OptionGroup( + option_parser, + 'Analysis options', + 'Options that define what to do. Exactly one of these must be given.') + option_group.add_option( + '--problems', + help= + 'Print all import problems, that is: ' + 'If an import pulls in a package instead of a module ' + '(e.g. sympy.core instead of sympy.core.add); ' # see ##PACKAGE## + 'if it imports a symbol that is already present; ' # see ##DUPLICATE## + 'if it imports a symbol ' + 'from somewhere other than the defining module.', # see ##ORIGIN## + action='count') + option_group.add_option( + '--origins', + help= + 'For each imported symbol in each module, ' + 'print the module that defined it. ' + '(This is useful for import refactoring.)', + action='count') + option_parser.add_option_group(option_group) + option_group = optparse.OptionGroup( + option_parser, + 'Sort options', + 'These options define the sort order for output lines. ' + 'At most one of these options is allowed. 
' + 'Unsorted output will reflect the order in which imports happened.') + option_group.add_option( + '--by-importer', + help='Sort output lines by name of importing module.', + action='count') + option_group.add_option( + '--by-origin', + help='Sort output lines by name of imported module.', + action='count') + option_parser.add_option_group(option_group) + (options, args) = option_parser.parse_args() + if args: + option_parser.error( + 'Unexpected arguments %s (try %s --help)' % (args, sys.argv[0])) + if options.problems > 1: + option_parser.error('--problems must not be given more than once.') + if options.origins > 1: + option_parser.error('--origins must not be given more than once.') + if options.by_importer > 1: + option_parser.error('--by-importer must not be given more than once.') + if options.by_origin > 1: + option_parser.error('--by-origin must not be given more than once.') + options.problems = options.problems == 1 + options.origins = options.origins == 1 + options.by_importer = options.by_importer == 1 + options.by_origin = options.by_origin == 1 + if not options.problems and not options.origins: + option_parser.error( + 'At least one of --problems and --origins is required') + if options.problems and options.origins: + option_parser.error( + 'At most one of --problems and --origins is allowed') + if options.by_importer and options.by_origin: + option_parser.error( + 'At most one of --by-importer and --by-origin is allowed') + options.by_process = not options.by_importer and not options.by_origin + + builtin_import = builtins.__import__ + + class Definition: + """Information about a symbol's definition.""" + def __init__(self, name, value, definer): + self.name = name + self.value = value + self.definer = definer + def __hash__(self): + return hash(self.name) + def __eq__(self, other): + return self.name == other.name and self.value == other.value + def __ne__(self, other): + return not (self == other) + def __repr__(self): + return 'Definition(%s, ..., %s)' % ( + repr(self.name), repr(self.definer)) + + # Maps each function/variable to name of module to define it + symbol_definers: dict[Definition, str] = {} + + def in_module(a, b): + """Is a the same module as or a submodule of b?""" + return a == b or a != None and b != None and a.startswith(b + '.') + + def relevant(module): + """Is module relevant for import checking? + + Only imports between relevant modules will be checked.""" + return in_module(module, 'sympy') + + sorted_messages = [] + + def msg(msg, *args): + if options.by_process: + print(msg % args) + else: + sorted_messages.append(msg % args) + + def tracking_import(module, globals=globals(), locals=[], fromlist=None, level=-1): + """__import__ wrapper - does not change imports at all, but tracks them. + + Default order is implemented by doing output directly. + All other orders are implemented by collecting output information into + a sorted list that will be emitted after all imports are processed. + + Indirect imports can only occur after the requested symbol has been + imported directly (because the indirect import would not have a module + to pick the symbol up from). + So this code detects indirect imports by checking whether the symbol in + question was already imported. 
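The bookkeeping above hinges on in_module/relevant to restrict the checks to imports between SymPy modules. A standalone recap of that test (written with `is not None` rather than the original `!= None` comparisons):

def in_module(a, b):
    # a is b itself, or a dotted submodule of b
    return a == b or (a is not None and b is not None and a.startswith(b + '.'))

def relevant(module):
    return in_module(module, 'sympy')

print(in_module('sympy.core.add', 'sympy.core'))  # True: submodule
print(in_module('sympy.core', 'sympy.core'))      # True: same module
print(relevant('numpy.fft'))                      # False: outside sympy, ignored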
+ + Keeps the semantics of __import__ unchanged.""" + caller_frame = inspect.getframeinfo(sys._getframe(1)) + importer_filename = caller_frame.filename + importer_module = globals['__name__'] + if importer_filename == caller_frame.filename: + importer_reference = '%s line %s' % ( + importer_filename, str(caller_frame.lineno)) + else: + importer_reference = importer_filename + result = builtin_import(module, globals, locals, fromlist, level) + importee_module = result.__name__ + # We're only interested if importer and importee are in SymPy + if relevant(importer_module) and relevant(importee_module): + for symbol in result.__dict__.iterkeys(): + definition = Definition( + symbol, result.__dict__[symbol], importer_module) + if definition not in symbol_definers: + symbol_definers[definition] = importee_module + if hasattr(result, '__path__'): + ##PACKAGE## + # The existence of __path__ is documented in the tutorial on modules. + # Python 3.3 documents this in http://docs.python.org/3.3/reference/import.html + if options.by_origin: + msg('Error: %s (a package) is imported by %s', + module, importer_reference) + else: + msg('Error: %s contains package import %s', + importer_reference, module) + if fromlist != None: + symbol_list = fromlist + if '*' in symbol_list: + if (importer_filename.endswith(("__init__.py", "__init__.pyc", "__init__.pyo"))): + # We do not check starred imports inside __init__ + # That's the normal "please copy over its imports to my namespace" + symbol_list = [] + else: + symbol_list = result.__dict__.iterkeys() + for symbol in symbol_list: + if symbol not in result.__dict__: + if options.by_origin: + msg('Error: %s.%s is not defined (yet), but %s tries to import it', + importee_module, symbol, importer_reference) + else: + msg('Error: %s tries to import %s.%s, which did not define it (yet)', + importer_reference, importee_module, symbol) + else: + definition = Definition( + symbol, result.__dict__[symbol], importer_module) + symbol_definer = symbol_definers[definition] + if symbol_definer == importee_module: + ##DUPLICATE## + if options.by_origin: + msg('Error: %s.%s is imported again into %s', + importee_module, symbol, importer_reference) + else: + msg('Error: %s imports %s.%s again', + importer_reference, importee_module, symbol) + else: + ##ORIGIN## + if options.by_origin: + msg('Error: %s.%s is imported by %s, which should import %s.%s instead', + importee_module, symbol, importer_reference, symbol_definer, symbol) + else: + msg('Error: %s imports %s.%s but should import %s.%s instead', + importer_reference, importee_module, symbol, symbol_definer, symbol) + return result + + builtins.__import__ = tracking_import + __import__('sympy') + + sorted_messages.sort() + for message in sorted_messages: + print(message) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/tests/test_code_quality.py b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/tests/test_code_quality.py new file mode 100644 index 0000000000000000000000000000000000000000..9a9f363f0b9a802553f8643186b7d858d1ad0694 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/tests/test_code_quality.py @@ -0,0 +1,510 @@ +# coding=utf-8 +from os import walk, sep, pardir +from os.path import split, join, abspath, exists, isfile +from glob import glob +import re +import random +import ast + +from sympy.testing.pytest import raises +from sympy.testing.quality_unicode import _test_this_file_encoding + +# System path separator (usually slash or backslash) to be +# used 
with excluded files, e.g. +# exclude = set([ +# "%(sep)smpmath%(sep)s" % sepd, +# ]) +sepd = {"sep": sep} + +# path and sympy_path +SYMPY_PATH = abspath(join(split(__file__)[0], pardir, pardir)) # go to sympy/ +assert exists(SYMPY_PATH) + +TOP_PATH = abspath(join(SYMPY_PATH, pardir)) +BIN_PATH = join(TOP_PATH, "bin") +EXAMPLES_PATH = join(TOP_PATH, "examples") + +# Error messages +message_space = "File contains trailing whitespace: %s, line %s." +message_implicit = "File contains an implicit import: %s, line %s." +message_tabs = "File contains tabs instead of spaces: %s, line %s." +message_carriage = "File contains carriage returns at end of line: %s, line %s" +message_str_raise = "File contains string exception: %s, line %s" +message_gen_raise = "File contains generic exception: %s, line %s" +message_old_raise = "File contains old-style raise statement: %s, line %s, \"%s\"" +message_eof = "File does not end with a newline: %s, line %s" +message_multi_eof = "File ends with more than 1 newline: %s, line %s" +message_test_suite_def = "Function should start with 'test_' or '_': %s, line %s" +message_duplicate_test = "This is a duplicate test function: %s, line %s" +message_self_assignments = "File contains assignments to self/cls: %s, line %s." +message_func_is = "File contains '.func is': %s, line %s." +message_bare_expr = "File contains bare expression: %s, line %s." + +implicit_test_re = re.compile(r'^\s*(>>> )?(\.\.\. )?from .* import .*\*') +str_raise_re = re.compile( + r'^\s*(>>> )?(\.\.\. )?raise(\s+(\'|\")|\s*(\(\s*)+(\'|\"))') +gen_raise_re = re.compile( + r'^\s*(>>> )?(\.\.\. )?raise(\s+Exception|\s*(\(\s*)+Exception)') +old_raise_re = re.compile(r'^\s*(>>> )?(\.\.\. )?raise((\s*\(\s*)|\s+)\w+\s*,') +test_suite_def_re = re.compile(r'^def\s+(?!(_|test))[^(]*\(\s*\)\s*:$') +test_ok_def_re = re.compile(r'^def\s+test_.*:$') +test_file_re = re.compile(r'.*[/\\]test_.*\.py$') +func_is_re = re.compile(r'\.\s*func\s+is') + + +def tab_in_leading(s): + """Returns True if there are tabs in the leading whitespace of a line, + including the whitespace of docstring code samples.""" + n = len(s) - len(s.lstrip()) + if not s[n:n + 3] in ['...', '>>>']: + check = s[:n] + else: + smore = s[n + 3:] + check = s[:n] + smore[:len(smore) - len(smore.lstrip())] + return not (check.expandtabs() == check) + + +def find_self_assignments(s): + """Returns a list of "bad" assignments: if there are instances + of assigning to the first argument of the class method (except + for staticmethod's). + """ + t = [n for n in ast.parse(s).body if isinstance(n, ast.ClassDef)] + + bad = [] + for c in t: + for n in c.body: + if not isinstance(n, ast.FunctionDef): + continue + if any(d.id == 'staticmethod' + for d in n.decorator_list if isinstance(d, ast.Name)): + continue + if n.name == '__new__': + continue + if not n.args.args: + continue + first_arg = n.args.args[0].arg + + for m in ast.walk(n): + if isinstance(m, ast.Assign): + for a in m.targets: + if isinstance(a, ast.Name) and a.id == first_arg: + bad.append(m) + elif (isinstance(a, ast.Tuple) and + any(q.id == first_arg for q in a.elts + if isinstance(q, ast.Name))): + bad.append(m) + + return bad + + +def check_directory_tree(base_path, file_check, exclusions=set(), pattern="*.py"): + """ + Checks all files in the directory tree (with base_path as starting point) + with the file_check function provided, skipping files that contain + any of the strings in the set provided by exclusions. 
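Note that test_suite_def_re above only flags argument-less functions whose names start with neither `test_` nor `_`. A standalone illustration of what it matches:

import re

test_suite_def_re = re.compile(r'^def\s+(?!(_|test))[^(]*\(\s*\)\s*:$')

print(bool(test_suite_def_re.match("def something():")))       # True  -> flagged
print(bool(test_suite_def_re.match("def test_something():")))  # False -> proper test name
print(bool(test_suite_def_re.match("def _helper():")))         # False -> private helper
print(bool(test_suite_def_re.match("def compute(x):")))        # False -> takes arguments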
+ """ + if not base_path: + return + for root, dirs, files in walk(base_path): + check_files(glob(join(root, pattern)), file_check, exclusions) + + +def check_files(files, file_check, exclusions=set(), pattern=None): + """ + Checks all files with the file_check function provided, skipping files + that contain any of the strings in the set provided by exclusions. + """ + if not files: + return + for fname in files: + if not exists(fname) or not isfile(fname): + continue + if any(ex in fname for ex in exclusions): + continue + if pattern is None or re.match(pattern, fname): + file_check(fname) + + +class _Visit(ast.NodeVisitor): + """return the line number corresponding to the + line on which a bare expression appears if it is a binary op + or a comparison that is not in a with block. + + EXAMPLES + ======== + + >>> import ast + >>> class _Visit(ast.NodeVisitor): + ... def visit_Expr(self, node): + ... if isinstance(node.value, (ast.BinOp, ast.Compare)): + ... print(node.lineno) + ... def visit_With(self, node): + ... pass # no checking there + ... + >>> code='''x = 1 # line 1 + ... for i in range(3): + ... x == 2 # <-- 3 + ... if x == 2: + ... x == 3 # <-- 5 + ... x + 1 # <-- 6 + ... x = 1 + ... if x == 1: + ... print(1) + ... while x != 1: + ... x == 1 # <-- 11 + ... with raises(TypeError): + ... c == 1 + ... raise TypeError + ... assert x == 1 + ... ''' + >>> _Visit().visit(ast.parse(code)) + 3 + 5 + 6 + 11 + """ + def visit_Expr(self, node): + if isinstance(node.value, (ast.BinOp, ast.Compare)): + assert None, message_bare_expr % ('', node.lineno) + def visit_With(self, node): + pass + + +BareExpr = _Visit() + + +def line_with_bare_expr(code): + """return None or else 0-based line number of code on which + a bare expression appeared. + """ + tree = ast.parse(code) + try: + BareExpr.visit(tree) + except AssertionError as msg: + assert msg.args + msg = msg.args[0] + assert msg.startswith(message_bare_expr.split(':', 1)[0]) + return int(msg.rsplit(' ', 1)[1].rstrip('.')) # the line number + + +def test_files(): + """ + This test tests all files in SymPy and checks that: + o no lines contains a trailing whitespace + o no lines end with \r\n + o no line uses tabs instead of spaces + o that the file ends with a single newline + o there are no general or string exceptions + o there are no old style raise statements + o name of arg-less test suite functions start with _ or test_ + o no duplicate function names that start with test_ + o no assignments to self variable in class methods + o no lines contain ".func is" except in the test suite + o there is no do-nothing expression like `a == b` or `x + 1` + """ + + def test(fname): + with open(fname, encoding="utf8") as test_file: + test_this_file(fname, test_file) + with open(fname, encoding='utf8') as test_file: + _test_this_file_encoding(fname, test_file) + + def test_this_file(fname, test_file): + idx = None + code = test_file.read() + test_file.seek(0) # restore reader to head + py = fname if sep not in fname else fname.rsplit(sep, 1)[-1] + if py.startswith('test_'): + idx = line_with_bare_expr(code) + if idx is not None: + assert False, message_bare_expr % (fname, idx + 1) + + line = None # to flag the case where there were no lines in file + tests = 0 + test_set = set() + for idx, line in enumerate(test_file): + if test_file_re.match(fname): + if test_suite_def_re.match(line): + assert False, message_test_suite_def % (fname, idx + 1) + if test_ok_def_re.match(line): + tests += 1 + test_set.add(line[3:].split('(')[0].strip()) + if 
len(test_set) != tests: + assert False, message_duplicate_test % (fname, idx + 1) + if line.endswith((" \n", "\t\n")): + assert False, message_space % (fname, idx + 1) + if line.endswith("\r\n"): + assert False, message_carriage % (fname, idx + 1) + if tab_in_leading(line): + assert False, message_tabs % (fname, idx + 1) + if str_raise_re.search(line): + assert False, message_str_raise % (fname, idx + 1) + if gen_raise_re.search(line): + assert False, message_gen_raise % (fname, idx + 1) + if (implicit_test_re.search(line) and + not list(filter(lambda ex: ex in fname, import_exclude))): + assert False, message_implicit % (fname, idx + 1) + if func_is_re.search(line) and not test_file_re.search(fname): + assert False, message_func_is % (fname, idx + 1) + + result = old_raise_re.search(line) + + if result is not None: + assert False, message_old_raise % ( + fname, idx + 1, result.group(2)) + + if line is not None: + if line == '\n' and idx > 0: + assert False, message_multi_eof % (fname, idx + 1) + elif not line.endswith('\n'): + # eof newline check + assert False, message_eof % (fname, idx + 1) + + + # Files to test at top level + top_level_files = [join(TOP_PATH, file) for file in [ + "isympy.py", + "build.py", + "setup.py", + ]] + # Files to exclude from all tests + exclude = { + "%(sep)ssympy%(sep)sparsing%(sep)sautolev%(sep)s_antlr%(sep)sautolevparser.py" % sepd, + "%(sep)ssympy%(sep)sparsing%(sep)sautolev%(sep)s_antlr%(sep)sautolevlexer.py" % sepd, + "%(sep)ssympy%(sep)sparsing%(sep)sautolev%(sep)s_antlr%(sep)sautolevlistener.py" % sepd, + "%(sep)ssympy%(sep)sparsing%(sep)slatex%(sep)s_antlr%(sep)slatexparser.py" % sepd, + "%(sep)ssympy%(sep)sparsing%(sep)slatex%(sep)s_antlr%(sep)slatexlexer.py" % sepd, + } + # Files to exclude from the implicit import test + import_exclude = { + # glob imports are allowed in top-level __init__.py: + "%(sep)ssympy%(sep)s__init__.py" % sepd, + # these __init__.py should be fixed: + # XXX: not really, they use useful import pattern (DRY) + "%(sep)svector%(sep)s__init__.py" % sepd, + "%(sep)smechanics%(sep)s__init__.py" % sepd, + "%(sep)squantum%(sep)s__init__.py" % sepd, + "%(sep)spolys%(sep)s__init__.py" % sepd, + "%(sep)spolys%(sep)sdomains%(sep)s__init__.py" % sepd, + # interactive SymPy executes ``from sympy import *``: + "%(sep)sinteractive%(sep)ssession.py" % sepd, + # isympy.py executes ``from sympy import *``: + "%(sep)sisympy.py" % sepd, + # these two are import timing tests: + "%(sep)sbin%(sep)ssympy_time.py" % sepd, + "%(sep)sbin%(sep)ssympy_time_cache.py" % sepd, + # Taken from Python stdlib: + "%(sep)sparsing%(sep)ssympy_tokenize.py" % sepd, + # this one should be fixed: + "%(sep)splotting%(sep)spygletplot%(sep)s" % sepd, + # False positive in the docstring + "%(sep)sbin%(sep)stest_external_imports.py" % sepd, + "%(sep)sbin%(sep)stest_submodule_imports.py" % sepd, + # These are deprecated stubs that can be removed at some point: + "%(sep)sutilities%(sep)sruntests.py" % sepd, + "%(sep)sutilities%(sep)spytest.py" % sepd, + "%(sep)sutilities%(sep)srandtest.py" % sepd, + "%(sep)sutilities%(sep)stmpfiles.py" % sepd, + "%(sep)sutilities%(sep)squality_unicode.py" % sepd, + } + check_files(top_level_files, test) + check_directory_tree(BIN_PATH, test, {"~", ".pyc", ".sh"}, "*") + check_directory_tree(SYMPY_PATH, test, exclude) + check_directory_tree(EXAMPLES_PATH, test, exclude) + + +def _with_space(c): + # return c with a random amount of leading space + return random.randint(0, 10)*' ' + c + + +def test_raise_statement_regular_expression(): + 
candidates_ok = [ + "some text # raise Exception, 'text'", + "raise ValueError('text') # raise Exception, 'text'", + "raise ValueError('text')", + "raise ValueError", + "raise ValueError('text')", + "raise ValueError('text') #,", + # Talking about an exception in a docstring + ''''"""This function will raise ValueError, except when it doesn't"""''', + "raise (ValueError('text')", + ] + str_candidates_fail = [ + "raise 'exception'", + "raise 'Exception'", + 'raise "exception"', + 'raise "Exception"', + "raise 'ValueError'", + ] + gen_candidates_fail = [ + "raise Exception('text') # raise Exception, 'text'", + "raise Exception('text')", + "raise Exception", + "raise Exception('text')", + "raise Exception('text') #,", + "raise Exception, 'text'", + "raise Exception, 'text' # raise Exception('text')", + "raise Exception, 'text' # raise Exception, 'text'", + ">>> raise Exception, 'text'", + ">>> raise Exception, 'text' # raise Exception('text')", + ">>> raise Exception, 'text' # raise Exception, 'text'", + ] + old_candidates_fail = [ + "raise Exception, 'text'", + "raise Exception, 'text' # raise Exception('text')", + "raise Exception, 'text' # raise Exception, 'text'", + ">>> raise Exception, 'text'", + ">>> raise Exception, 'text' # raise Exception('text')", + ">>> raise Exception, 'text' # raise Exception, 'text'", + "raise ValueError, 'text'", + "raise ValueError, 'text' # raise Exception('text')", + "raise ValueError, 'text' # raise Exception, 'text'", + ">>> raise ValueError, 'text'", + ">>> raise ValueError, 'text' # raise Exception('text')", + ">>> raise ValueError, 'text' # raise Exception, 'text'", + "raise(ValueError,", + "raise (ValueError,", + "raise( ValueError,", + "raise ( ValueError,", + "raise(ValueError ,", + "raise (ValueError ,", + "raise( ValueError ,", + "raise ( ValueError ,", + ] + + for c in candidates_ok: + assert str_raise_re.search(_with_space(c)) is None, c + assert gen_raise_re.search(_with_space(c)) is None, c + assert old_raise_re.search(_with_space(c)) is None, c + for c in str_candidates_fail: + assert str_raise_re.search(_with_space(c)) is not None, c + for c in gen_candidates_fail: + assert gen_raise_re.search(_with_space(c)) is not None, c + for c in old_candidates_fail: + assert old_raise_re.search(_with_space(c)) is not None, c + + +def test_implicit_imports_regular_expression(): + candidates_ok = [ + "from sympy import something", + ">>> from sympy import something", + "from sympy.somewhere import something", + ">>> from sympy.somewhere import something", + "import sympy", + ">>> import sympy", + "import sympy.something.something", + "... import sympy", + "... import sympy.something.something", + "... from sympy import something", + "... from sympy.somewhere import something", + ">> from sympy import *", # To allow 'fake' docstrings + "# from sympy import *", + "some text # from sympy import *", + ] + candidates_fail = [ + "from sympy import *", + ">>> from sympy import *", + "from sympy.somewhere import *", + ">>> from sympy.somewhere import *", + "... from sympy import *", + "... 
from sympy.somewhere import *", + ] + for c in candidates_ok: + assert implicit_test_re.search(_with_space(c)) is None, c + for c in candidates_fail: + assert implicit_test_re.search(_with_space(c)) is not None, c + + +def test_test_suite_defs(): + candidates_ok = [ + " def foo():\n", + "def foo(arg):\n", + "def _foo():\n", + "def test_foo():\n", + ] + candidates_fail = [ + "def foo():\n", + "def foo() :\n", + "def foo( ):\n", + "def foo():\n", + ] + for c in candidates_ok: + assert test_suite_def_re.search(c) is None, c + for c in candidates_fail: + assert test_suite_def_re.search(c) is not None, c + + +def test_test_duplicate_defs(): + candidates_ok = [ + "def foo():\ndef foo():\n", + "def test():\ndef test_():\n", + "def test_():\ndef test__():\n", + ] + candidates_fail = [ + "def test_():\ndef test_ ():\n", + "def test_1():\ndef test_1():\n", + ] + ok = (None, 'check') + def check(file): + tests = 0 + test_set = set() + for idx, line in enumerate(file.splitlines()): + if test_ok_def_re.match(line): + tests += 1 + test_set.add(line[3:].split('(')[0].strip()) + if len(test_set) != tests: + return False, message_duplicate_test % ('check', idx + 1) + return None, 'check' + for c in candidates_ok: + assert check(c) == ok + for c in candidates_fail: + assert check(c) != ok + + +def test_find_self_assignments(): + candidates_ok = [ + "class A(object):\n def foo(self, arg): arg = self\n", + "class A(object):\n def foo(self, arg): self.prop = arg\n", + "class A(object):\n def foo(self, arg): obj, obj2 = arg, self\n", + "class A(object):\n @classmethod\n def bar(cls, arg): arg = cls\n", + "class A(object):\n def foo(var, arg): arg = var\n", + ] + candidates_fail = [ + "class A(object):\n def foo(self, arg): self = arg\n", + "class A(object):\n def foo(self, arg): obj, self = arg, arg\n", + "class A(object):\n def foo(self, arg):\n if arg: self = arg", + "class A(object):\n @classmethod\n def foo(cls, arg): cls = arg\n", + "class A(object):\n def foo(var, arg): var = arg\n", + ] + + for c in candidates_ok: + assert find_self_assignments(c) == [] + for c in candidates_fail: + assert find_self_assignments(c) != [] + + +def test_test_unicode_encoding(): + unicode_whitelist = ['foo'] + unicode_strict_whitelist = ['bar'] + + fname = 'abc' + test_file = ['α'] + raises(AssertionError, lambda: _test_this_file_encoding( + fname, test_file, unicode_whitelist, unicode_strict_whitelist)) + + fname = 'abc' + test_file = ['abc'] + _test_this_file_encoding( + fname, test_file, unicode_whitelist, unicode_strict_whitelist) + + fname = 'foo' + test_file = ['abc'] + raises(AssertionError, lambda: _test_this_file_encoding( + fname, test_file, unicode_whitelist, unicode_strict_whitelist)) + + fname = 'bar' + test_file = ['abc'] + _test_this_file_encoding( + fname, test_file, unicode_whitelist, unicode_strict_whitelist) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/tests/test_deprecated.py b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/tests/test_deprecated.py new file mode 100644 index 0000000000000000000000000000000000000000..696933d96d6232ea869da1002ec9ebee5309724d --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/tests/test_deprecated.py @@ -0,0 +1,5 @@ +from sympy.testing.pytest import warns_deprecated_sympy + +def test_deprecated_testing_randtest(): + with warns_deprecated_sympy(): + import sympy.testing.randtest # noqa:F401 diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/tests/test_module_imports.py 
b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/tests/test_module_imports.py new file mode 100644 index 0000000000000000000000000000000000000000..d16dbaa98156c287c18b46ff07c0ede5d26e069a --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/tests/test_module_imports.py @@ -0,0 +1,42 @@ +""" +Checks that SymPy does not contain indirect imports. + +An indirect import is importing a symbol from a module that itself imported the +symbol from elsewhere. Such a constellation makes it harder to diagnose +inter-module dependencies and import order problems, and is therefore strongly +discouraged. + +(Indirect imports from end-user code is fine and in fact a best practice.) + +Implementation note: Forcing Python into actually unloading already-imported +submodules is a tricky and partly undocumented process. To avoid these issues, +the actual diagnostic code is in bin/diagnose_imports, which is run as a +separate, pristine Python process. +""" + +import subprocess +import sys +from os.path import abspath, dirname, join, normpath +import inspect + +from sympy.testing.pytest import XFAIL + +@XFAIL +def test_module_imports_are_direct(): + my_filename = abspath(inspect.getfile(inspect.currentframe())) + my_dirname = dirname(my_filename) + diagnose_imports_filename = join(my_dirname, 'diagnose_imports.py') + diagnose_imports_filename = normpath(diagnose_imports_filename) + + process = subprocess.Popen( + [ + sys.executable, + normpath(diagnose_imports_filename), + '--problems', + '--by-importer' + ], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + bufsize=-1) + output, _ = process.communicate() + assert output == '', "There are import problems:\n" + output.decode() diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/tests/test_pytest.py b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/tests/test_pytest.py new file mode 100644 index 0000000000000000000000000000000000000000..7080a4ed4602707a3efb4d9025e8e9cbe23a5ef8 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/tests/test_pytest.py @@ -0,0 +1,211 @@ +import warnings + +from sympy.testing.pytest import (raises, warns, ignore_warnings, + warns_deprecated_sympy, Failed) +from sympy.utilities.exceptions import sympy_deprecation_warning + + + +# Test callables + + +def test_expected_exception_is_silent_callable(): + def f(): + raise ValueError() + raises(ValueError, f) + + +# Under pytest raises will raise Failed rather than AssertionError +def test_lack_of_exception_triggers_AssertionError_callable(): + try: + raises(Exception, lambda: 1 + 1) + assert False + except Failed as e: + assert "DID NOT RAISE" in str(e) + + +def test_unexpected_exception_is_passed_through_callable(): + def f(): + raise ValueError("some error message") + try: + raises(TypeError, f) + assert False + except ValueError as e: + assert str(e) == "some error message" + +# Test with statement + +def test_expected_exception_is_silent_with(): + with raises(ValueError): + raise ValueError() + + +def test_lack_of_exception_triggers_AssertionError_with(): + try: + with raises(Exception): + 1 + 1 + assert False + except Failed as e: + assert "DID NOT RAISE" in str(e) + + +def test_unexpected_exception_is_passed_through_with(): + try: + with raises(TypeError): + raise ValueError("some error message") + assert False + except ValueError as e: + assert str(e) == "some error message" + +# Now we can use raises() instead of try/catch +# to test that a specific exception class is raised + + +def 
test_second_argument_should_be_callable_or_string(): + raises(TypeError, lambda: raises("irrelevant", 42)) + + +def test_warns_catches_warning(): + with warnings.catch_warnings(record=True) as w: + with warns(UserWarning): + warnings.warn('this is the warning message') + assert len(w) == 0 + + +def test_warns_raises_without_warning(): + with raises(Failed): + with warns(UserWarning): + pass + + +def test_warns_hides_other_warnings(): + with raises(RuntimeWarning): + with warns(UserWarning): + warnings.warn('this is the warning message', UserWarning) + warnings.warn('this is the other message', RuntimeWarning) + + +def test_warns_continues_after_warning(): + with warnings.catch_warnings(record=True) as w: + finished = False + with warns(UserWarning): + warnings.warn('this is the warning message') + finished = True + assert finished + assert len(w) == 0 + + +def test_warns_many_warnings(): + with warns(UserWarning): + warnings.warn('this is the warning message', UserWarning) + warnings.warn('this is the other warning message', UserWarning) + + +def test_warns_match_matching(): + with warnings.catch_warnings(record=True) as w: + with warns(UserWarning, match='this is the warning message'): + warnings.warn('this is the warning message', UserWarning) + assert len(w) == 0 + + +def test_warns_match_non_matching(): + with warnings.catch_warnings(record=True) as w: + with raises(Failed): + with warns(UserWarning, match='this is the warning message'): + warnings.warn('this is not the expected warning message', UserWarning) + assert len(w) == 0 + +def _warn_sympy_deprecation(stacklevel=3): + sympy_deprecation_warning( + "feature", + active_deprecations_target="active-deprecations", + deprecated_since_version="0.0.0", + stacklevel=stacklevel, + ) + +def test_warns_deprecated_sympy_catches_warning(): + with warnings.catch_warnings(record=True) as w: + with warns_deprecated_sympy(): + _warn_sympy_deprecation() + assert len(w) == 0 + + +def test_warns_deprecated_sympy_raises_without_warning(): + with raises(Failed): + with warns_deprecated_sympy(): + pass + +def test_warns_deprecated_sympy_wrong_stacklevel(): + with raises(Failed): + with warns_deprecated_sympy(): + _warn_sympy_deprecation(stacklevel=1) + +def test_warns_deprecated_sympy_doesnt_hide_other_warnings(): + # Unlike pytest's deprecated_call, we should not hide other warnings. + with raises(RuntimeWarning): + with warns_deprecated_sympy(): + _warn_sympy_deprecation() + warnings.warn('this is the other message', RuntimeWarning) + + +def test_warns_deprecated_sympy_continues_after_warning(): + with warnings.catch_warnings(record=True) as w: + finished = False + with warns_deprecated_sympy(): + _warn_sympy_deprecation() + finished = True + assert finished + assert len(w) == 0 + +def test_ignore_ignores_warning(): + with warnings.catch_warnings(record=True) as w: + with ignore_warnings(UserWarning): + warnings.warn('this is the warning message') + assert len(w) == 0 + + +def test_ignore_does_not_raise_without_warning(): + with warnings.catch_warnings(record=True) as w: + with ignore_warnings(UserWarning): + pass + assert len(w) == 0 + + +def test_ignore_allows_other_warnings(): + with warnings.catch_warnings(record=True) as w: + # This is needed when pytest is run as -Werror + # the setting is reverted at the end of the catch_Warnings block. 
+ warnings.simplefilter("always") + with ignore_warnings(UserWarning): + warnings.warn('this is the warning message', UserWarning) + warnings.warn('this is the other message', RuntimeWarning) + assert len(w) == 1 + assert isinstance(w[0].message, RuntimeWarning) + assert str(w[0].message) == 'this is the other message' + + +def test_ignore_continues_after_warning(): + with warnings.catch_warnings(record=True) as w: + finished = False + with ignore_warnings(UserWarning): + warnings.warn('this is the warning message') + finished = True + assert finished + assert len(w) == 0 + + +def test_ignore_many_warnings(): + with warnings.catch_warnings(record=True) as w: + # This is needed when pytest is run as -Werror + # the setting is reverted at the end of the catch_Warnings block. + warnings.simplefilter("always") + with ignore_warnings(UserWarning): + warnings.warn('this is the warning message', UserWarning) + warnings.warn('this is the other message', RuntimeWarning) + warnings.warn('this is the warning message', UserWarning) + warnings.warn('this is the other message', RuntimeWarning) + warnings.warn('this is the other message', RuntimeWarning) + assert len(w) == 3 + for wi in w: + assert isinstance(wi.message, RuntimeWarning) + assert str(wi.message) == 'this is the other message' diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/tests/test_runtests_pytest.py b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/tests/test_runtests_pytest.py new file mode 100644 index 0000000000000000000000000000000000000000..cd56d831f3618c9dbb8a1dffe63d95a0befc6adf --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/sympy/testing/tests/test_runtests_pytest.py @@ -0,0 +1,171 @@ +import pathlib +from typing import List + +import pytest + +from sympy.testing.runtests_pytest import ( + make_absolute_path, + sympy_dir, + update_args_with_paths, +) + + +class TestMakeAbsolutePath: + + @staticmethod + @pytest.mark.parametrize( + 'partial_path', ['sympy', 'sympy/core', 'sympy/nonexistant_directory'], + ) + def test_valid_partial_path(partial_path: str): + """Paths that start with `sympy` are valid.""" + _ = make_absolute_path(partial_path) + + @staticmethod + @pytest.mark.parametrize( + 'partial_path', ['not_sympy', 'also/not/sympy'], + ) + def test_invalid_partial_path_raises_value_error(partial_path: str): + """A `ValueError` is raises on paths that don't start with `sympy`.""" + with pytest.raises(ValueError): + _ = make_absolute_path(partial_path) + + +class TestUpdateArgsWithPaths: + + @staticmethod + def test_no_paths(): + """If no paths are passed, only `sympy` and `doc/src` are appended. + + `sympy` and `doc/src` are the `testpaths` stated in `pytest.ini`. They + need to be manually added as if any path-related arguments are passed + to `pytest.main` then the settings in `pytest.ini` may be ignored. 
+ + """ + paths = [] + args = update_args_with_paths(paths=paths, keywords=None, args=[]) + expected = [ + str(pathlib.Path(sympy_dir(), 'sympy')), + str(pathlib.Path(sympy_dir(), 'doc/src')), + ] + assert args == expected + + @staticmethod + @pytest.mark.parametrize( + 'path', + ['sympy/core/tests/test_basic.py', '_basic'] + ) + def test_one_file(path: str): + """Single files/paths, full or partial, are matched correctly.""" + args = update_args_with_paths(paths=[path], keywords=None, args=[]) + expected = [ + str(pathlib.Path(sympy_dir(), 'sympy/core/tests/test_basic.py')), + ] + assert args == expected + + @staticmethod + def test_partial_path_from_root(): + """Partial paths from the root directly are matched correctly.""" + args = update_args_with_paths(paths=['sympy/functions'], keywords=None, args=[]) + expected = [str(pathlib.Path(sympy_dir(), 'sympy/functions'))] + assert args == expected + + @staticmethod + def test_multiple_paths_from_root(): + """Multiple paths, partial or full, are matched correctly.""" + paths = ['sympy/core/tests/test_basic.py', 'sympy/functions'] + args = update_args_with_paths(paths=paths, keywords=None, args=[]) + expected = [ + str(pathlib.Path(sympy_dir(), 'sympy/core/tests/test_basic.py')), + str(pathlib.Path(sympy_dir(), 'sympy/functions')), + ] + assert args == expected + + @staticmethod + @pytest.mark.parametrize( + 'paths, expected_paths', + [ + ( + ['/core', '/util'], + [ + 'doc/src/modules/utilities', + 'doc/src/reference/public/utilities', + 'sympy/core', + 'sympy/logic/utilities', + 'sympy/utilities', + ] + ), + ] + ) + def test_multiple_paths_from_non_root(paths: List[str], expected_paths: List[str]): + """Multiple partial paths are matched correctly.""" + args = update_args_with_paths(paths=paths, keywords=None, args=[]) + assert len(args) == len(expected_paths) + for arg, expected in zip(sorted(args), expected_paths): + assert expected in arg + + @staticmethod + @pytest.mark.parametrize( + 'paths', + [ + + [], + ['sympy/physics'], + ['sympy/physics/mechanics'], + ['sympy/physics/mechanics/tests'], + ['sympy/physics/mechanics/tests/test_kane3.py'], + ] + ) + def test_string_as_keyword(paths: List[str]): + """String keywords are matched correctly.""" + keywords = ('bicycle', ) + args = update_args_with_paths(paths=paths, keywords=keywords, args=[]) + expected_args = ['sympy/physics/mechanics/tests/test_kane3.py::test_bicycle'] + assert len(args) == len(expected_args) + for arg, expected in zip(sorted(args), expected_args): + assert expected in arg + + @staticmethod + @pytest.mark.parametrize( + 'paths', + [ + + [], + ['sympy/core'], + ['sympy/core/tests'], + ['sympy/core/tests/test_sympify.py'], + ] + ) + def test_integer_as_keyword(paths: List[str]): + """Integer keywords are matched correctly.""" + keywords = ('3538', ) + args = update_args_with_paths(paths=paths, keywords=keywords, args=[]) + expected_args = ['sympy/core/tests/test_sympify.py::test_issue_3538'] + assert len(args) == len(expected_args) + for arg, expected in zip(sorted(args), expected_args): + assert expected in arg + + @staticmethod + def test_multiple_keywords(): + """Multiple keywords are matched correctly.""" + keywords = ('bicycle', '3538') + args = update_args_with_paths(paths=[], keywords=keywords, args=[]) + expected_args = [ + 'sympy/core/tests/test_sympify.py::test_issue_3538', + 'sympy/physics/mechanics/tests/test_kane3.py::test_bicycle', + ] + assert len(args) == len(expected_args) + for arg, expected in zip(sorted(args), expected_args): + assert expected in arg + 
+ @staticmethod + def test_keyword_match_in_multiple_files(): + """Keywords are matched across multiple files.""" + keywords = ('1130', ) + args = update_args_with_paths(paths=[], keywords=keywords, args=[]) + expected_args = [ + 'sympy/integrals/tests/test_heurisch.py::test_heurisch_symbolic_coeffs_1130', + 'sympy/utilities/tests/test_lambdify.py::test_python_div_zero_issue_11306', + ] + assert len(args) == len(expected_args) + for arg, expected in zip(sorted(args), expected_args): + assert expected in arg diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fe5c2b932981b98e56fe31ed1acc663069baefad Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/__pycache__/__init__.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/aimv2/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/aimv2/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c013363996040eb995d7924b20e8f86d6b318a83 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/aimv2/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_aimv2 import * + from .modeling_aimv2 import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/aimv2/configuration_aimv2.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/aimv2/configuration_aimv2.py new file mode 100644 index 0000000000000000000000000000000000000000..adab18c7447a803117cb0d89abdf38b4dc96167f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/aimv2/configuration_aimv2.py @@ -0,0 +1,284 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/aimv2/modular_aimv2.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_aimv2.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2025 Apple Inc. and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class Aimv2VisionConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Aimv2VisionModel`]. It is used to instantiate a + AIMv2 vision encoder according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the vision encoder of the AIMv2 + [apple/aimv2-large-patch14-224](https://huggingface.co/apple/aimv2-large-patch14-224) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + hidden_size (`int`, *optional*, defaults to 1024): + Dimensionality of the encoder layers and the pooler layer. + intermediate_size (`int`, *optional*, defaults to 2816): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + num_hidden_layers (`int`, *optional*, defaults to 24): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer encoder. + num_channels (`int`, *optional*, defaults to 3): + Number of channels in the input images. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 14): + The size (resolution) of each patch. + rms_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the rms normalization layers. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + qkv_bias (`bool`, *optional*, defaults to `False`): + Whether to add a bias to the queries, keys and values. + mlp_bias (`bool`, *optional*, defaults to `False`): + Whether to add a bias to the Linear layers or Not. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the for initializing all weight matrices. + use_head (`str`, *optional*, defaults to `True`): + Whether to use Attention Pooling Head or Not. + is_native (`str`, *optional*, defaults to `False`): + Whether to use ckpt trained for image native resolution or not. 
+ Example: + + ```python + >>> from transformers import SiglipVisionConfig, SiglipVisionModel + + >>> # Initializing a Aimv2VisionConfig with apple/aimv2-large-patch14-224 style configuration + >>> configuration = Aimv2VisionConfig() + + >>> # Initializing a Aimv2VisionModel (with random weights) from the apple/aimv2-large-patch14-224 style configuration + >>> model = Aimv2VisionModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "aimv2_vision_model" + base_config_key = "vision_config" + + def __init__( + self, + hidden_size: int = 1024, + intermediate_size: int = 2816, + num_hidden_layers: int = 24, + num_attention_heads: int = 8, + num_channels: int = 3, + image_size: int = 224, + patch_size: int = 14, + rms_norm_eps: float = 1e-5, + attention_dropout: float = 0.0, + qkv_bias: bool = False, + mlp_bias: bool = False, + hidden_act: str = "silu", + initializer_range: float = 0.02, + use_head: bool = True, + is_native: bool = False, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_channels = num_channels + self.patch_size = patch_size + self.image_size = image_size + self.attention_dropout = attention_dropout + self.hidden_act = hidden_act + + self.use_head = use_head + self.initializer_range = initializer_range + self.mlp_bias = mlp_bias + self.qkv_bias = qkv_bias + self.rms_norm_eps = rms_norm_eps + self.is_native = is_native + + +class Aimv2TextConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Aimv2TextModel`]. It is used to instantiate a + AIMv2 text encoder according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the text encoder of the AIMv2 + [apple/aimv2-large-patch14-224-lit](https://huggingface.co/apple/aimv2-large-patch14-224-lit) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 49408): + Vocabulary size of the AIMv2 text model. Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling [`Aimv2Model`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + intermediate_size (`int`, *optional*, defaults to 2048): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 6): + Number of attention heads for each attention layer in the Transformer encoder. + rms_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the rms normalization layers. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + qkv_bias (`bool`, *optional*, defaults to `False`): + Whether to add a bias to the queries, keys and values. + mlp_bias (`bool`, *optional*, defaults to `False`): + Whether to add a bias to the Linear layers or Not. 
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported. + pad_token_id (`int`, *optional*, defaults to 1): + The id of the padding token in the vocabulary. + bos_token_id (`int`, *optional*, defaults to 49406): + The id of the beginning-of-sequence token in the vocabulary. + eos_token_id (`int`, *optional*, defaults to 49407): + The id of the end-of-sequence token in the vocabulary. + max_position_embeddings (`int`, *optional*, defaults to 77): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the for initializing all weight matrices. + """ + + model_type = "aimv2_text_model" + base_config_key = "text_config" + + def __init__( + self, + vocab_size: int = 49408, + hidden_size: int = 768, + intermediate_size: int = 2048, + num_hidden_layers: int = 12, + num_attention_heads: int = 6, + rms_norm_eps: float = 1e-5, + attention_dropout: float = 0.0, + qkv_bias: bool = False, + mlp_bias: bool = False, + hidden_act: str = "silu", + pad_token_id: Optional[int] = None, + bos_token_id: Optional[int] = None, + eos_token_id: int = 49407, + max_position_embeddings: int = 77, + initializer_range: bool = 0.02, + **kwargs, + ): + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.max_position_embeddings = max_position_embeddings + self.hidden_act = hidden_act + self.attention_dropout = attention_dropout + + self.initializer_range = initializer_range + self.mlp_bias = mlp_bias + self.qkv_bias = qkv_bias + self.rms_norm_eps = rms_norm_eps + + +class Aimv2Config(PretrainedConfig): + r""" + [`Aimv2Config`] is the configuration class to store the configuration of a [`Aimv2Model`]. It is used to + instantiate a AIMv2 model according to the specified arguments, defining the text model and vision model configs. + Instantiating a configuration with the defaults will yield a similar configuration to that of the AIMv2 + [apple/aimv2-large-patch14-224-lit](https://huggingface.co/apple/aimv2-large-patch14-224-lit) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + text_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`Aimv2TextConfig`]. + vision_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`Aimv2VisionConfig`]. + projection_dim (`int`, *optional*, defaults to 512): + Dimensionality of text and vision projection layers. + logit_scale_init_value (`float`, *optional*, defaults to 2.6592): + The initial value of the *logit_scale* parameter. + kwargs (*optional*): + Dictionary of keyword arguments. 
+ + Example: + + ```python + >>> from transformers import Aimv2Config, Aimv2Model + + >>> # Initializing a Aimv2Config with apple/aimv2-large-patch14-224-lit style configuration + >>> configuration = Aimv2Config() + + >>> # Initializing a Aimv2Model (with random weights) from the apple/aimv2-large-patch14-224-lit style configuration + >>> model = Aimv2Model(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + + >>> # We can also initialize a Aimv2Config from a Aimv2TextConfig and a Aimv2VisionConfig + >>> from transformers import Aimv2TextConfig, Aimv2VisionConfig + + >>> # Initializing a AIMv2Text and AIMv2Vision configuration + >>> config_text = Aimv2TextConfig() + >>> config_vision = Aimv2VisionConfig() + + >>> config = Aimv2Config(text_config=config_text, vision_config=config_vision) + ```""" + + model_type = "aimv2" + sub_configs = {"text_config": Aimv2TextConfig, "vision_config": Aimv2VisionConfig} + + def __init__( + self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs + ): + super().__init__(**kwargs) + + if text_config is None: + text_config = {} + logger.info("`text_config` is `None`. Initializing the `Aimv2TextConfig` with default values.") + + if vision_config is None: + vision_config = {} + logger.info("`vision_config` is `None`. initializing the `Aimv2VisionConfig` with default values.") + + self.text_config = Aimv2TextConfig(**text_config) + self.vision_config = Aimv2VisionConfig(**vision_config) + self.projection_dim = projection_dim + self.logit_scale_init_value = logit_scale_init_value + self.max_logit_scale = 100.0 + + +__all__ = ["Aimv2Config", "Aimv2VisionConfig", "Aimv2TextConfig"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/aimv2/modeling_aimv2.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/aimv2/modeling_aimv2.py new file mode 100644 index 0000000000000000000000000000000000000000..c29270b5687db779ecffa3bc90fb9955b453ac05 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/aimv2/modeling_aimv2.py @@ -0,0 +1,744 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/aimv2/modular_aimv2.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_aimv2.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2025 Apple Inc. and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
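Before the modeling code, a minimal orientation sketch for the composite configuration defined in `configuration_aimv2.py` above. It is an editorial example, not upstream documentation: it relies only on the defaults and attributes shown in that file and assumes the `Aimv2*Config` classes are importable from `transformers` as in the docstring examples.

```python
from transformers import Aimv2Config

# Default construction populates both sub-configs with the documented defaults.
config = Aimv2Config(projection_dim=512)

print(config.vision_config.hidden_size)   # 1024 (Aimv2VisionConfig default)
print(config.text_config.vocab_size)     # 49408 (Aimv2TextConfig default)
print(config.logit_scale_init_value)     # 2.6592
print(config.max_logit_scale)            # 100.0, set in Aimv2Config.__init__
```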
+ + +import math +from dataclasses import dataclass +from typing import Any, Callable, Optional + +import torch +import torch.nn.functional as F +from torch import nn + +from ...activations import ACT2FN +from ...integrations import use_kernel_forward_from_hub +from ...masking_utils import create_causal_mask +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...processing_utils import Unpack +from ...utils import ModelOutput, TransformersKwargs, auto_docstring, can_return_tuple, filter_out_non_signature_kwargs +from ...utils.deprecation import deprecate_kwarg +from ...utils.generic import check_model_inputs +from .configuration_aimv2 import Aimv2Config, Aimv2TextConfig, Aimv2VisionConfig + + +@dataclass +@auto_docstring +class Aimv2Output(ModelOutput): + r""" + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`): + Contrastive loss for image-text similarity. + logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`): + The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text + similarity scores. + logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`): + The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image + similarity scores. + text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`): + The text embeddings obtained by applying the projection layer to the pooled output of [`Aimv2TextModel`]. + image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`): + The image embeddings obtained by applying the projection layer to the pooled output of [`Aimv2VisionModel`]. + text_model_output (`BaseModelOutputWithPooling`): + The output of the [`Aimv2TextModel`]. + vision_model_output (`BaseModelOutputWithPooling`): + The output of the [`Aimv2VisionModel`]. 
+ """ + + loss: Optional[torch.FloatTensor] = None + logits_per_image: Optional[torch.FloatTensor] = None + logits_per_text: Optional[torch.FloatTensor] = None + text_embeds: Optional[torch.FloatTensor] = None + image_embeds: Optional[torch.FloatTensor] = None + text_model_output: BaseModelOutputWithPooling = None + vision_model_output: BaseModelOutputWithPooling = None + + def to_tuple(self) -> tuple[Any]: + return tuple( + self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple() + for k in self.keys() + ) + + +@use_kernel_forward_from_hub("RMSNorm") +class Aimv2RMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + Aimv2RMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + def extra_repr(self): + return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" + + +class Aimv2MLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + return down_proj + + +class Aimv2VisionEmbeddings(nn.Module): + def __init__(self, config: Aimv2VisionConfig): + super().__init__() + self.config = config + self.patch_size = config.patch_size + self.patch_embed = nn.Conv2d( + config.num_channels, config.hidden_size, kernel_size=config.patch_size, stride=config.patch_size + ) + self.rms_norm = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps) + + num_patches = (config.image_size // config.patch_size) ** 2 + if not self.config.is_native: + self.position_embedding = nn.Embedding(num_patches, config.hidden_size) + self.register_buffer("position_ids", torch.arange(num_patches).expand((1, -1)), persistent=False) + + @staticmethod + def build_2d_sincos_position_embedding( + height, width, embed_dim=256, temperature=10000.0, device="cpu", dtype=torch.float32 + ) -> torch.Tensor: + grid_w = torch.arange(int(width), dtype=dtype, device=device) + grid_h = torch.arange(int(height), dtype=dtype, device=device) + grid_h, grid_w = torch.meshgrid(grid_w, grid_h, indexing="xy") + + pos_dim = embed_dim // 4 + omega = torch.arange(pos_dim, dtype=dtype, device=device) / pos_dim + omega = 1.0 / (temperature**omega) + + out_h = grid_h.flatten()[..., None] @ omega[None, :] + out_w = grid_w.flatten()[..., None] @ omega[None, :] + + return torch.concat([out_h.sin(), out_h.cos(), out_w.sin(), out_w.cos()], dim=1)[None, :, :] + + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + _, _, height, width = pixel_values.size() + hidden_states = self.patch_embed(pixel_values).flatten(2).transpose(1, 2) + hidden_states = self.rms_norm(hidden_states) + + if self.config.is_native: + pos_embed = 
self.build_2d_sincos_position_embedding( + height // self.patch_size, + width // self.patch_size, + embed_dim=self.config.hidden_size, + device=hidden_states.device, + dtype=hidden_states.dtype, + ) + else: + pos_embed = self.position_embedding(self.position_ids) + + hidden_states = hidden_states + pos_embed + return hidden_states + + +class Aimv2TextEmbeddings(nn.Module): + def __init__(self, config: Aimv2TextConfig): + super().__init__() + embed_dim = config.hidden_size + + self.token_embedding = nn.Embedding(config.vocab_size, embed_dim) + self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + ) -> torch.Tensor: + seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2] + max_position_embedding = self.position_embedding.weight.shape[0] + + if seq_length > max_position_embedding: + raise ValueError( + f"Sequence length must be less than max_position_embeddings (got `sequence length`: " + f"{seq_length} and max_position_embeddings: {max_position_embedding}" + ) + + if position_ids is None: + position_ids = self.position_ids[:, :seq_length] + + if inputs_embeds is None: + inputs_embeds = self.token_embedding(input_ids) + + position_embeddings = self.position_embedding(position_ids) + embeddings = inputs_embeds + position_embeddings + + return embeddings + + +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + dropout: float = 0.0, + **kwargs, +): + attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling + if attention_mask is not None: + attn_weights = attn_weights + attention_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + + attn_output = torch.matmul(attn_weights, value) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +class Aimv2Attention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." 
+ ) + self.scale = self.head_dim**-0.5 + self.dropout = config.attention_dropout + self.is_causal = False + self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias) + self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias) + self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias) + self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + **kwargs, + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + """Input shape: Batch x Time x Channel""" + + batch_size, seq_length, embed_dim = hidden_states.shape + + queries = self.q_proj(hidden_states) + keys = self.k_proj(hidden_states) + values = self.v_proj(hidden_states) + + queries = queries.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2) + keys = keys.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2) + values = values.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + queries, + keys, + values, + attention_mask, + is_causal=self.is_causal, + scaling=self.scale, + dropout=0.0 if not self.training else self.dropout, + ) + + attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous() + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights + + +class Aimv2EncoderLayer(GradientCheckpointingLayer): + def __init__(self, config: Aimv2VisionConfig): + super().__init__() + self.attention = Aimv2Attention(config) + self.ffn = Aimv2MLP(config) + self.rms_norm1 = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps) + self.rms_norm2 = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> torch.Tensor: + norm_hidden_states = self.rms_norm1(hidden_states) + attn_output, _ = self.attention(hidden_states=norm_hidden_states, attention_mask=attention_mask, **kwargs) + + hidden_states = hidden_states + attn_output + norm_hidden_states = self.rms_norm2(hidden_states) + mlp_output = self.ffn(norm_hidden_states) + + hidden_states = hidden_states + mlp_output + return hidden_states + + +class Aimv2Encoder(nn.Module): + """ + Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a + [`Aimv2EncoderLayer`]. 
+ + Args: + config: Aimv2Config + """ + + def __init__(self, config: Aimv2Config): + super().__init__() + self.config = config + self.layers = nn.ModuleList([Aimv2EncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + # Ignore copy + @auto_docstring + def forward( + self, + inputs_embeds, + attention_mask: Optional[torch.Tensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> BaseModelOutput: + hidden_states = inputs_embeds + for encoder_layer in self.layers: + hidden_states = encoder_layer( + hidden_states, + attention_mask, + **kwargs, + ) + + return BaseModelOutput(last_hidden_state=hidden_states) + + +class Aimv2AttentionPoolingHead(nn.Module): + def __init__(self, config: Aimv2VisionConfig): + super().__init__() + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + + self.k_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.qkv_bias) + self.v_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.qkv_bias) + + self.cls_token = nn.Parameter(torch.zeros(1, 1, self.hidden_size)) + self.output_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=True) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + batch_size, seq_len, hidden_dim = hidden_states.shape + + cls_token = self.cls_token.expand(batch_size, -1, -1) + + key = self.k_proj(hidden_states).reshape(batch_size, seq_len, self.num_heads, hidden_dim // self.num_heads) + value = self.v_proj(hidden_states).reshape(batch_size, seq_len, self.num_heads, hidden_dim // self.num_heads) + query = cls_token.reshape(batch_size, 1, self.num_heads, hidden_dim // self.num_heads) + + key = key.permute(0, 2, 1, 3) + value = value.permute(0, 2, 1, 3) + query = query.permute(0, 2, 1, 3) + + attn_output = F.scaled_dot_product_attention(query, key, value) + + attn_output = attn_output.transpose(1, 2).reshape(batch_size, 1, hidden_dim) + attn_output = attn_output.mean(dim=1) + + output = self.output_proj(attn_output) + return output + + +@auto_docstring +class Aimv2PreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. The model is only intended for inference and doesn't support finetuning. + """ + + config: Aimv2Config + base_model_prefix = "aimv2" + supports_gradient_checkpointing = True + _no_split_modules = [ + "Aimv2EncoderLayer", + "Aimv2AttentionPoolingHead", + "Aimv2VisionEmbeddings", + "Aimv2TextEmbeddings", + ] + _supports_sdpa = True + _supports_flash_attn = True + _supports_flex_attn = True + + def _init_weights(self, module): + super()._init_weights(module) + if hasattr(module, "logit_scale"): + if isinstance(module.logit_scale, nn.Parameter): + module.logit_scale.data.fill_(math.log(1 / 0.07)) + elif isinstance(module, Aimv2AttentionPoolingHead): + module.cls_token.data.normal_(mean=0.0, std=self.config.initializer_range) + + +@auto_docstring( + custom_intro=""" + The Vision model from AIMv2 without any head or projection on top. 
+ """ +) +class Aimv2VisionModel(Aimv2PreTrainedModel): + config: Aimv2VisionConfig + main_input_name = "pixel_values" + _can_record_outputs = { + "hidden_states": Aimv2EncoderLayer, + "attentions": Aimv2Attention, + } + + def __init__(self, config: Aimv2VisionConfig): + super().__init__(config) + self.config = config + self.embeddings = Aimv2VisionEmbeddings(config) + self.encoder = Aimv2Encoder(config) + # The only change from SiglipVisionTransformer is, layernorm -> rms_norm. + self.rms_norm = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps) + + self.use_head = config.use_head + if self.use_head: + self.head = Aimv2AttentionPoolingHead(config) + + self.post_init() + + def get_input_embeddings(self) -> nn.Module: + return self.embeddings.patch_embed + + @deprecate_kwarg("attention_mask", version="v4.58.0") + @check_model_inputs(tie_last_hidden_states=False) + @auto_docstring + def forward( + self, + pixel_values, + attention_mask: Optional[torch.Tensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> BaseModelOutputWithPooling: + r""" + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, Siglip2VisionModel + + >>> model = Aimv2VisionModel.from_pretrained("apple/aimv2-large-patch14-native") + >>> processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-native") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + >>> pooled_output = outputs.pooler_output # pooled features + ```""" + hidden_states = self.embeddings(pixel_values) + + encoder_outputs: BaseModelOutput = self.encoder( + inputs_embeds=hidden_states, + **kwargs, + ) + + last_hidden_state = encoder_outputs.last_hidden_state + last_hidden_state = self.rms_norm(last_hidden_state) + + pooler_output = self.head(last_hidden_state) if self.use_head else None + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooler_output, + ) + + +@auto_docstring( + custom_intro=""" + The text model from AIMv2 without any head or projection on top. 
+ """ +) +class Aimv2TextModel(Aimv2PreTrainedModel): + main_input_name = "input_ids" + + _can_record_outputs = { + "hidden_states": Aimv2EncoderLayer, + "attentions": Aimv2Attention, + } + + def __init__(self, config: Aimv2TextConfig): + super().__init__(config) + self.config = config + self.embeddings = Aimv2TextEmbeddings(config) + self.encoder = Aimv2Encoder(config) + self.rms_norm = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps) + + self.eos_token_id = config.eos_token_id + + self.post_init() + + def get_input_embeddings(self) -> nn.Module: + return self.embeddings.token_embedding + + def set_input_embeddings(self, value): + self.embeddings.token_embedding = value + + @check_model_inputs(tie_last_hidden_states=False) + @auto_docstring + def forward( + self, + input_ids, + attention_mask: Optional[torch.Tensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> BaseModelOutputWithPooling: + hidden_states = self.embeddings(input_ids) + batch_size, seq_len, _ = hidden_states.shape + + cache_position = torch.arange(seq_len, dtype=torch.long, device=hidden_states.device) + position_ids = cache_position.unsqueeze(0).expand(batch_size, -1) + if attention_mask is not None: + attention_mask = create_causal_mask( + config=self.config, + input_embeds=hidden_states, + position_ids=position_ids, + attention_mask=attention_mask, + cache_position=cache_position, + past_key_values=None, + ) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + attention_mask=attention_mask, + **kwargs, + ) + + last_hidden_state = encoder_outputs.last_hidden_state + last_hidden_state = self.rms_norm(last_hidden_state) + + # Get pooled output + pooled_output = last_hidden_state[ + torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device), + (input_ids.to(dtype=torch.int, device=last_hidden_state.device) == self.eos_token_id).int().argmax(dim=-1), + ] + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + ) + + +def _get_vector_norm(tensor: torch.Tensor) -> torch.Tensor: + """ + This method is equivalent to tensor.norm(p=2, dim=-1, keepdim=True) and used to make + model `executorch` exportable. 
See issue https://github.com/pytorch/executorch/issues/3566 + """ + square_tensor = torch.pow(tensor, 2) + sum_tensor = torch.sum(square_tensor, dim=-1, keepdim=True) + normed_tensor = torch.pow(sum_tensor, 0.5) + return normed_tensor + + +@auto_docstring +class Aimv2Model(Aimv2PreTrainedModel): + config: Aimv2Config + _no_split_modules = ["Aimv2TextEmbeddings", "Aimv2EncoderLayer", "Aimv2VisionEmbeddings"] + _supports_flash_attn = True + + def __init__(self, config: Aimv2Config): + super().__init__(config) + + self.projection_dim = config.projection_dim + self.vision_embed_dim = config.vision_config.hidden_size + self.text_embed_dim = config.text_config.hidden_size + + self.vision_model = Aimv2VisionModel._from_config(config.vision_config) + self.text_model = Aimv2TextModel._from_config(config.text_config) + + self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False) + self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False) + + self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value)) + self.max_log_logit_scale = math.log(config.max_logit_scale) + + self.post_init() + + @filter_out_non_signature_kwargs() + @auto_docstring + def get_text_features( + self, + input_ids: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + ) -> torch.FloatTensor: + r""" + Returns: + text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by + applying the projection layer to the pooled output of [`Aimv2TextModel`]. + + Examples: + + ```python + >>> import torch + >>> from transformers import AutoTokenizer, Aimv2Model + + >>> model = Aimv2Model.from_pretrained("openai/aimv2-vit-base-patch32") + >>> tokenizer = AutoTokenizer.from_pretrained("openai/aimv2-vit-base-patch32") + + >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") + + >>> with torch.inference_mode(): + ... text_features = model.get_text_features(**inputs) + ```""" + text_outputs: BaseModelOutputWithPooling = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + ) + pooled_output = text_outputs.pooler_output + text_features = self.text_projection(pooled_output) + + return text_features + + @filter_out_non_signature_kwargs() + @auto_docstring + def get_image_features( + self, + pixel_values: torch.FloatTensor, + interpolate_pos_encoding: bool = False, + ) -> torch.FloatTensor: + r""" + Returns: + image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by + applying the projection layer to the pooled output of [`Aimv2VisionModel`]. + + Examples: + + ```python + >>> import torch + >>> from transformers import AutoProcessor, Aimv2Model + >>> from transformers.image_utils import load_image + + >>> model = Aimv2Model.from_pretrained("openai/aimv2-vit-base-patch32") + >>> processor = AutoProcessor.from_pretrained("openai/aimv2-vit-base-patch32") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = load_image(url) + + >>> inputs = processor(images=image, return_tensors="pt") + + >>> with torch.inference_mode(): + ... 
image_features = model.get_image_features(**inputs) + ```""" + vision_outputs: BaseModelOutputWithPooling = self.vision_model( + pixel_values=pixel_values, + interpolate_pos_encoding=interpolate_pos_encoding, + ) + pooled_output = vision_outputs.pooler_output + image_features = self.visual_projection(pooled_output) + + return image_features + + @auto_docstring + @can_return_tuple + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> Aimv2Output: + r""" + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, Aimv2Model + + >>> model = Aimv2Model.from_pretrained("apple/aimv2-large-patch14-224-lit") + >>> processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-224-lit") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor( + ... text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True + ... ) + + >>> outputs = model(**inputs) + >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score + >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities + ```""" + vision_outputs: BaseModelOutputWithPooling = self.vision_model( + pixel_values=pixel_values, + **kwargs, + ) + + text_outputs: BaseModelOutputWithPooling = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + **kwargs, + ) + + image_embeds = vision_outputs.pooler_output + image_embeds = self.visual_projection(image_embeds) + + text_embeds = text_outputs.pooler_output + text_embeds = self.text_projection(text_embeds) + + # normalized features + image_embeds = image_embeds / _get_vector_norm(image_embeds) + text_embeds = text_embeds / _get_vector_norm(text_embeds) + + logit_scale = self.logit_scale.clamp(0.0, self.max_log_logit_scale).exp().to(text_embeds.device) + logits_per_text = (logit_scale * text_embeds) @ image_embeds.t() + logits_per_image = logits_per_text.t() + + return Aimv2Output( + logits_per_image=logits_per_image, + logits_per_text=logits_per_text, + text_embeds=text_embeds, + image_embeds=image_embeds, + text_model_output=text_outputs, + vision_model_output=vision_outputs, + ) + + +__all__ = ["Aimv2VisionModel", "Aimv2Model", "Aimv2PreTrainedModel", "Aimv2TextModel"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/aimv2/modular_aimv2.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/aimv2/modular_aimv2.py new file mode 100644 index 0000000000000000000000000000000000000000..45a19d27d041649143dc970737502c5267080d05 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/aimv2/modular_aimv2.py @@ -0,0 +1,707 @@ +# coding=utf-8 +# Copyright 2025 Apple Inc. and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""PyTorch implementation of AIMv2 Model"""
+
+import math
+from typing import Optional
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from ...masking_utils import create_causal_mask
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
+from ...modeling_utils import PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import (
+ TransformersKwargs,
+ auto_docstring,
+ can_return_tuple,
+)
+from ...utils.deprecation import deprecate_kwarg
+from ...utils.generic import check_model_inputs
+from ..clip.modeling_clip import CLIPModel, CLIPTextEmbeddings, _get_vector_norm
+from ..llama.modeling_llama import LlamaMLP, LlamaRMSNorm
+from ..siglip.configuration_siglip import SiglipConfig, SiglipTextConfig, SiglipVisionConfig
+from ..siglip.modeling_siglip import SiglipAttention, SiglipEncoder, SiglipOutput
+
+
+class Aimv2VisionConfig(SiglipVisionConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`Aimv2VisionModel`]. It is used to instantiate an
+ AIMv2 vision encoder according to the specified arguments, defining the model architecture. Instantiating a
+ configuration with the defaults will yield a similar configuration to that of the vision encoder of the AIMv2
+ [apple/aimv2-large-patch14-224](https://huggingface.co/apple/aimv2-large-patch14-224) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ hidden_size (`int`, *optional*, defaults to 1024):
+ Dimensionality of the encoder layers and the pooler layer.
+ intermediate_size (`int`, *optional*, defaults to 2816):
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+ num_hidden_layers (`int`, *optional*, defaults to 24):
+ Number of hidden layers in the Transformer encoder.
+ num_attention_heads (`int`, *optional*, defaults to 8):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ num_channels (`int`, *optional*, defaults to 3):
+ Number of channels in the input images.
+ image_size (`int`, *optional*, defaults to 224):
+ The size (resolution) of each image.
+ patch_size (`int`, *optional*, defaults to 14):
+ The size (resolution) of each patch.
+ rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+ The epsilon used by the rms normalization layers.
+ attention_dropout (`float`, *optional*, defaults to 0.0):
+ The dropout ratio for the attention probabilities.
+ qkv_bias (`bool`, *optional*, defaults to `False`):
+ Whether to add a bias to the queries, keys and values.
+ mlp_bias (`bool`, *optional*, defaults to `False`):
+ Whether to add a bias to the Linear layers or not.
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+ `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ use_head (`bool`, *optional*, defaults to `True`):
+ Whether to use the attention pooling head or not.
+ is_native (`bool`, *optional*, defaults to `False`):
+ Whether to use a checkpoint trained for native image resolution or not.
+ Example: + + ```python + >>> from transformers import SiglipVisionConfig, SiglipVisionModel + + >>> # Initializing a Aimv2VisionConfig with apple/aimv2-large-patch14-224 style configuration + >>> configuration = Aimv2VisionConfig() + + >>> # Initializing a Aimv2VisionModel (with random weights) from the apple/aimv2-large-patch14-224 style configuration + >>> model = Aimv2VisionModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + def __init__( + self, + hidden_size: int = 1024, + intermediate_size: int = 2816, + num_hidden_layers: int = 24, + num_attention_heads: int = 8, + num_channels: int = 3, + image_size: int = 224, + patch_size: int = 14, + rms_norm_eps: float = 1e-5, + attention_dropout: float = 0.0, + qkv_bias: bool = False, + mlp_bias: bool = False, + hidden_act: str = "silu", + initializer_range: float = 0.02, + use_head: bool = True, + is_native: bool = False, + **kwargs, + ): + super().__init__( + hidden_size=hidden_size, + intermediate_size=intermediate_size, + num_hidden_layers=num_hidden_layers, + num_attention_heads=num_attention_heads, + hidden_act=hidden_act, + num_channels=num_channels, + image_size=image_size, + patch_size=patch_size, + qkv_bias=qkv_bias, + **kwargs, + ) + + self.use_head = use_head + self.initializer_range = initializer_range + self.attention_dropout = attention_dropout + self.mlp_bias = mlp_bias + self.qkv_bias = qkv_bias + self.rms_norm_eps = rms_norm_eps + self.is_native = is_native + + del self.layer_norm_eps + + +class Aimv2TextConfig(SiglipTextConfig): + r""" + This is the configuration class to store the configuration of a [`Aimv2TextModel`]. It is used to instantiate a + AIMv2 text encoder according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the text encoder of the AIMv2 + [apple/aimv2-large-patch14-224-lit](https://huggingface.co/apple/aimv2-large-patch14-224-lit) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 49408): + Vocabulary size of the AIMv2 text model. Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling [`Aimv2Model`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + intermediate_size (`int`, *optional*, defaults to 2048): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 6): + Number of attention heads for each attention layer in the Transformer encoder. + rms_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the rms normalization layers. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + qkv_bias (`bool`, *optional*, defaults to `False`): + Whether to add a bias to the queries, keys and values. + mlp_bias (`bool`, *optional*, defaults to `False`): + Whether to add a bias to the Linear layers or Not. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the encoder and pooler. 
If string, `"gelu"`,
+ `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
+ pad_token_id (`int`, *optional*, defaults to 1):
+ The id of the padding token in the vocabulary.
+ bos_token_id (`int`, *optional*, defaults to 49406):
+ The id of the beginning-of-sequence token in the vocabulary.
+ eos_token_id (`int`, *optional*, defaults to 49407):
+ The id of the end-of-sequence token in the vocabulary.
+ max_position_embeddings (`int`, *optional*, defaults to 77):
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
+ just in case (e.g., 512 or 1024 or 2048).
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ """
+
+ def __init__(
+ self,
+ vocab_size: int = 49408,
+ hidden_size: int = 768,
+ intermediate_size: int = 2048,
+ num_hidden_layers: int = 12,
+ num_attention_heads: int = 6,
+ rms_norm_eps: float = 1e-5,
+ attention_dropout: float = 0.0,
+ qkv_bias: bool = False,
+ mlp_bias: bool = False,
+ hidden_act: str = "silu",
+ pad_token_id: Optional[int] = None,
+ bos_token_id: Optional[int] = None,
+ eos_token_id: int = 49407,
+ max_position_embeddings: int = 77,
+ initializer_range: float = 0.02,
+ **kwargs,
+ ):
+ super().__init__(
+ vocab_size=vocab_size,
+ hidden_size=hidden_size,
+ intermediate_size=intermediate_size,
+ num_hidden_layers=num_hidden_layers,
+ num_attention_heads=num_attention_heads,
+ hidden_act=hidden_act,
+ max_position_embeddings=max_position_embeddings,
+ pad_token_id=pad_token_id,
+ bos_token_id=bos_token_id,
+ eos_token_id=eos_token_id,
+ **kwargs,
+ )
+
+ self.initializer_range = initializer_range
+ self.attention_dropout = attention_dropout
+ self.mlp_bias = mlp_bias
+ self.qkv_bias = qkv_bias
+ self.rms_norm_eps = rms_norm_eps
+
+ del self.bos_token_id
+ del self.pad_token_id
+ del self.projection_size
+ del self.layer_norm_eps
+
+
+class Aimv2Config(SiglipConfig):
+ r"""
+ [`Aimv2Config`] is the configuration class to store the configuration of a [`Aimv2Model`]. It is used to
+ instantiate an AIMv2 model according to the specified arguments, defining the text model and vision model configs.
+ Instantiating a configuration with the defaults will yield a similar configuration to that of the AIMv2
+ [apple/aimv2-large-patch14-224-lit](https://huggingface.co/apple/aimv2-large-patch14-224-lit) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ text_config (`dict`, *optional*):
+ Dictionary of configuration options used to initialize [`Aimv2TextConfig`].
+ vision_config (`dict`, *optional*):
+ Dictionary of configuration options used to initialize [`Aimv2VisionConfig`].
+ projection_dim (`int`, *optional*, defaults to 512):
+ Dimensionality of text and vision projection layers.
+ logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
+ The initial value of the *logit_scale* parameter.
+ kwargs (*optional*):
+ Dictionary of keyword arguments.
+ + Example: + + ```python + >>> from transformers import Aimv2Config, Aimv2Model + + >>> # Initializing a Aimv2Config with apple/aimv2-large-patch14-224-lit style configuration + >>> configuration = Aimv2Config() + + >>> # Initializing a Aimv2Model (with random weights) from the apple/aimv2-large-patch14-224-lit style configuration + >>> model = Aimv2Model(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + + >>> # We can also initialize a Aimv2Config from a Aimv2TextConfig and a Aimv2VisionConfig + >>> from transformers import Aimv2TextConfig, Aimv2VisionConfig + + >>> # Initializing a AIMv2Text and AIMv2Vision configuration + >>> config_text = Aimv2TextConfig() + >>> config_vision = Aimv2VisionConfig() + + >>> config = Aimv2Config(text_config=config_text, vision_config=config_vision) + ```""" + + def __init__( + self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs + ): + super().__init__(text_config, vision_config, **kwargs) + self.projection_dim = projection_dim + self.logit_scale_init_value = logit_scale_init_value + self.max_logit_scale = 100.0 + + del self.initializer_factor + + +class Aimv2Output(SiglipOutput): + pass + + +class Aimv2RMSNorm(LlamaRMSNorm): + pass + + +class Aimv2MLP(LlamaMLP): + pass + + +class Aimv2VisionEmbeddings(nn.Module): + def __init__(self, config: Aimv2VisionConfig): + super().__init__() + self.config = config + self.patch_size = config.patch_size + self.patch_embed = nn.Conv2d( + config.num_channels, config.hidden_size, kernel_size=config.patch_size, stride=config.patch_size + ) + self.rms_norm = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps) + + num_patches = (config.image_size // config.patch_size) ** 2 + if not self.config.is_native: + self.position_embedding = nn.Embedding(num_patches, config.hidden_size) + self.register_buffer("position_ids", torch.arange(num_patches).expand((1, -1)), persistent=False) + + @staticmethod + def build_2d_sincos_position_embedding( + height, width, embed_dim=256, temperature=10000.0, device="cpu", dtype=torch.float32 + ) -> torch.Tensor: + grid_w = torch.arange(int(width), dtype=dtype, device=device) + grid_h = torch.arange(int(height), dtype=dtype, device=device) + grid_h, grid_w = torch.meshgrid(grid_w, grid_h, indexing="xy") + + pos_dim = embed_dim // 4 + omega = torch.arange(pos_dim, dtype=dtype, device=device) / pos_dim + omega = 1.0 / (temperature**omega) + + out_h = grid_h.flatten()[..., None] @ omega[None, :] + out_w = grid_w.flatten()[..., None] @ omega[None, :] + + return torch.concat([out_h.sin(), out_h.cos(), out_w.sin(), out_w.cos()], dim=1)[None, :, :] + + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + _, _, height, width = pixel_values.size() + hidden_states = self.patch_embed(pixel_values).flatten(2).transpose(1, 2) + hidden_states = self.rms_norm(hidden_states) + + if self.config.is_native: + pos_embed = self.build_2d_sincos_position_embedding( + height // self.patch_size, + width // self.patch_size, + embed_dim=self.config.hidden_size, + device=hidden_states.device, + dtype=hidden_states.dtype, + ) + else: + pos_embed = self.position_embedding(self.position_ids) + + hidden_states = hidden_states + pos_embed + return hidden_states + + +class Aimv2TextEmbeddings(CLIPTextEmbeddings): + pass + + +class Aimv2Attention(SiglipAttention): + def __init__(self, config): + super().__init__(config) + self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias) + self.v_proj = 
nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias) + self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias) + self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias) + + +class Aimv2EncoderLayer(GradientCheckpointingLayer): + def __init__(self, config: Aimv2VisionConfig): + super().__init__() + self.attention = Aimv2Attention(config) + self.ffn = Aimv2MLP(config) + self.rms_norm1 = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps) + self.rms_norm2 = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> torch.Tensor: + norm_hidden_states = self.rms_norm1(hidden_states) + attn_output, _ = self.attention(hidden_states=norm_hidden_states, attention_mask=attention_mask, **kwargs) + + hidden_states = hidden_states + attn_output + norm_hidden_states = self.rms_norm2(hidden_states) + mlp_output = self.ffn(norm_hidden_states) + + hidden_states = hidden_states + mlp_output + return hidden_states + + +class Aimv2Encoder(SiglipEncoder): + pass + + +class Aimv2AttentionPoolingHead(nn.Module): + def __init__(self, config: Aimv2VisionConfig): + super().__init__() + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + + self.k_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.qkv_bias) + self.v_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.qkv_bias) + + self.cls_token = nn.Parameter(torch.zeros(1, 1, self.hidden_size)) + self.output_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=True) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + batch_size, seq_len, hidden_dim = hidden_states.shape + + cls_token = self.cls_token.expand(batch_size, -1, -1) + + key = self.k_proj(hidden_states).reshape(batch_size, seq_len, self.num_heads, hidden_dim // self.num_heads) + value = self.v_proj(hidden_states).reshape(batch_size, seq_len, self.num_heads, hidden_dim // self.num_heads) + query = cls_token.reshape(batch_size, 1, self.num_heads, hidden_dim // self.num_heads) + + key = key.permute(0, 2, 1, 3) + value = value.permute(0, 2, 1, 3) + query = query.permute(0, 2, 1, 3) + + attn_output = F.scaled_dot_product_attention(query, key, value) + + attn_output = attn_output.transpose(1, 2).reshape(batch_size, 1, hidden_dim) + attn_output = attn_output.mean(dim=1) + + output = self.output_proj(attn_output) + return output + + +@auto_docstring +class Aimv2PreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. The model is only intended for inference and doesn't support finetuning. 
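+
+ Weights are initialized as in `_init_weights` below: any `logit_scale` parameter is filled with
+ `log(1 / 0.07) ≈ 2.659` (the CLIP-style temperature, matching the `logit_scale_init_value` default of 2.6592),
+ and the attention-pooling head's CLS token is drawn from a normal distribution with std `config.initializer_range`.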
+ """ + + config: Aimv2Config + base_model_prefix = "aimv2" + supports_gradient_checkpointing = True + _no_split_modules = [ + "Aimv2EncoderLayer", + "Aimv2AttentionPoolingHead", + "Aimv2VisionEmbeddings", + "Aimv2TextEmbeddings", + ] + _supports_sdpa = True + _supports_flash_attn = True + _supports_flex_attn = True + + def _init_weights(self, module): + super()._init_weights(module) + if hasattr(module, "logit_scale"): + if isinstance(module.logit_scale, nn.Parameter): + module.logit_scale.data.fill_(math.log(1 / 0.07)) + elif isinstance(module, Aimv2AttentionPoolingHead): + module.cls_token.data.normal_(mean=0.0, std=self.config.initializer_range) + + +@auto_docstring( + custom_intro=""" + The Vision model from AIMv2 without any head or projection on top. + """ +) +class Aimv2VisionModel(Aimv2PreTrainedModel): + config: Aimv2VisionConfig + main_input_name = "pixel_values" + _can_record_outputs = { + "hidden_states": Aimv2EncoderLayer, + "attentions": Aimv2Attention, + } + + def __init__(self, config: Aimv2VisionConfig): + super().__init__(config) + self.config = config + self.embeddings = Aimv2VisionEmbeddings(config) + self.encoder = Aimv2Encoder(config) + # The only change from SiglipVisionTransformer is, layernorm -> rms_norm. + self.rms_norm = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps) + + self.use_head = config.use_head + if self.use_head: + self.head = Aimv2AttentionPoolingHead(config) + + self.post_init() + + def get_input_embeddings(self) -> nn.Module: + return self.embeddings.patch_embed + + @deprecate_kwarg("attention_mask", version="v4.58.0") + @check_model_inputs(tie_last_hidden_states=False) + @auto_docstring + def forward( + self, + pixel_values, + attention_mask: Optional[torch.Tensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> BaseModelOutputWithPooling: + r""" + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, Siglip2VisionModel + + >>> model = Aimv2VisionModel.from_pretrained("apple/aimv2-large-patch14-native") + >>> processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-native") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + >>> pooled_output = outputs.pooler_output # pooled features + ```""" + hidden_states = self.embeddings(pixel_values) + + encoder_outputs: BaseModelOutput = self.encoder( + inputs_embeds=hidden_states, + **kwargs, + ) + + last_hidden_state = encoder_outputs.last_hidden_state + last_hidden_state = self.rms_norm(last_hidden_state) + + pooler_output = self.head(last_hidden_state) if self.use_head else None + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooler_output, + ) + + +@auto_docstring( + custom_intro=""" + The text model from AIMv2 without any head or projection on top. 
+ """ +) +class Aimv2TextModel(Aimv2PreTrainedModel): + main_input_name = "input_ids" + + _can_record_outputs = { + "hidden_states": Aimv2EncoderLayer, + "attentions": Aimv2Attention, + } + + def __init__(self, config: Aimv2TextConfig): + super().__init__(config) + self.config = config + self.embeddings = Aimv2TextEmbeddings(config) + self.encoder = Aimv2Encoder(config) + self.rms_norm = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps) + + self.eos_token_id = config.eos_token_id + + self.post_init() + + def get_input_embeddings(self) -> nn.Module: + return self.embeddings.token_embedding + + def set_input_embeddings(self, value): + self.embeddings.token_embedding = value + + @check_model_inputs(tie_last_hidden_states=False) + @auto_docstring + def forward( + self, + input_ids, + attention_mask: Optional[torch.Tensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> BaseModelOutputWithPooling: + hidden_states = self.embeddings(input_ids) + batch_size, seq_len, _ = hidden_states.shape + + cache_position = torch.arange(seq_len, dtype=torch.long, device=hidden_states.device) + position_ids = cache_position.unsqueeze(0).expand(batch_size, -1) + if attention_mask is not None: + attention_mask = create_causal_mask( + config=self.config, + input_embeds=hidden_states, + position_ids=position_ids, + attention_mask=attention_mask, + cache_position=cache_position, + past_key_values=None, + ) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + attention_mask=attention_mask, + **kwargs, + ) + + last_hidden_state = encoder_outputs.last_hidden_state + last_hidden_state = self.rms_norm(last_hidden_state) + + # Get pooled output + pooled_output = last_hidden_state[ + torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device), + (input_ids.to(dtype=torch.int, device=last_hidden_state.device) == self.eos_token_id).int().argmax(dim=-1), + ] + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + ) + + +@auto_docstring +class Aimv2Model(CLIPModel): + _supports_flash_attn = True + + def __init__(self, config: Aimv2Config): + PreTrainedModel.__init__(self, config) + + self.projection_dim = config.projection_dim + self.vision_embed_dim = config.vision_config.hidden_size + self.text_embed_dim = config.text_config.hidden_size + + self.vision_model = Aimv2VisionModel._from_config(config.vision_config) + self.text_model = Aimv2TextModel._from_config(config.text_config) + + self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False) + self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False) + + self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value)) + self.max_log_logit_scale = math.log(config.max_logit_scale) + + self.post_init() + + @auto_docstring + @can_return_tuple + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> Aimv2Output: + r""" + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, Aimv2Model + + >>> model = Aimv2Model.from_pretrained("apple/aimv2-large-patch14-224-lit") + >>> processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-224-lit") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> 
inputs = processor( + ... text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True + ... ) + + >>> outputs = model(**inputs) + >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score + >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities + ```""" + vision_outputs: BaseModelOutputWithPooling = self.vision_model( + pixel_values=pixel_values, + **kwargs, + ) + + text_outputs: BaseModelOutputWithPooling = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + **kwargs, + ) + + image_embeds = vision_outputs.pooler_output + image_embeds = self.visual_projection(image_embeds) + + text_embeds = text_outputs.pooler_output + text_embeds = self.text_projection(text_embeds) + + # normalized features + image_embeds = image_embeds / _get_vector_norm(image_embeds) + text_embeds = text_embeds / _get_vector_norm(text_embeds) + + logit_scale = self.logit_scale.clamp(0.0, self.max_log_logit_scale).exp().to(text_embeds.device) + logits_per_text = (logit_scale * text_embeds) @ image_embeds.t() + logits_per_image = logits_per_text.t() + + return Aimv2Output( + logits_per_image=logits_per_image, + logits_per_text=logits_per_text, + text_embeds=text_embeds, + image_embeds=image_embeds, + text_model_output=text_outputs, + vision_model_output=vision_outputs, + ) + + +__all__ = [ + "Aimv2Config", + "Aimv2VisionConfig", + "Aimv2TextConfig", + "Aimv2VisionModel", + "Aimv2Model", + "Aimv2PreTrainedModel", + "Aimv2TextModel", +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/audio_spectrogram_transformer/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/audio_spectrogram_transformer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..618fceef70d3891f4313958cdadede27d605f12c --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/audio_spectrogram_transformer/__init__.py @@ -0,0 +1,28 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
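+# This package uses the `_LazyModule` pattern below, so the heavy submodules are only imported when one of
+# their attributes is first accessed, for example (illustrative):
+#
+#     from transformers.models.audio_spectrogram_transformer import ASTConfig  # resolved lazily on first access
+#
+# The `TYPE_CHECKING` branch keeps static type checkers and IDEs aware of the re-exported names.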
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+ from .configuration_audio_spectrogram_transformer import *
+ from .feature_extraction_audio_spectrogram_transformer import *
+ from .modeling_audio_spectrogram_transformer import *
+else:
+ import sys
+
+ _file = globals()["__file__"]
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..ecd8f4858fe7add5841fc2e4cdf4ed00bc480a92
--- /dev/null
+++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py
@@ -0,0 +1,131 @@
+# coding=utf-8
+# Copyright 2022 Google AI and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Audio Spectrogram Transformer (AST) model configuration"""
+
+from typing import Any
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class ASTConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of an [`ASTModel`]. It is used to instantiate an AST
+ model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+ defaults will yield a similar configuration to that of the AST
+ [MIT/ast-finetuned-audioset-10-10-0.4593](https://huggingface.co/MIT/ast-finetuned-audioset-10-10-0.4593)
+ architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ hidden_size (`int`, *optional*, defaults to 768):
+ Dimensionality of the encoder layers and the pooler layer.
+ num_hidden_layers (`int`, *optional*, defaults to 12):
+ Number of hidden layers in the Transformer encoder.
+ num_attention_heads (`int`, *optional*, defaults to 12):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ intermediate_size (`int`, *optional*, defaults to 3072):
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+ `"relu"`, `"selu"` and `"gelu_new"` are supported.
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+ attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + patch_size (`int`, *optional*, defaults to 16): + The size (resolution) of each patch. + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether to add a bias to the queries, keys and values. + frequency_stride (`int`, *optional*, defaults to 10): + Frequency stride to use when patchifying the spectrograms. + time_stride (`int`, *optional*, defaults to 10): + Temporal stride to use when patchifying the spectrograms. + max_length (`int`, *optional*, defaults to 1024): + Temporal dimension of the spectrograms. + num_mel_bins (`int`, *optional*, defaults to 128): + Frequency dimension of the spectrograms (number of Mel-frequency bins). + + Example: + + ```python + >>> from transformers import ASTConfig, ASTModel + + >>> # Initializing a AST MIT/ast-finetuned-audioset-10-10-0.4593 style configuration + >>> configuration = ASTConfig() + + >>> # Initializing a model (with random weights) from the MIT/ast-finetuned-audioset-10-10-0.4593 style configuration + >>> model = ASTModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "audio-spectrogram-transformer" + + def __init__( + self, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + initializer_range=0.02, + layer_norm_eps=1e-12, + patch_size=16, + qkv_bias=True, + frequency_stride=10, + time_stride=10, + max_length=1024, + num_mel_bins=128, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.patch_size = patch_size + self.qkv_bias = qkv_bias + self.frequency_stride = frequency_stride + self.time_stride = time_stride + self.max_length = max_length + self.num_mel_bins = num_mel_bins + + # Overwritten from the parent class: AST is not compatible with `generate`, but has a config parameter sharing the + # same name (`max_length`). Sharing the same name triggers checks regarding the config -> generation_config + # generative parameters deprecation cycle, overwriting this function prevents this from happening. 
+ def _get_non_default_generation_parameters(self) -> dict[str, Any]: + return {} + + +__all__ = ["ASTConfig"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..f56c0c3213b7e4a920b271df612e7aaad1934a9f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py @@ -0,0 +1,239 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Feature extractor class for Audio Spectrogram Transformer. +""" + +from typing import Optional, Union + +import numpy as np + +from ...audio_utils import mel_filter_bank, spectrogram, window_function +from ...feature_extraction_sequence_utils import SequenceFeatureExtractor +from ...feature_extraction_utils import BatchFeature +from ...utils import TensorType, is_speech_available, is_torch_available, logging + + +if is_speech_available(): + import torchaudio.compliance.kaldi as ta_kaldi + +if is_torch_available(): + import torch + + +logger = logging.get_logger(__name__) + + +class ASTFeatureExtractor(SequenceFeatureExtractor): + r""" + Constructs a Audio Spectrogram Transformer (AST) feature extractor. + + This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains + most of the main methods. Users should refer to this superclass for more information regarding those methods. + + This class extracts mel-filter bank features from raw speech using TorchAudio if installed or using numpy + otherwise, pads/truncates them to a fixed length and normalizes them using a mean and standard deviation. + + Args: + feature_size (`int`, *optional*, defaults to 1): + The feature dimension of the extracted features. + sampling_rate (`int`, *optional*, defaults to 16000): + The sampling rate at which the audio files should be digitalized expressed in hertz (Hz). + num_mel_bins (`int`, *optional*, defaults to 128): + Number of Mel-frequency bins. + max_length (`int`, *optional*, defaults to 1024): + Maximum length to which to pad/truncate the extracted features. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether or not to normalize the log-Mel features using `mean` and `std`. + mean (`float`, *optional*, defaults to -4.2677393): + The mean value used to normalize the log-Mel features. Uses the AudioSet mean by default. + std (`float`, *optional*, defaults to 4.5689974): + The standard deviation value used to normalize the log-Mel features. Uses the AudioSet standard deviation + by default. 
+ return_attention_mask (`bool`, *optional*, defaults to `False`): + Whether or not [`~ASTFeatureExtractor.__call__`] should return `attention_mask`. + """ + + model_input_names = ["input_values", "attention_mask"] + + def __init__( + self, + feature_size=1, + sampling_rate=16000, + num_mel_bins=128, + max_length=1024, + padding_value=0.0, + do_normalize=True, + mean=-4.2677393, + std=4.5689974, + return_attention_mask=False, + **kwargs, + ): + super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs) + self.num_mel_bins = num_mel_bins + self.max_length = max_length + self.do_normalize = do_normalize + self.mean = mean + self.std = std + self.return_attention_mask = return_attention_mask + + if not is_speech_available(): + mel_filters = mel_filter_bank( + num_frequency_bins=257, + num_mel_filters=self.num_mel_bins, + min_frequency=20, + max_frequency=sampling_rate // 2, + sampling_rate=sampling_rate, + norm=None, + mel_scale="kaldi", + triangularize_in_mel_space=True, + ) + + self.mel_filters = mel_filters + self.window = window_function(400, "hann", periodic=False) + + def _extract_fbank_features( + self, + waveform: np.ndarray, + max_length: int, + ) -> np.ndarray: + """ + Get mel-filter bank features using TorchAudio. Note that TorchAudio requires 16-bit signed integers as inputs + and hence the waveform should not be normalized before feature extraction. + """ + # waveform = waveform * (2**15) # Kaldi compliance: 16-bit signed integers + if is_speech_available(): + waveform = torch.from_numpy(waveform).unsqueeze(0) + fbank = ta_kaldi.fbank( + waveform, + sample_frequency=self.sampling_rate, + window_type="hanning", + num_mel_bins=self.num_mel_bins, + ) + else: + waveform = np.squeeze(waveform) + fbank = spectrogram( + waveform, + self.window, + frame_length=400, + hop_length=160, + fft_length=512, + power=2.0, + center=False, + preemphasis=0.97, + mel_filters=self.mel_filters, + log_mel="log", + mel_floor=1.192092955078125e-07, + remove_dc_offset=True, + ).T + + fbank = torch.from_numpy(fbank) + + n_frames = fbank.shape[0] + difference = max_length - n_frames + + # pad or truncate, depending on difference + if difference > 0: + pad_module = torch.nn.ZeroPad2d((0, 0, 0, difference)) + fbank = pad_module(fbank) + elif difference < 0: + fbank = fbank[0:max_length, :] + + fbank = fbank.numpy() + + return fbank + + def normalize(self, input_values: np.ndarray) -> np.ndarray: + return (input_values - (self.mean)) / (self.std * 2) + + def __call__( + self, + raw_speech: Union[np.ndarray, list[float], list[np.ndarray], list[list[float]]], + sampling_rate: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + **kwargs, + ) -> BatchFeature: + """ + Main method to featurize and prepare for the model one or several sequence(s). + + Args: + raw_speech (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`): + The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float + values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not + stereo, i.e. single float per timestep. + sampling_rate (`int`, *optional*): + The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass + `sampling_rate` at the forward call to prevent silent errors. + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors instead of list of python integers. 
Acceptable values are: + + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return Numpy `np.ndarray` objects. + """ + + if sampling_rate is not None: + if sampling_rate != self.sampling_rate: + raise ValueError( + f"The model corresponding to this feature extractor: {self} was trained using a sampling rate of" + f" {self.sampling_rate}. Please make sure that the provided `raw_speech` input was sampled with" + f" {self.sampling_rate} and not {sampling_rate}." + ) + else: + logger.warning( + f"It is strongly recommended to pass the `sampling_rate` argument to `{self.__class__.__name__}()`. " + "Failing to do so can result in silent errors that might be hard to debug." + ) + + is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1 + if is_batched_numpy and len(raw_speech.shape) > 2: + raise ValueError(f"Only mono-channel audio is supported for input to {self}") + is_batched = is_batched_numpy or ( + isinstance(raw_speech, (list, tuple)) and (isinstance(raw_speech[0], (np.ndarray, tuple, list))) + ) + + if is_batched: + raw_speech = [np.asarray(speech, dtype=np.float32) for speech in raw_speech] + elif not is_batched and not isinstance(raw_speech, np.ndarray): + raw_speech = np.asarray(raw_speech, dtype=np.float32) + elif isinstance(raw_speech, np.ndarray) and raw_speech.dtype is np.dtype(np.float64): + raw_speech = raw_speech.astype(np.float32) + + # always return batch + if not is_batched: + raw_speech = [raw_speech] + + # extract fbank features and pad/truncate to max_length + features = [self._extract_fbank_features(waveform, max_length=self.max_length) for waveform in raw_speech] + + # convert into BatchFeature + padded_inputs = BatchFeature({"input_values": features}) + + # make sure list is in array format + input_values = padded_inputs.get("input_values") + if isinstance(input_values[0], list): + padded_inputs["input_values"] = [np.asarray(feature, dtype=np.float32) for feature in input_values] + + # normalization + if self.do_normalize: + padded_inputs["input_values"] = [self.normalize(feature) for feature in input_values] + + if return_tensors is not None: + padded_inputs = padded_inputs.convert_to_tensors(return_tensors) + + return padded_inputs + + +__all__ = ["ASTFeatureExtractor"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..c445fbb0e36d363b490beb2c9f3b6b28ca8957f8 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py @@ -0,0 +1,481 @@ +# coding=utf-8 +# Copyright 2022 MIT and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
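+# Shape orientation (a worked example with the default ASTConfig only; other configs give other values):
+#   frequency_out_dimension = (num_mel_bins - patch_size) // frequency_stride + 1 = (128 - 16) // 10 + 1 = 12
+#   time_out_dimension      = (max_length - patch_size) // time_stride + 1 = (1024 - 16) // 10 + 1 = 101
+#   num_patches = 12 * 101 = 1212, plus the CLS and distillation tokens -> 1214 position embeddings.
+# See ASTEmbeddings.get_shape below for the corresponding code.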
+"""PyTorch Audio Spectrogram Transformer (AST) model.""" + +from typing import Callable, Optional, Union + +import torch +from torch import nn + +from ...activations import ACT2FN +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, SequenceClassifierOutput +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...processing_utils import Unpack +from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import TransformersKwargs, auto_docstring, logging +from ...utils.generic import can_return_tuple, check_model_inputs +from .configuration_audio_spectrogram_transformer import ASTConfig + + +logger = logging.get_logger(__name__) + + +class ASTEmbeddings(nn.Module): + """ + Construct the CLS token, position and patch embeddings. + """ + + def __init__(self, config: ASTConfig) -> None: + super().__init__() + + self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) + self.distillation_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) + self.patch_embeddings = ASTPatchEmbeddings(config) + + frequency_out_dimension, time_out_dimension = self.get_shape(config) + num_patches = frequency_out_dimension * time_out_dimension + self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 2, config.hidden_size)) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.config = config + + def get_shape(self, config): + # see Karpathy's cs231n blog on how to calculate the output dimensions + # https://cs231n.github.io/convolutional-networks/#conv + frequency_out_dimension = (config.num_mel_bins - config.patch_size) // config.frequency_stride + 1 + time_out_dimension = (config.max_length - config.patch_size) // config.time_stride + 1 + + return frequency_out_dimension, time_out_dimension + + def forward(self, input_values: torch.Tensor) -> torch.Tensor: + batch_size = input_values.shape[0] + embeddings = self.patch_embeddings(input_values) + + cls_tokens = self.cls_token.expand(batch_size, -1, -1) + distillation_tokens = self.distillation_token.expand(batch_size, -1, -1) + embeddings = torch.cat((cls_tokens, distillation_tokens, embeddings), dim=1) + embeddings = embeddings + self.position_embeddings + embeddings = self.dropout(embeddings) + + return embeddings + + +class ASTPatchEmbeddings(nn.Module): + """ + This class turns `input_values` into the initial `hidden_states` (patch embeddings) of shape `(batch_size, + seq_length, hidden_size)` to be consumed by a Transformer. + """ + + def __init__(self, config: ASTConfig): + super().__init__() + + patch_size = config.patch_size + frequency_stride = config.frequency_stride + time_stride = config.time_stride + + self.projection = nn.Conv2d( + 1, config.hidden_size, kernel_size=(patch_size, patch_size), stride=(frequency_stride, time_stride) + ) + + def forward(self, input_values: torch.Tensor) -> torch.Tensor: + input_values = input_values.unsqueeze(1) + input_values = input_values.transpose(2, 3) + embeddings = self.projection(input_values).flatten(2).transpose(1, 2) + return embeddings + + +# Copied from transformers.models.vit.modeling_vit.eager_attention_forward +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + dropout: float = 0.0, + **kwargs, +): + # Take the dot product between "query" and "key" to get the raw attention scores. 
+ attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling + + # Normalize the attention scores to probabilities. + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + + # Mask heads if we want to + if attention_mask is not None: + attn_weights = attn_weights * attention_mask + + attn_output = torch.matmul(attn_weights, value) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +# Copied from transformers.models.vit.modeling_vit.ViTSelfAttention with ViT->AST +class ASTSelfAttention(nn.Module): + def __init__(self, config: ASTConfig): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size {config.hidden_size} is not a multiple of the number of attention " + f"heads {config.num_attention_heads}." + ) + + self.config = config + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.dropout_prob = config.attention_probs_dropout_prob + self.scaling = self.attention_head_size**-0.5 + self.is_causal = False + + self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + + def forward( + self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None + ) -> tuple[torch.Tensor, torch.Tensor]: + batch_size = hidden_states.shape[0] + new_shape = batch_size, -1, self.num_attention_heads, self.attention_head_size + + key_layer = self.key(hidden_states).view(*new_shape).transpose(1, 2) + value_layer = self.value(hidden_states).view(*new_shape).transpose(1, 2) + query_layer = self.query(hidden_states).view(*new_shape).transpose(1, 2) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + context_layer, attention_probs = attention_interface( + self, + query_layer, + key_layer, + value_layer, + head_mask, + is_causal=self.is_causal, + scaling=self.scaling, + dropout=0.0 if not self.training else self.dropout_prob, + ) + + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.reshape(new_context_layer_shape) + + return context_layer, attention_probs + + +# Copied from transformers.models.vit.modeling_vit.ViTSelfOutput with ViT->AST +class ASTSelfOutput(nn.Module): + """ + The residual connection is defined in ASTLayer instead of here (as is the case with other models), due to the + layernorm applied before each block. 
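+ This module therefore only applies a dense projection followed by dropout to the attention output.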
+ """ + + def __init__(self, config: ASTConfig): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + return hidden_states + + +# Copied from transformers.models.vit.modeling_vit.ViTAttention with ViT->AST +class ASTAttention(nn.Module): + def __init__(self, config: ASTConfig): + super().__init__() + self.attention = ASTSelfAttention(config) + self.output = ASTSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads: set[int]): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.attention.query = prune_linear_layer(self.attention.query, index) + self.attention.key = prune_linear_layer(self.attention.key, index) + self.attention.value = prune_linear_layer(self.attention.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads) + self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward(self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None) -> torch.Tensor: + self_attn_output, _ = self.attention(hidden_states, head_mask) + output = self.output(self_attn_output, hidden_states) + return output + + +# Copied from transformers.models.vit.modeling_vit.ViTIntermediate with ViT->AST +class ASTIntermediate(nn.Module): + def __init__(self, config: ASTConfig): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Copied from transformers.models.vit.modeling_vit.ViTOutput with ViT->AST +class ASTOutput(nn.Module): + def __init__(self, config: ASTConfig): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = hidden_states + input_tensor + return hidden_states + + +# Copied from transformers.models.vit.modeling_vit.ViTLayer with ViT->AST,VIT->AST +class ASTLayer(GradientCheckpointingLayer): + """This corresponds to the Block class in the timm implementation.""" + + def __init__(self, config: ASTConfig): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = ASTAttention(config) + self.intermediate = ASTIntermediate(config) + self.output = ASTOutput(config) + self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.layernorm_after = nn.LayerNorm(config.hidden_size, 
eps=config.layer_norm_eps) + + def forward(self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None) -> torch.Tensor: + hidden_states_norm = self.layernorm_before(hidden_states) + attention_output = self.attention(hidden_states_norm, head_mask) + + # first residual connection + hidden_states = attention_output + hidden_states + + # in AST, layernorm is also applied after self-attention + layer_output = self.layernorm_after(hidden_states) + layer_output = self.intermediate(layer_output) + + # second residual connection is done here + layer_output = self.output(layer_output, hidden_states) + + return layer_output + + +# Copied from transformers.models.vit.modeling_vit.ViTEncoder with ViT->AST +class ASTEncoder(nn.Module): + def __init__(self, config: ASTConfig): + super().__init__() + self.config = config + self.layer = nn.ModuleList([ASTLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward(self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None) -> BaseModelOutput: + for i, layer_module in enumerate(self.layer): + layer_head_mask = head_mask[i] if head_mask is not None else None + hidden_states = layer_module(hidden_states, layer_head_mask) + + return BaseModelOutput(last_hidden_state=hidden_states) + + +@auto_docstring +class ASTPreTrainedModel(PreTrainedModel): + config: ASTConfig + base_model_prefix = "audio_spectrogram_transformer" + main_input_name = "input_values" + supports_gradient_checkpointing = True + _supports_sdpa = True + _supports_flash_attn = True + _supports_flex_attn = True + _supports_attention_backend = True + _can_record_outputs = { + "hidden_states": ASTLayer, + "attentions": ASTSelfAttention, + } + + def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None: + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d)): + # Upcast the input in `fp32` and cast it back to desired `dtype` to avoid + # `trunc_normal_cpu` not implemented in `half` issues + module.weight.data = nn.init.trunc_normal_( + module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range + ).to(module.weight.dtype) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, ASTEmbeddings): + module.cls_token.data.zero_() + module.position_embeddings.data.zero_() + module.distillation_token.data.zero_() + + +@auto_docstring +class ASTModel(ASTPreTrainedModel): + def __init__(self, config: ASTConfig) -> None: + super().__init__(config) + self.config = config + + self.embeddings = ASTEmbeddings(config) + self.encoder = ASTEncoder(config) + + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> ASTPatchEmbeddings: + return self.embeddings.patch_embeddings + + def _prune_heads(self, heads_to_prune: dict[int, list[int]]) -> None: + """ + Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @check_model_inputs + @auto_docstring + def forward( + self, + input_values: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> BaseModelOutputWithPooling: + r""" + input_values (`torch.FloatTensor` of shape `(batch_size, max_length, num_mel_bins)`): + Float values mel features extracted from the raw audio waveform. Raw audio waveform can be obtained by + loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a `numpy.ndarray` or a + `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or the soundfile library + (`pip install soundfile`). + To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the + mel features, padding and conversion into a tensor of type `torch.FloatTensor`. + See [`~ASTFeatureExtractor.__call__`] + """ + + if input_values is None: + raise ValueError("You have to specify input_values") + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings(input_values) + + encoder_outputs: BaseModelOutput = self.encoder(embedding_output, head_mask=head_mask) + sequence_output = encoder_outputs.last_hidden_state + sequence_output = self.layernorm(sequence_output) + + pooled_output = (sequence_output[:, 0] + sequence_output[:, 1]) / 2 + + return BaseModelOutputWithPooling(last_hidden_state=sequence_output, pooler_output=pooled_output) + + +class ASTMLPHead(nn.Module): + def __init__(self, config: ASTConfig): + super().__init__() + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dense = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity() + + def forward(self, hidden_state): + hidden_state = self.layernorm(hidden_state) + hidden_state = self.dense(hidden_state) + return hidden_state + + +@auto_docstring( + custom_intro=""" + Audio Spectrogram Transformer model with an audio classification head on top (a linear layer on top of the pooled + output) e.g. for datasets like AudioSet, Speech Commands v2. + """ +) +class ASTForAudioClassification(ASTPreTrainedModel): + def __init__(self, config: ASTConfig) -> None: + super().__init__(config) + + self.num_labels = config.num_labels + self.audio_spectrogram_transformer = ASTModel(config) + + # Classifier head + self.classifier = ASTMLPHead(config) + + # Initialize weights and apply final processing + self.post_init() + + @can_return_tuple + @auto_docstring + def forward( + self, + input_values: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> SequenceClassifierOutput: + r""" + input_values (`torch.FloatTensor` of shape `(batch_size, max_length, num_mel_bins)`): + Float values mel features extracted from the raw audio waveform. 
Raw audio waveform can be obtained by + loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via + the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`). + To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the + mel features, padding and conversion into a tensor of type `torch.FloatTensor`. + See [`~ASTFeatureExtractor.__call__`] + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the audio classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + outputs: BaseModelOutputWithPooling = self.audio_spectrogram_transformer( + input_values, head_mask=head_mask, **kwargs + ) + + pooled_output = outputs.pooler_output + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + loss = self.loss_function(labels, logits, self.config, **kwargs) + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +__all__ = ["ASTForAudioClassification", "ASTModel", "ASTPreTrainedModel"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/aya_vision/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/aya_vision/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f8be47cb228b19f02b87d195747e78c2a87de752 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/aya_vision/__init__.py @@ -0,0 +1,28 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_aya_vision import * + from .modeling_aya_vision import * + from .processing_aya_vision import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/aya_vision/configuration_aya_vision.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/aya_vision/configuration_aya_vision.py new file mode 100644 index 0000000000000000000000000000000000000000..a8c1965ec463f5854656f3b5064b23f0d981cacd --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/aya_vision/configuration_aya_vision.py @@ -0,0 +1,110 @@ +# coding=utf-8 +# Copyright 2025 Cohere team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""AyaVision model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging +from ..auto import CONFIG_MAPPING, AutoConfig + + +logger = logging.get_logger(__name__) + + +class AyaVisionConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`AyaVisionForConditionalGeneration`]. It is used to instantiate an + AyaVision model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of AyaVision. + e.g. [CohereForAI/aya-vision-8b](https://huggingface.co/CohereForAI/aya-vision-8b) + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `SiglipVisionConfig`): + The config object or dictionary of the vision backbone. + text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `Cohere2Config`): + The config object or dictionary of the text backbone. + vision_feature_select_strategy (`str`, *optional*, defaults to `"full"`): + The feature selection strategy used to select the vision feature from the vision backbone. + Can be one of `"default"` or `"full"`. If `"default"`, the CLS token is removed from the vision features. + If `"full"`, the full vision features are used. + vision_feature_layer (`int`, *optional*, defaults to -1): + The index of the layer to select the vision feature. + downsample_factor (`int`, *optional*, defaults to 2): + The downsample factor to apply to the vision features. + adapter_layer_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon value used for layer normalization in the adapter. + image_token_index (`int`, *optional*, defaults to 255036): + The image token index to encode the image prompt. + """ + + model_type = "aya_vision" + attribute_map = { + "image_token_id": "image_token_index", + } + sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} + + def __init__( + self, + vision_config=None, + text_config=None, + vision_feature_select_strategy="full", + vision_feature_layer=-1, + downsample_factor=2, + adapter_layer_norm_eps=1e-6, + image_token_index=255036, + **kwargs, + ): + self.image_token_index = image_token_index + self.downsample_factor = downsample_factor + self.adapter_layer_norm_eps = adapter_layer_norm_eps + if vision_feature_select_strategy not in ["default", "full"]: + raise ValueError( + "vision_feature_select_strategy should be one of 'default', 'full'." 
+ f"Got: {vision_feature_select_strategy}" + ) + + self.vision_feature_select_strategy = vision_feature_select_strategy + self.vision_feature_layer = vision_feature_layer + + if isinstance(vision_config, dict): + vision_config["model_type"] = vision_config.get("model_type", "siglip_vision_model") + vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) + elif vision_config is None: + vision_config = CONFIG_MAPPING["siglip_vision_model"]( + hidden_size=1152, + intermediate_size=4304, + patch_size=14, + image_size=384, + num_hidden_layers=26, + num_attention_heads=14, + vision_use_head=False, + ) + + self.vision_config = vision_config + + if isinstance(text_config, dict): + text_config["model_type"] = text_config.get("model_type", "cohere2") + text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) + elif text_config is None: + text_config = CONFIG_MAPPING["cohere2"]() + + self.text_config = text_config + + super().__init__(**kwargs) + + +__all__ = ["AyaVisionConfig"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/aya_vision/modeling_aya_vision.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/aya_vision/modeling_aya_vision.py new file mode 100644 index 0000000000000000000000000000000000000000..fe9c2f72b05b980b6ad187afcdc2f591ce5361ef --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/aya_vision/modeling_aya_vision.py @@ -0,0 +1,518 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/aya_vision/modular_aya_vision.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_aya_vision.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2025 the Cohere Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from dataclasses import dataclass +from typing import Optional, Union + +import torch +from torch import nn + +from ...activations import ACT2FN +from ...cache_utils import Cache +from ...generation import GenerationMixin +from ...modeling_outputs import BaseModelOutputWithPast, ModelOutput +from ...modeling_utils import PreTrainedModel +from ...processing_utils import Unpack +from ...utils import TransformersKwargs, auto_docstring, can_return_tuple +from ...utils.generic import check_model_inputs +from ..auto import AutoModel +from .configuration_aya_vision import AyaVisionConfig + + +class AyaVisionMultiModalProjector(nn.Module): + def __init__(self, config: AyaVisionConfig): + super().__init__() + self.config = config + self.downsample_factor = config.downsample_factor + self.alignment_intermediate_size = getattr( + config, "alignment_intermediate_size", config.text_config.hidden_size + ) + self.layernorm = nn.LayerNorm( + config.vision_config.hidden_size * (config.downsample_factor**2), eps=config.adapter_layer_norm_eps + ) + + self.linear_1 = nn.Linear( + config.vision_config.hidden_size * (config.downsample_factor**2), + self.alignment_intermediate_size, + bias=True, + ) + + self.act = ACT2FN["silu"] # SwiGLU uses SiLU activation + # For SwiGLU, project down to half size since we split intermediate dim + self.linear_2 = nn.Linear(self.alignment_intermediate_size // 2, config.text_config.hidden_size, bias=True) + + def forward(self, image_features): + image_features = self.pixel_shuffle(image_features) + image_features = self.layernorm(image_features) + hidden_states = self.linear_1(image_features) + + # Split along last dimension and apply SwiGLU + x, gate = hidden_states.chunk(2, dim=-1) + hidden_states = self.act(gate) * x + + hidden_states = self.linear_2(hidden_states) + return hidden_states + + def pixel_shuffle(self, image_features): # B, S, D + batch_size, seq_length, feature_dim = image_features.shape + height = width = int(seq_length**0.5) + image_features = image_features.reshape(image_features.shape[0], width, height, -1) + channels = image_features.shape[-1] + image_features = image_features.reshape( + batch_size, width, int(height / self.downsample_factor), int(channels * self.downsample_factor) + ) + image_features = image_features.permute(0, 2, 1, 3) + image_features = image_features.reshape( + batch_size, int(height / self.downsample_factor), int(width / self.downsample_factor), -1 + ) + image_features = image_features.permute(0, 2, 1, 3) + return image_features + + +@auto_docstring +class AyaVisionPreTrainedModel(PreTrainedModel): + config: AyaVisionConfig + base_model_prefix = "" + supports_gradient_checkpointing = True + _skip_keys_device_placement = "past_key_values" + + _supports_flash_attn = True + _supports_sdpa = True + _can_compile_fullgraph = False + _supports_flex_attn = True + _supports_attention_backend = True + _can_record_outputs = { + "hidden_states": "DecoderLayer", + "attentions": "Attention", + } + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for AyaVision causal language model (or autoregressive) outputs. + """ +) +class AyaVisionCausalLMOutputWithPast(ModelOutput): + r""" + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). 
+ past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache). + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + image_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`. + image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state. + """ + + loss: Optional[torch.FloatTensor] = None + logits: Optional[torch.FloatTensor] = None + past_key_values: Optional[Cache] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + image_hidden_states: Optional[torch.FloatTensor] = None + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for AyaVision outputs, with hidden states and attentions. + """ +) +class AyaVisionModelOutputWithPast(BaseModelOutputWithPast): + r""" + past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache). + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + image_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`. + image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state. + """ + + image_hidden_states: Optional[torch.FloatTensor] = None + + +@auto_docstring( + custom_intro=""" + The AyaVision model which consists of a vision backbone and a language model, without a language modeling head. + """ +) +class AyaVisionModel(AyaVisionPreTrainedModel): + _checkpoint_conversion_mapping = {"language_model.model": "language_model"} + + def __init__(self, config: AyaVisionConfig): + super().__init__(config) + self.vision_tower = AutoModel.from_config(config.vision_config) + + self.multi_modal_projector = AyaVisionMultiModalProjector(config) + self.language_model = AutoModel.from_config(config.text_config) + self.post_init() + + def get_input_embeddings(self): + return self.language_model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.language_model.set_input_embeddings(value) + + def set_decoder(self, decoder): + self.language_model = decoder + + def get_decoder(self): + return self.language_model + + def get_image_features( + self, + pixel_values: torch.FloatTensor, + vision_feature_layer: Optional[Union[int, list[int]]] = None, + vision_feature_select_strategy: Optional[str] = None, + **kwargs, + ): + """ + Obtains image last hidden states from the vision tower and apply multimodal projection. + + Args: + pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`): + The tensors corresponding to the input images. + vision_feature_layer (`Union[int, list[int]]`, *optional*): + The index of the layer to select the vision feature. If multiple indices are provided, + the vision feature of the corresponding indices will be concatenated to form the + vision features. 
+ vision_feature_select_strategy (`str`, *optional*): + The feature selection strategy used to select the vision feature from the vision backbone. + Can be one of `"default"` or `"full"` + Returns: + image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`). + """ + vision_feature_layer = ( + vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer + ) + vision_feature_select_strategy = ( + vision_feature_select_strategy + if vision_feature_select_strategy is not None + else self.config.vision_feature_select_strategy + ) + + if vision_feature_select_strategy not in ["default", "full"]: + raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}") + + kwargs = {k: v for k, v in kwargs.items() if v is not None} + # this is not memory efficient at all (output_hidden_states=True) will save all the hidden states. + image_outputs = self.vision_tower(pixel_values, output_hidden_states=True, **kwargs) + + # If we have one vision feature layer, return the corresponding hidden states, + # otherwise, select the hidden states of each feature layer and concatenate them + if isinstance(vision_feature_layer, int): + selected_image_feature = image_outputs.hidden_states[vision_feature_layer] + if vision_feature_select_strategy == "default": + selected_image_feature = selected_image_feature[:, 1:] + else: + hs_pool = [image_outputs.hidden_states[layer_idx] for layer_idx in vision_feature_layer] + # For default; crop CLS from each hidden state in the hidden state pool + if vision_feature_select_strategy == "default": + hs_pool = [hs[:, 1:] for hs in hs_pool] + selected_image_feature = torch.cat(hs_pool, dim=-1) + + image_features = self.multi_modal_projector(selected_image_feature) + return image_features + + def get_placeholder_mask( + self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor + ): + """ + Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is + equal to the length of multimodal features. If the lengths are different, an error is raised. 
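+ Returns a boolean mask with the same shape as `inputs_embeds`, marking the positions that correspond to image placeholder tokens.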
+ """ + if input_ids is None: + special_image_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + special_image_mask = special_image_mask.all(-1) + else: + special_image_mask = input_ids == self.config.image_token_id + + n_image_tokens = special_image_mask.sum() + special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + n_image_features = image_features.shape[0] * image_features.shape[1] + if inputs_embeds[special_image_mask].numel() != image_features.numel(): + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" + ) + return special_image_mask + + @check_model_inputs + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + vision_feature_layer: Optional[Union[int, list[int]]] = None, + vision_feature_select_strategy: Optional[str] = None, + use_cache: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> Union[tuple, AyaVisionModelOutputWithPast]: + vision_feature_layer = ( + vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer + ) + vision_feature_select_strategy = ( + vision_feature_select_strategy + if vision_feature_select_strategy is not None + else self.config.vision_feature_select_strategy + ) + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings()(input_ids) + + if pixel_values is not None: + image_features = self.get_image_features( + pixel_values=pixel_values, + vision_feature_layer=vision_feature_layer, + vision_feature_select_strategy=vision_feature_select_strategy, + ) + image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) + special_image_mask = self.get_placeholder_mask( + input_ids, inputs_embeds=inputs_embeds, image_features=image_features + ) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) + + outputs = self.language_model( + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + + return AyaVisionModelOutputWithPast( + last_hidden_state=outputs.last_hidden_state, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + image_hidden_states=image_features if pixel_values is not None else None, + ) + + +@auto_docstring( + custom_intro=""" + The AYA_VISION model which consists of a vision backbone and a language model. 
+ """ +) +class AyaVisionForConditionalGeneration(AyaVisionPreTrainedModel, GenerationMixin): + _checkpoint_conversion_mapping = { + "^language_model.model": "model.language_model", + "^vision_tower": "model.vision_tower", + "^multi_modal_projector": "model.multi_modal_projector", + "^language_model.lm_head": "lm_head", + } + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config: AyaVisionConfig): + super().__init__(config) + self.model = AyaVisionModel(config) + self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False) + self.post_init() + + def get_input_embeddings(self): + return self.model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.model.set_input_embeddings(value) + + def get_output_embeddings(self) -> nn.Module: + return self.lm_head + + def set_decoder(self, decoder): + self.model.set_decoder(decoder) + + def get_decoder(self): + return self.model.get_decoder() + + def get_image_features( + self, + pixel_values: torch.FloatTensor, + vision_feature_layer: Optional[Union[int, list[int]]] = None, + vision_feature_select_strategy: Optional[str] = None, + **kwargs, + ): + return self.model.get_image_features( + pixel_values=pixel_values, + vision_feature_layer=vision_feature_layer, + vision_feature_select_strategy=vision_feature_select_strategy, + **kwargs, + ) + + # Make modules available through conditional class for BC + @property + def language_model(self): + return self.model.language_model + + @property + def vision_tower(self): + return self.model.vision_tower + + @property + def multi_modal_projector(self): + return self.model.multi_modal_projector + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + vision_feature_layer: Optional[Union[int, list[int]]] = None, + vision_feature_select_strategy: Optional[str] = None, + labels: Optional[torch.LongTensor] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + image_sizes: Optional[torch.Tensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> Union[tuple, AyaVisionCausalLMOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Example: + + ```python + >>> from transformers import AutoProcessor, AyaVisionForConditionalGeneration + >>> import torch + + >>> torch_device = "cuda:0" + >>> processor = AutoProcessor.from_pretrained("CohereForAI/aya-vision-8b", use_fast=True) + >>> model = AyaVisionForConditionalGeneration.from_pretrained("CohereForAI/aya-vision-8b", device_map=torch_device) + + >>> messages = [ + ... { + ... "role": "user", + ... "content": [ + ... { + ... "type": "image", + ... "url": "https://pbs.twimg.com/media/Fx7YvfQWYAIp6rZ?format=jpg&name=medium", + ... }, + ... {"type": "text", "text": "चित्र में लिखा पाठ क्या कहता है?"}, + ... ], + ... } + ... ] + + >>> inputs = processor.apply_chat_template( + ... 
messages, padding=True, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt", device=torch_device + ... ).to(model.device) + + >>> gen_tokens = model.generate(**inputs, max_new_tokens=300, do_sample=True, temperature=0.3) + >>> processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True) + ```""" + vision_feature_layer = ( + vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer + ) + vision_feature_select_strategy = ( + vision_feature_select_strategy + if vision_feature_select_strategy is not None + else self.config.vision_feature_select_strategy + ) + + outputs = self.model( + input_ids=input_ids, + pixel_values=pixel_values, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + vision_feature_layer=vision_feature_layer, + vision_feature_select_strategy=vision_feature_select_strategy, + cache_position=cache_position, + image_sizes=image_sizes, + **kwargs, + ) + + hidden_states = outputs[0] + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) + + loss = None + if labels is not None: + loss = self.loss_function( + logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs + ) + + return AyaVisionCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + image_hidden_states=outputs.image_hidden_states, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + inputs_embeds=None, + pixel_values=None, + attention_mask=None, + cache_position=None, + logits_to_keep=None, + **kwargs, + ): + # Overwritten -- in specific circumstances we don't want to forward image inputs to the model + + model_inputs = super().prepare_inputs_for_generation( + input_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + logits_to_keep=logits_to_keep, + **kwargs, + ) + + if cache_position[0] == 0: + # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore + # Otherwise we need pixel values to be passed to model + model_inputs["pixel_values"] = pixel_values + + return model_inputs + + +__all__ = ["AyaVisionForConditionalGeneration", "AyaVisionPreTrainedModel", "AyaVisionModel"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/aya_vision/modular_aya_vision.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/aya_vision/modular_aya_vision.py new file mode 100644 index 0000000000000000000000000000000000000000..f76e046e0b948f0e4b33e361082bbd7332d254d1 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/aya_vision/modular_aya_vision.py @@ -0,0 +1,297 @@ +# coding=utf-8 +# Copyright 2025 the Cohere Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch AyaVision model.""" + +from typing import Optional, Union + +import torch +from torch import nn + +from transformers.models.llava.modeling_llava import ( + LlavaCausalLMOutputWithPast, + LlavaForConditionalGeneration, + LlavaModel, + LlavaModelOutputWithPast, + LlavaPreTrainedModel, + TransformersKwargs, +) + +from ...activations import ACT2FN +from ...cache_utils import Cache +from ...processing_utils import Unpack +from ...utils import auto_docstring, logging +from ...utils.generic import check_model_inputs +from .configuration_aya_vision import AyaVisionConfig + + +logger = logging.get_logger(__name__) + + +class AyaVisionMultiModalProjector(nn.Module): + def __init__(self, config: AyaVisionConfig): + super().__init__() + self.config = config + self.downsample_factor = config.downsample_factor + self.alignment_intermediate_size = getattr( + config, "alignment_intermediate_size", config.text_config.hidden_size + ) + self.layernorm = nn.LayerNorm( + config.vision_config.hidden_size * (config.downsample_factor**2), eps=config.adapter_layer_norm_eps + ) + + self.linear_1 = nn.Linear( + config.vision_config.hidden_size * (config.downsample_factor**2), + self.alignment_intermediate_size, + bias=True, + ) + + self.act = ACT2FN["silu"] # SwiGLU uses SiLU activation + # For SwiGLU, project down to half size since we split intermediate dim + self.linear_2 = nn.Linear(self.alignment_intermediate_size // 2, config.text_config.hidden_size, bias=True) + + def forward(self, image_features): + image_features = self.pixel_shuffle(image_features) + image_features = self.layernorm(image_features) + hidden_states = self.linear_1(image_features) + + # Split along last dimension and apply SwiGLU + x, gate = hidden_states.chunk(2, dim=-1) + hidden_states = self.act(gate) * x + + hidden_states = self.linear_2(hidden_states) + return hidden_states + + def pixel_shuffle(self, image_features): # B, S, D + batch_size, seq_length, feature_dim = image_features.shape + height = width = int(seq_length**0.5) + image_features = image_features.reshape(image_features.shape[0], width, height, -1) + channels = image_features.shape[-1] + image_features = image_features.reshape( + batch_size, width, int(height / self.downsample_factor), int(channels * self.downsample_factor) + ) + image_features = image_features.permute(0, 2, 1, 3) + image_features = image_features.reshape( + batch_size, int(height / self.downsample_factor), int(width / self.downsample_factor), -1 + ) + image_features = image_features.permute(0, 2, 1, 3) + return image_features + + +class AyaVisionPreTrainedModel(LlavaPreTrainedModel): + _can_compile_fullgraph = False + _can_record_outputs = { + "hidden_states": "DecoderLayer", + "attentions": "Attention", + } + + +class AyaVisionCausalLMOutputWithPast(LlavaCausalLMOutputWithPast): + pass + + +class AyaVisionModelOutputWithPast(LlavaModelOutputWithPast): + pass + + +class AyaVisionModel(LlavaModel): + # Unlike LLaVA, the model doesn't have to deal with Pixtral-style image states + def get_image_features( + self, + pixel_values: torch.FloatTensor, + vision_feature_layer: Optional[Union[int, 
list[int]]] = None, + vision_feature_select_strategy: Optional[str] = None, + **kwargs, + ): + """ + Obtains image last hidden states from the vision tower and apply multimodal projection. + + Args: + pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`): + The tensors corresponding to the input images. + vision_feature_layer (`Union[int, list[int]]`, *optional*): + The index of the layer to select the vision feature. If multiple indices are provided, + the vision feature of the corresponding indices will be concatenated to form the + vision features. + vision_feature_select_strategy (`str`, *optional*): + The feature selection strategy used to select the vision feature from the vision backbone. + Can be one of `"default"` or `"full"` + Returns: + image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`). + """ + vision_feature_layer = ( + vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer + ) + vision_feature_select_strategy = ( + vision_feature_select_strategy + if vision_feature_select_strategy is not None + else self.config.vision_feature_select_strategy + ) + + if vision_feature_select_strategy not in ["default", "full"]: + raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}") + + kwargs = {k: v for k, v in kwargs.items() if v is not None} + # this is not memory efficient at all (output_hidden_states=True) will save all the hidden states. + image_outputs = self.vision_tower(pixel_values, output_hidden_states=True, **kwargs) + + # If we have one vision feature layer, return the corresponding hidden states, + # otherwise, select the hidden states of each feature layer and concatenate them + if isinstance(vision_feature_layer, int): + selected_image_feature = image_outputs.hidden_states[vision_feature_layer] + if vision_feature_select_strategy == "default": + selected_image_feature = selected_image_feature[:, 1:] + else: + hs_pool = [image_outputs.hidden_states[layer_idx] for layer_idx in vision_feature_layer] + # For default; crop CLS from each hidden state in the hidden state pool + if vision_feature_select_strategy == "default": + hs_pool = [hs[:, 1:] for hs in hs_pool] + selected_image_feature = torch.cat(hs_pool, dim=-1) + + image_features = self.multi_modal_projector(selected_image_feature) + return image_features + + @check_model_inputs + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + vision_feature_layer: Optional[Union[int, list[int]]] = None, + vision_feature_select_strategy: Optional[str] = None, + use_cache: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> Union[tuple, AyaVisionModelOutputWithPast]: + vision_feature_layer = ( + vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer + ) + vision_feature_select_strategy = ( + vision_feature_select_strategy + if vision_feature_select_strategy is not None + else self.config.vision_feature_select_strategy + ) + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if inputs_embeds is 
None: + inputs_embeds = self.get_input_embeddings()(input_ids) + + if pixel_values is not None: + image_features = self.get_image_features( + pixel_values=pixel_values, + vision_feature_layer=vision_feature_layer, + vision_feature_select_strategy=vision_feature_select_strategy, + ) + image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) + special_image_mask = self.get_placeholder_mask( + input_ids, inputs_embeds=inputs_embeds, image_features=image_features + ) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) + + outputs = self.language_model( + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + + return AyaVisionModelOutputWithPast( + last_hidden_state=outputs.last_hidden_state, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + image_hidden_states=image_features if pixel_values is not None else None, + ) + + +class AyaVisionForConditionalGeneration(LlavaForConditionalGeneration): + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + vision_feature_layer: Optional[Union[int, list[int]]] = None, + vision_feature_select_strategy: Optional[str] = None, + labels: Optional[torch.LongTensor] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + image_sizes: Optional[torch.Tensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> Union[tuple, AyaVisionCausalLMOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Example: + + ```python + >>> from transformers import AutoProcessor, AyaVisionForConditionalGeneration + >>> import torch + + >>> torch_device = "cuda:0" + >>> processor = AutoProcessor.from_pretrained("CohereForAI/aya-vision-8b", use_fast=True) + >>> model = AyaVisionForConditionalGeneration.from_pretrained("CohereForAI/aya-vision-8b", device_map=torch_device) + + >>> messages = [ + ... { + ... "role": "user", + ... "content": [ + ... { + ... "type": "image", + ... "url": "https://pbs.twimg.com/media/Fx7YvfQWYAIp6rZ?format=jpg&name=medium", + ... }, + ... {"type": "text", "text": "चित्र में लिखा पाठ क्या कहता है?"}, + ... ], + ... } + ... ] + + >>> inputs = processor.apply_chat_template( + ... messages, padding=True, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt", device=torch_device + ... 
).to(model.device) + + >>> gen_tokens = model.generate(**inputs, max_new_tokens=300, do_sample=True, temperature=0.3) + >>> processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True) + ```""" + super().forward( + input_ids=input_ids, + pixel_values=pixel_values, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + vision_feature_layer=vision_feature_layer, + vision_feature_select_strategy=vision_feature_select_strategy, + labels=labels, + cache_position=cache_position, + logits_to_keep=logits_to_keep, + image_sizes=image_sizes, + **kwargs, + ) + + +__all__ = ["AyaVisionForConditionalGeneration", "AyaVisionPreTrainedModel", "AyaVisionModel"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/aya_vision/processing_aya_vision.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/aya_vision/processing_aya_vision.py new file mode 100644 index 0000000000000000000000000000000000000000..7045c967046da163fa2d7daef04f5c6e6bebc794 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/aya_vision/processing_aya_vision.py @@ -0,0 +1,257 @@ +# coding=utf-8 +# Copyright 2025 HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional, Union + +import numpy as np + +from ...image_processing_utils import BatchFeature +from ...image_utils import ImageInput, make_flat_list_of_images +from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack +from ...tokenization_utils_base import PreTokenizedInput, TextInput + + +class AyaVisionImagesKwargs(ImagesKwargs, total=False): + crop_to_patches: Optional[bool] + min_patches: Optional[int] + max_patches: Optional[int] + + +class AyaVisionProcessorKwargs(ProcessingKwargs, total=False): + images_kwargs: AyaVisionImagesKwargs + _defaults = { + "text_kwargs": { + "padding_side": "left", + "padding": True, + "return_mm_token_type_ids": False, + }, + "images_kwargs": { + "crop_to_patches": True, + }, + } + + +class AyaVisionProcessor(ProcessorMixin): + r""" + Constructs a AyaVision processor which wraps a [`AutoImageProcessor`] and + [`PretrainedTokenizerFast`] tokenizer into a single processor that inherits both the image processor and + tokenizer functionalities. See the [`~AyaVisionProcessor.__call__`] and [`~AyaVisionProcessor.decode`] for more information. + Args: + image_processor ([`AutoImageProcessor`], *optional*): + The image processor is a required input. + tokenizer ([`PreTrainedTokenizer`, `PreTrainedTokenizerFast`], *optional*): + The tokenizer is a required input. + patch_size (`int`, *optional*, defaults to 28): + The size of image patches for tokenization. + img_size (`int`, *optional*, defaults to 364): + The size of the image to be tokenized. This should correspond to the size given to the image processor. 
+ image_token (`str`, *optional*, defaults to `""`): + The token to be used to represent an image in the text. + downsample_factor (`int`, *optional*, defaults to 1): + The factor by which to scale the patch size. + start_of_img_token (`str`, *optional*, defaults to `"<|START_OF_IMG|>"`): + The token to be used to represent the start of an image in the text. + end_of_img_token (`str`, *optional*, defaults to `"<|END_OF_IMG|>"`): + The token to be used to represent the end of an image in the text. + img_patch_token (`str`, *optional*, defaults to `"<|IMG_PATCH|>"`): + The token to be used to represent an image patch in the text. + img_line_break_token (`str`, *optional*, defaults to `"<|IMG_LINE_BREAK|>"`): + The token to be used to represent a line break in the text. + tile_token (`str`, *optional*, defaults to `"TILE"`): + The token to be used to represent an image patch in the text. + tile_global_token (`str`, *optional*, defaults to `"TILE_GLOBAL"`): + The token to be used to represent the cover image in the text. + chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages + in a chat into a tokenizable string. + """ + + attributes = ["image_processor", "tokenizer"] + image_processor_class = "AutoImageProcessor" + tokenizer_class = "AutoTokenizer" + + def __init__( + self, + image_processor=None, + tokenizer=None, + patch_size: int = 28, + img_size: int = 364, + image_token="", # set the default and let users change if they have peculiar special tokens in rare cases + downsample_factor: int = 1, + start_of_img_token="<|START_OF_IMG|>", + end_of_img_token="<|END_OF_IMG|>", + img_patch_token="<|IMG_PATCH|>", + img_line_break_token="<|IMG_LINE_BREAK|>", + tile_token="TILE", + tile_global_token="TILE_GLOBAL", + chat_template=None, + **kwargs, + ): + super().__init__(image_processor, tokenizer, chat_template=chat_template) + + self.image_token = image_token + self.patch_size = patch_size * downsample_factor + self.img_size = img_size + + self.start_of_img_token = start_of_img_token + self.end_of_img_token = end_of_img_token + self.img_patch_token = img_patch_token + self.img_line_break_token = img_line_break_token + self.tile_token = tile_token + self.tile_global_token = tile_global_token + self.image_token_id = tokenizer.convert_tokens_to_ids(self.img_patch_token) + self.image_ids = tokenizer.convert_tokens_to_ids( + [img_patch_token, tile_token, tile_global_token, start_of_img_token, end_of_img_token] + ) + + def _prompt_split_image(self, num_patches): + """ + Create a structured string representation of image tokens + + Args: + num_patches: Number of patches in the image + + Returns: + String with appropriate image tokens + """ + + img_patches_per_tile = (self.img_size // self.patch_size) ** 2 + img_string = f"{self.start_of_img_token}" + if num_patches > 1: + for idx in range(1, num_patches): + img_string += f"{self.tile_token}_{idx}" + f"{self.img_patch_token}" * img_patches_per_tile + + img_string += f"{self.tile_global_token}" + f"{self.img_patch_token}" * img_patches_per_tile + img_string += f"{self.end_of_img_token}" + return img_string + + def __call__( + self, + images: Optional[ImageInput] = None, + text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None, + audio=None, + videos=None, + **kwargs: Unpack[AyaVisionProcessorKwargs], + ) -> BatchFeature: + """ + Main method to prepare for the model one or several sequences(s) and image(s). 
This method forwards the `text` + and `kwargs` arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] to encode the text. + To prepare the vision inputs, this method forwards the `images` and `kwargs` arguments to + GotOcr2ImageProcessor's [`~GotOcr2ImageProcessor.__call__`] if `images` is not `None`. + + Args: + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. Both channels-first and channels-last formats are supported. + text (`str`, `list[str]`, `list[list[str]]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. 
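+
+        Example (illustrative sketch, not taken from this file; the checkpoint id, image URL and the
+        `<image>` placeholder below are assumptions):
+
+        ```python
+        >>> from transformers import AutoProcessor
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> processor = AutoProcessor.from_pretrained("CohereForAI/aya-vision-8b")  # hypothetical checkpoint id
+        >>> image = Image.open(requests.get("https://example.com/cat.png", stream=True).raw)  # placeholder URL
+        >>> inputs = processor(images=image, text="<image> Describe this image.", return_tensors="pt")
+        ```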
+ """ + if text is None: + raise ValueError("You have to specify text.") + + output_kwargs = self._merge_kwargs( + AyaVisionProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + + if not isinstance(text, (list, tuple)): + text = [text] + + # Process images + image_inputs = {} + if images is not None: + images = self.image_processor.fetch_images(images) + images = make_flat_list_of_images(images) + image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"]) + num_patches = image_inputs.pop("num_patches") + image_index = 0 + processed_text = [] + for prompt in text: + new_prompt = prompt + while "" in new_prompt: + # Replace the image placeholder with structured image tokens + image_tokens = self._prompt_split_image(num_patches[image_index]) + new_prompt = new_prompt.replace("", image_tokens, 1) + image_index += 1 + processed_text.append(new_prompt) + + if image_index != len(images): + raise ValueError("Number of image placeholders in the prompt does not match the number of images.") + + text = processed_text + + return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) + return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", False) + text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"], return_tensors=None) + + if return_mm_token_type_ids: + array_ids = np.array(text_inputs["input_ids"]) + mm_token_type_ids = np.zeros_like(text_inputs["input_ids"]) + mm_token_type_ids[np.isin(array_ids, self.image_ids)] = 1 + text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist() + + return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors) + + def _get_num_multimodal_tokens(self, image_sizes=None, **kwargs): + """ + Computes the number of placeholder tokens needed for multimodal inputs with the given sizes. + + Args: + image_sizes (`list[list[int]]`, *optional*): + The input sizes formatted as (height, width) per each image. + + Returns: + `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided + input modalities, along with other useful data. + """ + + vision_data = {} + if image_sizes is not None: + images_kwargs = AyaVisionProcessorKwargs._defaults.get("images_kwargs", {}) + images_kwargs.update(kwargs) + + num_image_patches = [ + self.image_processor.get_number_of_image_patches(*image_size, images_kwargs) + for image_size in image_sizes + ] + + token_per_patch = (self.img_size // self.patch_size) ** 2 + num_image_tokens = [ + token_per_patch + 3 + sum(token_per_patch + 1 for _ in range(1, num_patches)) + for num_patches in num_image_patches + ] # Add +3 and +1 for BOI/EOI and image tile tokens + vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches}) + + return MultiModalData(**vision_data) + + +__all__ = ["AyaVisionProcessor"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bamba/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bamba/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c3920da849a33318e626f6df3fe65d1abc71aac7 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bamba/__init__.py @@ -0,0 +1,28 @@ +# Copyright 2024 IBM and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_bamba import * + from .modeling_bamba import * + from .processing_bamba import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bamba/configuration_bamba.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bamba/configuration_bamba.py new file mode 100644 index 0000000000000000000000000000000000000000..efa9d5f3d08a0ce0e1cdb496457316369badce95 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bamba/configuration_bamba.py @@ -0,0 +1,210 @@ +# coding=utf-8 +# Copyright 2024 IBM and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Bamba model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class BambaConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`BambaModel`]. It is used to instantiate a + BambaModel model according to the specified arguments, defining the model architecture. Instantiating a configuration + with defaults taken from [ibm-fms/Bamba-9.8b-2.2T-hf](https://huggingface.co/ibm-fms/Bamba-9.8b-2.2T-hf). + + The BambaModel is a hybrid [mamba2](https://github.com/state-spaces/mamba) architecture with SwiGLU. + The checkpoints are jointly trained by IBM, Princeton, and UIUC. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 128000): + Vocabulary size of the Bamba model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`BambaModel`] + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether the model's input and output word embeddings should be tied. Note that this is only relevant if the + model has an output word embedding layer. + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 14336): + Dimension of the MLP representations. 
+ num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer encoder. + num_key_value_heads (`int`, *optional*, defaults to 8): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details, check out [this + paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `8`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + num_logits_to_keep (`int` or `None`, *optional*, defaults to 1): + Number of prompt logits to calculate during generation. If `None`, all logits will be calculated. If an + integer value, only last `num_logits_to_keep` logits will be calculated. Default is 1 because only the + logits of the last prompt token are needed for generation. For long sequences, the logits for the entire + sequence may use a lot of memory so, setting `num_logits_to_keep=1` will reduce memory footprint + significantly. + pad_token_id (`int`, *optional*, defaults to 0): + The id of the padding token. + bos_token_id (`int`, *optional*, defaults to 1): + The id of the "beginning-of-sequence" token. + eos_token_id (`int`, *optional*, defaults to 2): + The id of the "end-of-sequence" token. + max_position_embeddings (`int`, *optional*, defaults to 262144): + Max cached sequence length for the model + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + attn_layer_indices (`list`, *optional*): + Specifies the layer indices that will have full attention. Must contain values at most num_hidden_layers. + mamba_n_heads (`int`, *optional*, defaults to 128): + The number of mamba heads used in the v2 implementation. + mamba_d_head (`int`, *optional*, defaults to `"auto"`): + Head embedding dimension size + mamba_n_groups (`int`, *optional*, defaults to 1): + The number of the mamba groups used in the v2 implementation. 
+ mamba_d_state (`int`, *optional*, defaults to 256): + The dimension the mamba state space latents + mamba_d_conv (`int`, *optional*, defaults to 4): + The size of the mamba convolution kernel + mamba_expand (`int`, *optional*, defaults to 2): + Expanding factor (relative to hidden_size) used to determine the mamba intermediate size + mamba_chunk_size (`int`, *optional*, defaults to 256): + The chunks in which to break the sequence when doing prefill/training + mamba_conv_bias (`bool`, *optional*, defaults to `True`): + Flag indicating whether or not to use bias in the convolution layer of the mamba mixer block. + mamba_proj_bias (`bool`, *optional*, defaults to `False`): + Flag indicating whether or not to use bias in the input and output projections (["in_proj", "out_proj"]) of the mamba mixer block + z_loss_coefficient (`float`, *optional*, defaults to 0.0): + Coefficient for auxiliary z-loss used to control logit growth during training + + """ + + model_type = "bamba" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=128000, + tie_word_embeddings=False, + hidden_size=4096, + intermediate_size=14336, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=8, + hidden_act="silu", + initializer_range=0.02, + rms_norm_eps=1e-5, + use_cache=True, + num_logits_to_keep=1, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + max_position_embeddings=262144, + attention_dropout=0.0, + attn_layer_indices=None, + mamba_n_heads=128, + mamba_d_head="auto", + mamba_n_groups=1, + mamba_d_state=256, + mamba_d_conv=4, + mamba_expand=2, + mamba_chunk_size=256, + mamba_conv_bias=True, + mamba_proj_bias=False, + z_loss_coefficient=0.0, + **kwargs, + ): + self.vocab_size = vocab_size + self.tie_word_embeddings = tie_word_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.max_position_embeddings = max_position_embeddings + self.attention_dropout = attention_dropout + self.attention_bias = False + self.mlp_bias = False + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + + self.use_cache = use_cache + self.num_logits_to_keep = num_logits_to_keep + + self.attn_layer_indices = attn_layer_indices + self.rope_theta = 10000.0 + self.rope_scaling = None + self.partial_rotary_factor = 0.5 + + mamba_intermediate = mamba_expand * hidden_size + + if mamba_intermediate % mamba_n_heads != 0: + raise ValueError("mamba_n_heads must divide mamba_expand * hidden_size") + + # for the mamba_v2, must satisfy the following + if mamba_d_head == "auto": + mamba_d_head = mamba_intermediate // mamba_n_heads + + if mamba_d_head * mamba_n_heads != mamba_intermediate: + raise ValueError("The dimensions for the Mamba head state do not match the model intermediate_size") + + self.mamba_n_heads = mamba_n_heads + self.mamba_d_head = mamba_d_head + self.mamba_n_groups = mamba_n_groups + self.mamba_d_state = mamba_d_state + self.mamba_d_conv = mamba_d_conv + self.mamba_expand = mamba_expand + self.mamba_chunk_size = mamba_chunk_size + self.mamba_conv_bias = mamba_conv_bias + self.mamba_proj_bias = mamba_proj_bias + self.z_loss_coefficient = z_loss_coefficient + + super().__init__( + pad_token_id=pad_token_id, + 
bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + @property + def layers_block_type(self): + return [ + "attention" if (self.attn_layer_indices and i in self.attn_layer_indices) else "mamba" + for i in range(self.num_hidden_layers) + ] + + +__all__ = ["BambaConfig"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bamba/modeling_bamba.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bamba/modeling_bamba.py new file mode 100644 index 0000000000000000000000000000000000000000..60bf385bf49423e3e34ba1c6bd3d737f509eed10 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bamba/modeling_bamba.py @@ -0,0 +1,1504 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/bamba/modular_bamba.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_bamba.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2024 IBM and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
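+# Illustrative usage sketch for the hybrid layer layout driven by `BambaConfig` (defined in
+# configuration_bamba.py). The sizes below are arbitrary toy values, not the defaults of any
+# released checkpoint:
+#
+#     from transformers.models.bamba.configuration_bamba import BambaConfig
+#
+#     config = BambaConfig(
+#         hidden_size=512, intermediate_size=1024, num_hidden_layers=4,
+#         num_attention_heads=8, num_key_value_heads=2,
+#         mamba_n_heads=16, attn_layer_indices=[1, 3],
+#     )
+#     # mamba_d_head="auto" resolves to (mamba_expand * hidden_size) // mamba_n_heads == 64
+#     assert config.mamba_d_head == 64
+#     assert config.layers_block_type == ["mamba", "attention", "mamba", "attention"]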
+ +from typing import Any, Callable, Optional, TypedDict, Union + +import torch +from torch import nn + +from transformers.activations import ACT2FN + +from ...cache_utils import Cache +from ...generation import GenerationMixin +from ...integrations import use_kernel_forward_from_hub +from ...modeling_attn_mask_utils import AttentionMaskConverter +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast +from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...processing_utils import Unpack +from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging +from ...utils.deprecation import deprecate_kwarg +from ...utils.import_utils import is_causal_conv1d_available, is_mamba_2_ssm_available +from .configuration_bamba import BambaConfig + + +if is_mamba_2_ssm_available(): + from mamba_ssm.ops.triton.selective_state_update import selective_state_update + from mamba_ssm.ops.triton.ssd_combined import mamba_chunk_scan_combined, mamba_split_conv1d_scan_combined +else: + selective_state_update = None + +if is_causal_conv1d_available(): + from causal_conv1d import causal_conv1d_fn, causal_conv1d_update +else: + causal_conv1d_update, causal_conv1d_fn = None, None + + +logger = logging.get_logger(__name__) + + +class BambaFlashAttentionKwargs(TypedDict, total=False): + """ + Keyword arguments for advanced Flash Attention, causal-conv1d, and mamba_ssm kernel usage. + Use cases include padding-free training and fewer `torch.compile` graph breaks. + + Attributes: + cu_seq_lens_q (`torch.LongTensor`) + Gets cumulative sequence length for query state. + cu_seq_lens_k (`torch.LongTensor`) + Gets cumulative sequence length for key state. + max_length_q (`int`): + Maximum sequence length for query state. + max_length_k (`int`): + Maximum sequence length for key state. + seq_idx (`torch.IntTensor): + Index of each packed sequence. + """ + + cu_seq_lens_q: torch.LongTensor + cu_seq_lens_k: torch.LongTensor + max_length_q: int + max_length_k: int + seq_idx: torch.IntTensor + + +class HybridMambaAttentionDynamicCache: + """ + A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache + (which has a constant shape regardless of seq_len). + + This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states` + and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors. The expected shape for each tensor + For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`, + while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors). + For mamba layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors), + while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`, + and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`. 
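+
+    Example (illustrative sketch; the tiny config values below are arbitrary assumptions, not a released
+    checkpoint):
+
+    ```python
+    >>> import torch
+    >>> from transformers.models.bamba.configuration_bamba import BambaConfig
+    >>> from transformers.models.bamba.modeling_bamba import HybridMambaAttentionDynamicCache
+
+    >>> config = BambaConfig(
+    ...     hidden_size=256, intermediate_size=512, num_hidden_layers=4,
+    ...     num_attention_heads=8, num_key_value_heads=4, mamba_n_heads=16, attn_layer_indices=[3],
+    ... )
+    >>> cache = HybridMambaAttentionDynamicCache(config, batch_size=2, dtype=torch.float32)
+    >>> config.layers_block_type
+    ['mamba', 'mamba', 'mamba', 'attention']
+    >>> cache.transformer_layers  # attention layers, i.e. the only ones that grow a key/value cache
+    [3]
+    ```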
+ """ + + is_compileable = False + + def __init__(self, config: BambaConfig, batch_size, dtype=torch.float16, device=None): + self.layers_block_type = config.layers_block_type + self.has_previous_state = False # only used by mamba + conv_kernel_size = config.mamba_d_conv + ssm_state_size = config.mamba_d_state + + self.conv_states = [] + self.ssm_states = [] + self.transformer_layers = [] + for i in range(config.num_hidden_layers): + if self.layers_block_type[i] == "mamba": + self.conv_states += [ + torch.zeros( + batch_size, + (config.mamba_expand * config.hidden_size + 2 * config.mamba_n_groups * ssm_state_size), + conv_kernel_size, + device=device, + dtype=dtype, + ) + ] + self.ssm_states += [ + torch.zeros( + batch_size, + config.mamba_n_heads, + config.mamba_d_head, + ssm_state_size, + device=device, + dtype=dtype, + ) + ] + else: + self.conv_states += [torch.tensor([[]] * batch_size, device=device)] + self.ssm_states += [torch.tensor([[]] * batch_size, device=device)] + self.transformer_layers.append(i) + + self.key_cache = [torch.tensor([[]] * batch_size, device=device) for _ in range(config.num_hidden_layers)] + self.value_cache = [torch.tensor([[]] * batch_size, device=device) for _ in range(config.num_hidden_layers)] + + def update( + self, + key_states: torch.Tensor, + value_states: torch.Tensor, + layer_idx: int, + cache_kwargs: Optional[dict[str, Any]] = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + # Update the cache + if self.key_cache[layer_idx].shape[-1] == 0: + self.key_cache[layer_idx] = key_states + self.value_cache[layer_idx] = value_states + else: + self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=2) + self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=2) + + return self.key_cache[layer_idx], self.value_cache[layer_idx] + + def reorder_cache(self, beam_idx: torch.LongTensor): + """Reorders the cache for beam search, given the selected beam indices.""" + for layer_idx in range(len(self.key_cache)): + device = self.key_cache[layer_idx].device + self.key_cache[layer_idx] = self.key_cache[layer_idx].index_select(0, beam_idx.to(device)) + device = self.value_cache[layer_idx].device + self.value_cache[layer_idx] = self.value_cache[layer_idx].index_select(0, beam_idx.to(device)) + + device = self.conv_states[layer_idx].device + self.conv_states[layer_idx] = self.conv_states[layer_idx].index_select(0, beam_idx.to(device)) + device = self.ssm_states[layer_idx].device + self.ssm_states[layer_idx] = self.ssm_states[layer_idx].index_select(0, beam_idx.to(device)) + + def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: + """Returns the sequence length of the cached states. 
A layer index can be optionally passed.""" + # take any layer that contains cache and not empty tensor + layer_idx = self.transformer_layers[0] if layer_idx not in self.transformer_layers else layer_idx + if len(self.key_cache) <= layer_idx: + return 0 + return self.key_cache[layer_idx].shape[-2] + + +class BambaRotaryEmbedding(nn.Module): + inv_freq: torch.Tensor # fix linting for `register_buffer` + + def __init__(self, config: BambaConfig, device=None): + super().__init__() + # BC: "rope_type" was originally "type" + if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): + self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) + else: + self.rope_type = "default" + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + + inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = self.inv_freq + + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) + def forward(self, x, position_ids): + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) + position_ids_expanded = position_ids[:, None, :].float() + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() * self.attention_scaling + sin = emb.sin() * self.attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). 
The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + dropout: float = 0.0, + **kwargs: Unpack[TransformersKwargs], +): + key_states = repeat_kv(key, module.num_key_value_groups) + value_states = repeat_kv(value, module.num_key_value_groups) + + attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling + if attention_mask is not None: + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + attn_output = torch.matmul(attn_weights, value_states) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +# Adapted from transformers.models.glm.modular_glm.apply_rotary_pos_emb +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Removes the interleaving of cos and sin from GLM + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
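+
+    Example (illustrative sketch with toy shapes; the partial rotary width of 4 out of head_dim 8 is an
+    assumption, and `apply_rotary_pos_emb` is assumed to be in scope from this module):
+
+    ```python
+    >>> import torch
+    >>> q = torch.randn(1, 2, 5, 8)  # [batch, heads, seq_len, head_dim]
+    >>> k = torch.randn(1, 2, 5, 8)
+    >>> cos = torch.randn(1, 5, 4)   # only the first 4 dims of each head are rotated
+    >>> sin = torch.randn(1, 5, 4)
+    >>> q_embed, k_embed = apply_rotary_pos_emb(q, k, cos, sin)
+    >>> q_embed.shape == q.shape and k_embed.shape == k.shape
+    True
+    >>> torch.equal(q_embed[..., 4:], q[..., 4:])  # the un-rotated part passes through unchanged
+    True
+    ```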
+ """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + + # Keep half or full tensor for later concatenation + rotary_dim = cos.shape[-1] + q_rot, q_pass = q[..., :rotary_dim], q[..., rotary_dim:] + k_rot, k_pass = k[..., :rotary_dim], k[..., rotary_dim:] + + # Apply rotary embeddings on the first half or full tensor + q_embed = (q_rot * cos) + (rotate_half(q_rot) * sin) + k_embed = (k_rot * cos) + (rotate_half(k_rot) * sin) + + # Concatenate back to full shape + q_embed = torch.cat([q_embed, q_pass], dim=-1) + k_embed = torch.cat([k_embed, k_pass], dim=-1) + return q_embed, k_embed + + +class BambaAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: BambaConfig, layer_idx: int): + super().__init__() + self.config = config + self.layer_idx = layer_idx + self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) + self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads + self.scaling = self.head_dim**-0.5 + self.attention_dropout = config.attention_dropout + self.is_causal = True + + self.q_proj = nn.Linear( + config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias + ) + self.k_proj = nn.Linear( + config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias + ) + self.v_proj = nn.Linear( + config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias + ) + self.o_proj = nn.Linear( + config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias + ) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + attention_mask: Optional[torch.Tensor], + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> tuple[torch.Tensor, torch.Tensor]: + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, self.head_dim) + + query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2) + key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2) + value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_values is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=0.0 if not self.training else self.attention_dropout, + scaling=self.scaling, + **kwargs, + ) + + attn_output = attn_output.reshape(*input_shape, -1).contiguous() + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights + + +class BambaRMSNormGated(torch.nn.Module): + def __init__(self, hidden_size, eps=1e-6): + super().__init__() + self.weight = 
nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states, gate=None): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + + if gate is not None: + hidden_states = hidden_states * nn.functional.silu(gate.to(torch.float32)) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + + return self.weight * hidden_states.to(input_dtype) + + +# Helper methods for segment sum computation + + +def pad_tensor_by_size(input_tensor: torch.Tensor, pad_size: int): + """ + Padding x tensor with `pad_size` on the seq_len dim (dim=1) + + Assumes that we only have tensors of either size 4 or 3 + """ + pad_shape = (0, 0, 0, 0, 0, pad_size, 0, 0) if len(input_tensor.shape) == 4 else (0, 0, 0, pad_size, 0, 0) + + return torch.nn.functional.pad(input_tensor, pad_shape, mode="constant", value=0) + + +def reshape_into_chunks(input_tensor, pad_size, chunk_size): + """ + Padding input_tensor with `pad_size` on the seq_len dim (dim=1) and + simultaneously splitting it into chunk sequences. + + Assumes that we only have tensors of either size 4 or 3 + """ + # [bsz, seq_len, ...] -> [bsz, seq_len multiple of chunk_size, ...] + input_tensor = pad_tensor_by_size(input_tensor, pad_size) + + if len(input_tensor.shape) == 3: + # [bsz, seq_len multiple of chunk_size, num_heads] -> [bsz, -1, chunk_size, num_heads] + return input_tensor.reshape(input_tensor.shape[0], -1, chunk_size, input_tensor.shape[2]) + else: + # [bsz, seq_len multiple of chunk_size, num_heads, head_dim or state_size] -> [bsz, -1, chunk_size, num_heads, head_dim or state_size] + return input_tensor.reshape( + input_tensor.shape[0], -1, chunk_size, input_tensor.shape[2], input_tensor.shape[3] + ) + + +def segment_sum(input_tensor): + """ + More stable segment sum calculation. Uses cumulative sums and masking instead of direct subtractions. + """ + chunk_size = input_tensor.size(-1) + # 1. expand input tensor to have an additional dimension and repeat along that dimension + # [..., chunk_size] -> [..., chunk_size, chunk_size] + input_tensor = input_tensor[..., None].expand(*input_tensor.size(), chunk_size) + # 2. create a lower triangular mask with the diagonal set to 0 to 0 out elements above diag + mask = torch.tril(torch.ones(chunk_size, chunk_size, device=input_tensor.device, dtype=torch.bool), diagonal=-1) + input_tensor = input_tensor.masked_fill(~mask, 0) + # 3. compute actual cumsum + tensor_segsum = torch.cumsum(input_tensor, dim=-2) + + # 4. 
apply mask to keep only the lower triangular part of the cumulative sum result (incl diagonal this time) + mask = torch.tril(torch.ones(chunk_size, chunk_size, device=input_tensor.device, dtype=torch.bool), diagonal=0) + tensor_segsum = tensor_segsum.masked_fill(~mask, -torch.inf) + return tensor_segsum + + +is_fast_path_available = all((selective_state_update, causal_conv1d_fn, causal_conv1d_update)) + + +def apply_mask_to_padding_states(hidden_states, attention_mask): + """ + Tunes out the hidden states for padding tokens, see https://github.com/state-spaces/mamba/issues/66 + """ + if attention_mask is not None and attention_mask.shape[1] > 1 and attention_mask.shape[0] > 1: + dtype = hidden_states.dtype + hidden_states = (hidden_states * attention_mask[:, :, None]).to(dtype) + + return hidden_states + + +# Adapted from transformers.models.mamba2.modeling_mamba2.Mamba2Mixer +class BambaMixer(nn.Module): + """ + Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`. + A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective) + ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4, + and is why Mamba is called **selective** state spaces) + + The are a few differences between this and Mamba2Mixer: + - The variable use_precomputed_states is slightly different due to the hybrid cache structure + - There's a few non-obvious bugs fixed with batching in the slow path that exist in main + - Some extra variables that our layer doesn't need have been removed + - We ported most of the refactors in https://github.com/huggingface/transformers/pull/35154, which is (as of Dec 18, 2024) unmerged + """ + + def __init__(self, config: BambaConfig, layer_idx: int): + super().__init__() + self.num_heads = config.mamba_n_heads + self.hidden_size = config.hidden_size + self.ssm_state_size = config.mamba_d_state + self.conv_kernel_size = config.mamba_d_conv + self.intermediate_size = int(config.mamba_expand * self.hidden_size) + self.layer_idx = layer_idx + self.use_conv_bias = config.mamba_conv_bias + self.activation = config.hidden_act + self.act = ACT2FN[config.hidden_act] + self.use_bias = config.mamba_proj_bias + + self.layer_norm_epsilon = config.rms_norm_eps + + self.n_groups = config.mamba_n_groups + self.head_dim = config.mamba_d_head + self.chunk_size = config.mamba_chunk_size + + # FIXME: + self.time_step_limit = (0.0, float("inf")) + self.time_step_min = 0.001 + self.time_step_max = 0.1 + + self.conv_dim = self.intermediate_size + 2 * self.n_groups * self.ssm_state_size + self.conv1d = nn.Conv1d( + in_channels=self.conv_dim, + out_channels=self.conv_dim, + bias=config.mamba_conv_bias, + kernel_size=self.conv_kernel_size, + groups=self.conv_dim, + padding=self.conv_kernel_size - 1, + ) + + # projection of the input hidden states + projection_size = self.intermediate_size + self.conv_dim + self.num_heads + self.in_proj = nn.Linear( + self.hidden_size, + projection_size, + bias=self.use_bias, + ) + # selective projection used to make dt, B and C input dependent + + # time step projection (discretization) + # instantiate once and copy inv_dt in init_weights of PretrainedModel + self.dt_bias = nn.Parameter(torch.ones(self.num_heads)) + + # S4D real initialization. These are not discretized! + # The core is to load them, compute the discrete states, then write the updated state. 
Keeps the memory bounded + A = torch.arange(1, self.num_heads + 1) + self.A_log = nn.Parameter(torch.log(A)) + self.norm = BambaRMSNormGated(self.intermediate_size, eps=self.layer_norm_epsilon) + self.D = nn.Parameter(torch.ones(self.num_heads)) + + self.out_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=self.use_bias) + + if not is_fast_path_available: + logger.warning_once( + "The fast path is not available because one of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)`" + " is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and" + " https://github.com/Dao-AILab/causal-conv1d" + ) + else: + logger.warning_once("The fast path for Bamba will be used when running the model on a GPU") + + def cuda_kernels_forward( + self, + hidden_states: torch.Tensor, + cache_params: Optional[HybridMambaAttentionDynamicCache] = None, + cache_position: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + seq_idx: Optional[torch.IntTensor] = None, + ): + # 1. Gated MLP's linear projection + hidden_states = apply_mask_to_padding_states(hidden_states, attention_mask) + projected_states = self.in_proj(hidden_states) + + # Set up dimensions for reshapes later + batch_size, seq_len, _ = hidden_states.shape + groups_time_state_size = self.n_groups * self.ssm_state_size + + use_precomputed_states = ( + cache_params is not None + and cache_params.has_previous_state + and seq_len == 1 + and cache_params.conv_states[self.layer_idx].shape[0] + == cache_params.ssm_states[self.layer_idx].shape[0] + == batch_size + and cache_position is not None + and cache_position[0] > 0 + ) + + # getting projected states from cache if it exists + if use_precomputed_states: + gate, hidden_states_B_C, dt = projected_states.squeeze(1).split( + [self.intermediate_size, self.conv_dim, self.num_heads], dim=-1 + ) + + # 2. Convolution sequence transformation + hidden_states_B_C = causal_conv1d_update( + hidden_states_B_C, + cache_params.conv_states[self.layer_idx], + self.conv1d.weight.squeeze(1), + self.conv1d.bias, + self.activation, + ) + + hidden_states, B, C = torch.split( + hidden_states_B_C, + [self.intermediate_size, groups_time_state_size, groups_time_state_size], + dim=-1, + ) + + # 3. SSM transformation + A = -torch.exp(self.A_log.float()) # (nheads,) + A = A[:, None, ...][:, :, None].expand(-1, self.head_dim, self.ssm_state_size).to(dtype=torch.float32) + dt = dt[:, :, None].expand(-1, -1, self.head_dim) + dt_bias = self.dt_bias[:, None, ...].expand(-1, self.head_dim) + D = self.D[:, None, ...].expand(-1, self.head_dim) + B = B.view(batch_size, self.n_groups, B.shape[1] // self.n_groups) + C = C.view(batch_size, self.n_groups, C.shape[1] // self.n_groups) + hidden_states_reshaped = hidden_states.view(batch_size, self.num_heads, self.head_dim) + hidden_states = selective_state_update( + cache_params.ssm_states[self.layer_idx], + hidden_states_reshaped, + dt, + A, + B, + C, + D, + z=None, + dt_bias=dt_bias, + dt_softplus=True, + ) + hidden_states = hidden_states.view(batch_size, self.num_heads * self.head_dim) + hidden_states = self.norm(hidden_states, gate) + + # 4. Final linear projection + out = self.out_proj(hidden_states)[:, None, ...] 
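+            # At this point `out` has shape [batch_size, 1, hidden_size]: the single-token decode path
+            # above updated the cached conv and ssm states in place and re-added the seq_len dimension.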
+ # Fused calculations or step by step if no initialized cache is found + else: + A = -torch.exp(self.A_log.float()) # (num_heads) or (intermediate_size, state_size) + dt_limit_kwargs = {} if self.time_step_limit == (0.0, float("inf")) else {"dt_limit": self.time_step_limit} + + # 2-4. Fused kernel for conv1d, SSM, and the final projection + if self.training and cache_params is None: + out = mamba_split_conv1d_scan_combined( + projected_states, + self.conv1d.weight.squeeze(1), + self.conv1d.bias, + self.dt_bias, + A, + D=self.D, + chunk_size=self.chunk_size, + seq_idx=seq_idx, + activation=self.activation, + rmsnorm_weight=self.norm.weight, + rmsnorm_eps=self.norm.variance_epsilon, + outproj_weight=self.out_proj.weight, + outproj_bias=self.out_proj.bias, + headdim=self.head_dim, + ngroups=self.n_groups, + norm_before_gate=False, + return_final_states=False, + **dt_limit_kwargs, + ) + + else: + gate, hidden_states_B_C, dt = projected_states.split( + [self.intermediate_size, self.conv_dim, self.num_heads], dim=-1 + ) + + # 2. Convolution sequence transformation + # Init cache + if cache_params is not None: + # storing the states + # If we just take xBC[:, :, -self.d_conv :], it will error if seqlen < self.d_conv + # Instead F.pad will pad with zeros if seqlen < self.d_conv, and truncate otherwise. + hidden_states_B_C_transposed = hidden_states_B_C.transpose(1, 2) + conv_states = nn.functional.pad( + hidden_states_B_C_transposed, + (self.conv_kernel_size - hidden_states_B_C_transposed.shape[-1], 0), + ) + cache_params.conv_states[self.layer_idx].copy_(conv_states) + + if self.activation not in ["silu", "swish"]: + hidden_states_B_C = self.act( + self.conv1d(hidden_states_B_C.transpose(1, 2))[..., :seq_len].transpose(1, 2) + ) + else: + hidden_states_B_C = causal_conv1d_fn( + x=hidden_states_B_C.transpose(1, 2), + weight=self.conv1d.weight.squeeze(1), + bias=self.conv1d.bias, + activation=self.activation, + seq_idx=seq_idx, + ).transpose(1, 2) + + hidden_states_B_C = apply_mask_to_padding_states(hidden_states_B_C, attention_mask) + hidden_states, B, C = torch.split( + hidden_states_B_C, + [self.intermediate_size, groups_time_state_size, groups_time_state_size], + dim=-1, + ) + + # 3. SSM transformation + scan_output, ssm_state = mamba_chunk_scan_combined( + hidden_states.view(batch_size, seq_len, -1, self.head_dim), + dt, + A, + B.view(batch_size, seq_len, self.n_groups, -1), + C.view(batch_size, seq_len, self.n_groups, -1), + chunk_size=self.chunk_size, + D=self.D, + z=None, + seq_idx=seq_idx, + return_final_states=True, + dt_bias=self.dt_bias, + dt_softplus=True, + **dt_limit_kwargs, + ) + + # Init cache + if ssm_state is not None and cache_params is not None: + cache_params.ssm_states[self.layer_idx].copy_(ssm_state) + + scan_output = scan_output.view(batch_size, seq_len, -1) + # Multiply "gate" branch and apply extra normalization layer + scan_output = self.norm(scan_output, gate) + + # 4. Final linear projection + out = self.out_proj(scan_output) + return out + + # fmt: off + def torch_forward( + self, + input_states, + cache_params: Optional[HybridMambaAttentionDynamicCache] = None, + cache_position: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + ): + batch_size, seq_len, _ = input_states.shape + dtype = input_states.dtype + + # 1. 
Gated MLP's linear projection + input_states = apply_mask_to_padding_states(input_states, attention_mask) + projected_states = self.in_proj(input_states) + gate, hidden_states_B_C, dt = projected_states.split( + [self.intermediate_size, self.conv_dim, self.num_heads], dim=-1 + ) + + use_precomputed_states = ( + cache_params is not None + and cache_params.has_previous_state + and seq_len == 1 + and cache_params.conv_states[self.layer_idx].shape[0] + == cache_params.ssm_states[self.layer_idx].shape[0] + == batch_size + and cache_position is not None + and cache_position[0] > 0 + ) + + # 2. Convolution sequence transformation + if use_precomputed_states: + cache_params.conv_states[self.layer_idx] = cache_params.conv_states[self.layer_idx].roll(shifts=-1, dims=-1) + cache_params.conv_states[self.layer_idx][:, :, -1] = hidden_states_B_C[:, 0, :].to(cache_params.conv_states[self.layer_idx].device) + + # We need to guarantee that anything regarding the cache is on the same device + conv_states = cache_params.conv_states[self.layer_idx].to(device=self.conv1d.weight.device) + + hidden_states_B_C = torch.sum( + conv_states * self.conv1d.weight.squeeze(1), dim=-1 + ) + if self.use_conv_bias: + hidden_states_B_C = hidden_states_B_C + self.conv1d.bias + hidden_states_B_C = self.act(hidden_states_B_C) + else: + # Init cache + if cache_params is not None: + hidden_states_B_C_transposed = hidden_states_B_C.transpose(1, 2) + conv_states = nn.functional.pad( + hidden_states_B_C_transposed, (self.conv_kernel_size - hidden_states_B_C_transposed.shape[-1], 0) + ) + cache_params.conv_states[self.layer_idx].copy_(conv_states) + + hidden_states_B_C = self.act(self.conv1d(hidden_states_B_C.transpose(1, 2))[..., :seq_len].transpose(1, 2)) + + hidden_states_B_C = apply_mask_to_padding_states(hidden_states_B_C, attention_mask) + hidden_states, B, C = torch.split( + hidden_states_B_C, + [self.intermediate_size, self.n_groups * self.ssm_state_size, self.n_groups * self.ssm_state_size], + dim=-1 + ) + + # 3. SSM transformation + A = -torch.exp(self.A_log.float()) # [num_heads] + if use_precomputed_states: + # We need to guarantee that anything regarding the cache is on the same device + cache_device = cache_params.ssm_states[self.layer_idx].device + + # Note: there is no need to pad parameter matrices here, as there is just one new token + # for batched generation + dt = dt[:, 0, :][:, None, ...] 
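+            # dt: [bsz, 1, num_heads] here (single decode step); it is transposed and broadcast to
+            # [bsz, num_heads, head_dim] below before the softplus/clamp discretization.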
+ dt = dt.transpose(1, 2).expand(batch_size, dt.shape[-1], self.head_dim) + # [num_heads] -> [num_heads, head_dim] + dt_bias = self.dt_bias[..., None].expand(self.dt_bias.shape[0], self.head_dim) + + dt = torch.nn.functional.softplus(dt + dt_bias.to(dt.dtype)) + dt = torch.clamp(dt, self.time_step_limit[0], self.time_step_limit[1]) + A = A[..., None, None].expand(self.num_heads, self.head_dim, self.ssm_state_size).to(dtype=torch.float32) + # [bsz, num_heads, head_dim, state_size] + dA = (torch.exp(dt[..., None] * A)).to(device=cache_device) + + # Discretize B + # [bsz, n_groups * state_size] -> [bsz, n_groups, 1, state_size] -> + # -> [bsz, n_groups, group to head repetition factor, state_size] -> [bsz, num_heads, state_size] + B = B.reshape(batch_size, self.n_groups, -1)[..., None, :] + B = B.expand(batch_size, self.n_groups, self.num_heads // self.n_groups, B.shape[-1]).contiguous() + B = B.reshape(batch_size, -1, B.shape[-1]) + # [bsz, num_heads, head_dim, state_size] + dB = dt[..., None] * B[..., None, :] + + # Discretize x into dB + # [bsz, intermediate_size] -> [bsz, num_heads, head_dim] + hidden_states = hidden_states.reshape(batch_size, -1, self.head_dim) + dBx = (dB * hidden_states[..., None]).to(device=cache_device) + + # State calculation + cache_params.ssm_states[self.layer_idx].copy_( + cache_params.ssm_states[self.layer_idx] * dA + dBx + ) + + # Subsequent output + # [bsz, n_groups * state_size] -> [bsz, num_heads, state_size] + C = C.reshape(batch_size, self.n_groups, -1)[..., None, :] + C = C.expand(batch_size, self.n_groups, self.num_heads // self.n_groups, C.shape[-1]).contiguous() + C = C.reshape(batch_size, -1, C.shape[-1]) + # [bsz, num_heads, head_dim] + + ssm_states = cache_params.ssm_states[self.layer_idx].to(device=C.device, dtype=C.dtype) # Shape: [b, h, d, n] + # Reshape ssm_states to merge the first two dimensions + ssm_states_reshaped = ssm_states.view(batch_size * self.num_heads, self.head_dim, self.ssm_state_size) # Shape: [b*h, d, n] + C_reshaped = C.view(batch_size * self.num_heads, self.ssm_state_size, 1) # Shape: [b*h, n, 1] + y = torch.bmm(ssm_states_reshaped, C_reshaped) + y = y.view(batch_size, self.num_heads, self.head_dim) + + # D skip connection + # [num_heads] -> [num_heads, head_dim] + D = self.D[..., None].expand(self.D.shape[0], self.head_dim) + y = (y + hidden_states * D).to(y.dtype) + + # [bsz, num_heads, head_dim] -> [bsz, 1, intermediate_size] + y = y.reshape(batch_size, -1)[:, None, ...] 
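+            # Summary of the single-step recurrence above: h_t = dA * h_{t-1} + dB * x_t (stored back into
+            # the cache), then y_t = C_t @ h_t + D * x_t, reshaped to [bsz, 1, intermediate_size].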
+ else: + # begin ssd naive implementation without einsums + dt = nn.functional.softplus(dt + self.dt_bias) + dt = torch.clamp(dt, self.time_step_limit[0], self.time_step_limit[1]) + hidden_states = hidden_states.reshape(batch_size, seq_len, -1, self.head_dim).float() + B = B.reshape(batch_size, seq_len, -1, self.ssm_state_size).float() + C = C.reshape(batch_size, seq_len, -1, self.ssm_state_size).float() + B = B.repeat_interleave(self.num_heads // self.n_groups, dim=2, output_size=self.num_heads) + C = C.repeat_interleave(self.num_heads // self.n_groups, dim=2, output_size=self.num_heads) + pad_size = (self.chunk_size - seq_len % self.chunk_size) % self.chunk_size + + D_residual = self.D[..., None] * pad_tensor_by_size(hidden_states, pad_size) + + # Discretize x and A + hidden_states = hidden_states * dt[..., None] + A = A.to(hidden_states.dtype) * dt + + # Rearrange into blocks/chunks + hidden_states, A, B, C = [reshape_into_chunks(t, pad_size, self.chunk_size) for t in (hidden_states, A, B, C)] + + # [bsz, -1, chunk_size, num_heads] -> [bsz, num_heads, -1, chunk_size] + A = A.permute(0, 3, 1, 2) + A_cumsum = torch.cumsum(A, dim=-1) + + # 1. Compute the output for each intra-chunk (diagonal blocks) + # This is the analog of a causal mask + L = torch.exp(segment_sum(A)) + + # Contraction of C and B to get G (attention-weights like) + G_intermediate = C[:, :, :, None, :, :] * B[:, :, None, :, :, :] # shape: (b, c, l, s, h, n) + G = G_intermediate.sum(dim=-1) # shape: (b, c, l, s, h) + + # Compute M, equivalent to applying attention mask to weights + M_intermediate = G[..., None] * L.permute(0, 2, 3, 4, 1)[..., None] + M = M_intermediate.sum(dim=-1) + + # Compute Y_diag (apply to values) + Y_diag = (M[..., None] * hidden_states[:, :, None]).sum(dim=3) + + # 2. Compute the state for each intra-chunk + # (right term of low-rank factorization of off-diagonal blocks; B terms) + decay_states = torch.exp(A_cumsum[:, :, :, -1:] - A_cumsum) + B_decay = B * decay_states.permute(0, -2, -1, 1)[..., None] + states = (B_decay[..., None, :] * hidden_states[..., None]).sum(dim=2) + + # 3. Compute the inter-chunk SSM recurrence; produces correct SSM states at chunk boundaries + # (middle term of factorization of off-diag blocks; A terms) + if use_precomputed_states: + previous_states = cache_params.ssm_states[self.layer_idx][:, None, ...].to(device=states.device) + else: + previous_states = torch.zeros_like(states[:, :1]) + states = torch.cat([previous_states, states], dim=1) + decay_chunk = torch.exp(segment_sum(nn.functional.pad(A_cumsum[:, :, :, -1], (1, 0)))) + decay_chunk = decay_chunk.transpose(1, 3) + new_states = (decay_chunk[..., None, None] * states[:, :, None, ...]).sum(dim=1) + states, ssm_state = new_states[:, :-1], new_states[:, -1] + + # 4. 
Compute state -> output conversion per chunk + # (left term of low-rank factorization of off-diagonal blocks; C terms) + state_decay_out = torch.exp(A_cumsum) + C_times_states = (C[..., None, :] * states[:, :, None, ...]) + state_decay_out_permuted = state_decay_out.permute(0, 2, 3, 1) + Y_off = (C_times_states.sum(-1) * state_decay_out_permuted[..., None]) + + # Add output of intra-chunk and inter-chunk terms (diagonal and off-diagonal blocks) + y = Y_diag + Y_off + # [bsz, -1, self.chunk_size, num_heads, head_dim] -> [bsz, (padded) seq_len, num_heads, head_dim] + y = y.reshape(batch_size, -1, self.num_heads, self.head_dim) + + y = y + D_residual + # Cutting off padded chunks + if pad_size > 0: + y = y[:, :seq_len, :, :] + y = y.reshape(batch_size, seq_len, -1) + + # Init cache + if ssm_state is not None and cache_params is not None: + cache_params.ssm_states[self.layer_idx].copy_(ssm_state) + + scan_output = self.norm(y, gate) + + # end ssd naive + + # 4. Final linear projection + contextualized_states = self.out_proj(scan_output.to(dtype)) # [batch, seq_len, hidden_size] + return contextualized_states + # fmt: on + + def forward( + self, + hidden_states, + cache_params: Optional[HybridMambaAttentionDynamicCache] = None, + cache_position: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + seq_idx: Optional[torch.IntTensor] = None, + **kwargs, + ): + if is_fast_path_available and "cuda" in self.in_proj.weight.device.type: + return self.cuda_kernels_forward(hidden_states, cache_params, cache_position, attention_mask, seq_idx) + if seq_idx is not None: + raise NotImplementedError( + "`seq_idx` support requires fast path support. Please install `mamba_ssm` and `causal_conv1d`" + ) + dtype = hidden_states.dtype + if attention_mask is not None and attention_mask.shape[1] > 1 and attention_mask.shape[0] > 1: + # tune out hidden states for pad tokens, see https://github.com/state-spaces/mamba/issues/66 + hidden_states = (hidden_states * attention_mask[:, :, None]).to(dtype) + + return self.torch_forward(hidden_states, cache_params, cache_position, attention_mask) + + +class BambaMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + return down_proj + + +@use_kernel_forward_from_hub("RMSNorm") +class BambaRMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + BambaRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + def extra_repr(self): + return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" + + +class BambaDecoderLayer(GradientCheckpointingLayer): + def __init__(self, config: 
BambaConfig, layer_idx: int, layer_type: str = "mamba"): + super().__init__() + + num_experts = 1 + ffn_layer_class = BambaMLP if num_experts == 1 else None + self.feed_forward = ffn_layer_class(config) + self.input_layernorm = BambaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.pre_ff_layernorm = BambaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.layer_type = layer_type + if layer_type == "mamba": + self.mamba = BambaMixer(config=config, layer_idx=layer_idx) + elif layer_type == "attention": + self.self_attn = BambaAttention(config, layer_idx) + else: + raise ValueError("Invalid layer_type") + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[HybridMambaAttentionDynamicCache] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + **kwargs: Unpack[BambaFlashAttentionKwargs], + ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, sequence_length)` where padding elements are indicated by 0. + past_key_values (`HybridMambaAttentionDynamicCache`, *optional*): cached past key and value projection states + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. + position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*): + Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`, + with `head_dim` being the embedding dimension of each attention head. + kwargs (`dict`, *optional*): + Arbitrary kwargs. Can be used to provide `BambaFlashAttentionKwargs` for + padding-free training and/or improve torch.compile performance. 
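+
+        Note: the same layer class serves both block types. `self.layer_type` selects between the mamba mixer
+        and self-attention, and mamba layers return `None` in place of attention weights.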
+ """ + + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # this is a hybrid decoder layer + if self.layer_type == "mamba": + hidden_states = self.mamba( + hidden_states=hidden_states, + cache_params=past_key_values, + cache_position=cache_position, + attention_mask=attention_mask, + **kwargs, + ) + self_attn_weights = None + elif self.layer_type == "attention": + hidden_states, self_attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + **kwargs, + ) + + # residual connection after attention + hidden_states = residual + hidden_states + + # feed-forward + residual = hidden_states + hidden_states = self.pre_ff_layernorm(hidden_states) + hidden_states = self.feed_forward(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + return outputs + + +@auto_docstring +class BambaPreTrainedModel(PreTrainedModel): + config: BambaConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["BambaDecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn = True + _supports_sdpa = True + # Note: only supports HybridMambaAttentionDynamicCache + _is_stateful = True + + def _init_weights(self, module): + super()._init_weights(module) + if isinstance(module, BambaMixer): + module.dt_bias.data.fill_(1.0) + module.A_log.data = torch.log(torch.arange(1, module.num_heads + 1)) + module.D.data.fill_(1.0) + + +@auto_docstring +class BambaModel(BambaPreTrainedModel): + def __init__(self, config: BambaConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + decoder_layers = [] + for i in range(config.num_hidden_layers): + decoder_layers.append(BambaDecoderLayer(config, layer_idx=i, layer_type=config.layers_block_type[i])) + self.layers = nn.ModuleList(decoder_layers) + + self._attn_implementation = config._attn_implementation + self.final_layernorm = BambaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.rotary_emb = BambaRotaryEmbedding(config=config) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[HybridMambaAttentionDynamicCache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[BambaFlashAttentionKwargs], + ) -> BaseModelOutputWithPast: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + if (input_ids is None) ^ (inputs_embeds is not None): + raise 
ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if self.gradient_checkpointing and self.training and use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`." + ) + use_cache = False + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + hidden_states = inputs_embeds + + if use_cache and past_key_values is None: + logger.warning_once( + "Bamba requires an initialized `HybridMambaAttentionDynamicCache` to return a cache. None was " + "provided, so no cache will be returned." + ) + + if cache_position is None: + cache_position = torch.arange(hidden_states.shape[1], device=hidden_states.device) + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + causal_mask = self._update_causal_mask( + attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions + ) + mamba_mask = self._update_mamba_mask(attention_mask, cache_position) + + # create position embeddings to be shared across the decoder layers + position_embeddings = self.rotary_emb(hidden_states, position_ids) + + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + + for decoder_layer in self.layers: + # Depending on the layer type we opt for 2D base attention mask (Mamba) or 4D causal mask (Attention) + layer_mask = mamba_mask if decoder_layer.layer_type == "mamba" else causal_mask + + if output_hidden_states: + all_hidden_states += (hidden_states,) + + layer_outputs = decoder_layer( + hidden_states, + attention_mask=layer_mask, + position_ids=position_ids, + past_key_values=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + **kwargs, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + if layer_outputs[1] is not None: + # append attentions only of attention layers. Mamba layers return `None` as the attention weights + all_self_attns += (layer_outputs[1],) + + hidden_states = self.final_layernorm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if past_key_values and not past_key_values.has_previous_state: + past_key_values.has_previous_state = True + + next_cache = None if not use_cache else past_key_values + + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + def _update_causal_mask( + self, + attention_mask: torch.Tensor, + input_tensor: torch.Tensor, + cache_position: torch.Tensor, + past_key_values: HybridMambaAttentionDynamicCache, + output_attentions: bool, + ): + if self.config._attn_implementation == "flash_attention_2": + if attention_mask is not None and 0.0 in attention_mask: + return attention_mask + return None + + # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in + # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail + # to infer the attention mask. 
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + + # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward + if self.config._attn_implementation == "sdpa" and not output_attentions: + if AttentionMaskConverter._ignore_causal_mask_sdpa( + attention_mask, + inputs_embeds=input_tensor, + past_key_values_length=past_seen_tokens, + is_training=self.training, + ): + return None + + dtype = input_tensor.dtype + sequence_length = input_tensor.shape[1] + target_length = ( + attention_mask.shape[-1] + if isinstance(attention_mask, torch.Tensor) + else past_seen_tokens + sequence_length + 1 + ) + + # In case the provided `attention` mask is 2D, we generate a causal mask here (4D). + causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position( + attention_mask, + sequence_length=sequence_length, + target_length=target_length, + dtype=dtype, + cache_position=cache_position, + batch_size=input_tensor.shape[0], + ) + + if ( + self.config._attn_implementation == "sdpa" + and attention_mask is not None + and attention_mask.device.type in ["cuda", "xpu", "npu"] + and not output_attentions + ): + # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when + # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. + # Details: https://github.com/pytorch/pytorch/issues/110213 + min_dtype = torch.finfo(dtype).min + causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) + + return causal_mask + + @staticmethod + def _prepare_4d_causal_attention_mask_with_cache_position( + attention_mask: torch.Tensor, + sequence_length: int, + target_length: int, + dtype: torch.dtype, + cache_position: torch.Tensor, + batch_size: int, + **kwargs, + ): + """ + Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape + `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. + + Args: + attention_mask (`torch.Tensor`): + A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape + `(batch_size, 1, query_length, key_value_length)`. + sequence_length (`int`): + The sequence length being processed. + target_length (`int`): + The target length: when generating with static cache, the mask should be as long as the static cache, + to account for the 0 padding, the part of the cache that is not filled yet. + dtype (`torch.dtype`): + The dtype to use for the 4D attention mask. + cache_position (`torch.Tensor`): + Indices depicting the position of the input sequence tokens in the sequence. + batch_size (`torch.Tensor`): + Batch size. + """ + if attention_mask is not None and attention_mask.dim() == 4: + # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. 
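# --- Illustrative aside (not part of the diff above) ---------------------------------
# "Inverted form" above means an additive mask: 0.0 where attention is allowed and the
# dtype minimum where it is blocked, so it can simply be added to the attention scores
# before the softmax. A minimal sketch converting a 2D padding mask, assuming no cache
# and made-up values:
import torch

attention_mask = torch.tensor([[1, 1, 1, 0]])          # (batch, key_len); 0 marks padding
dtype = torch.float32
min_dtype = torch.finfo(dtype).min
inverted_mask = (1.0 - attention_mask.to(dtype))[:, None, None, :] * min_dtype
# inverted_mask: shape (batch, 1, 1, key_len); 0.0 for real tokens, min_dtype for padding.
# --------------------------------------------------------------------------------------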
+ causal_mask = attention_mask + else: + min_dtype = torch.finfo(dtype).min + causal_mask = torch.full( + (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device + ) + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + mask_length = attention_mask.shape[-1] + padding_attention_mask = (attention_mask[:, None, None, :] == attention_mask[:, None, :, None])[ + :, :, -sequence_length:, : + ].to(dtype) + padding_mask = causal_mask[:, :, :, :mask_length] + padding_attention_mask + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + + return causal_mask + + def _update_mamba_mask(self, attention_mask, cache_position): + """ + No need for zeroing states when + 1. Cached forward + 2. Attending to all inputs + """ + mamba_mask = attention_mask + if cache_position[0] > 0 or (attention_mask is not None and torch.all(attention_mask == 1)): + mamba_mask = None + return mamba_mask + + +@auto_docstring +class BambaForCausalLM(BambaPreTrainedModel, GenerationMixin): + _tied_weights_keys = ["lm_head.weight"] + _tp_plan = {"lm_head": "colwise_rep"} + _pp_plan = {"lm_head": (["hidden_states"], ["logits"])} + + def __init__(self, config): + super().__init__(config) + self.model = BambaModel(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + self.z_loss_coefficient = config.z_loss_coefficient + + # Initialize weights and apply final processing + self.post_init() + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[HybridMambaAttentionDynamicCache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **kwargs, + ) -> CausalLMOutputWithPast: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Example: + + ```python + >>> from transformers import AutoTokenizer, BambaForCausalLM + + >>> model = BambaForCausalLM.from_pretrained("...") + >>> tokenizer = AutoTokenizer.from_pretrained("...") + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." 
+ ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs: BaseModelOutputWithPast = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs.last_hidden_state + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) + + loss = None + if labels is not None: + loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs) + if self.z_loss_coefficient > 0: + # Type-match loss, but avoid upcasting large logits tensor until after it's been reduced on dim -1 + z_loss = logits.logsumexp(dim=-1).to(dtype=loss.dtype).pow(2).mean() + loss = loss + self.z_loss_coefficient * z_loss + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + attention_mask=None, + inputs_embeds=None, + cache_position=None, + position_ids=None, + use_cache=True, + **kwargs, + ): + # Overwritten -- has a unique cache type, `HybridMambaAttentionDynamicCache` + + empty_past_kv = past_key_values is None + + # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens + # Exception 1: when passing input_embeds, input_ids may be missing entries + # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here + # Exception 3: with synced GPUs cache_position may go out of bounds, but we only want dummy token in that case. 
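# --- Illustrative aside (not part of the diff above) ---------------------------------
# A minimal sketch of the auxiliary z-loss added in `BambaForCausalLM.forward` above:
# the squared log-partition (logsumexp over the vocabulary) is averaged and scaled by
# `z_loss_coefficient`, discouraging the logits from growing without bound. The tensor
# sizes, the stand-in cross-entropy value and the 0.01 coefficient are made up:
import torch

logits = torch.randn(2, 5, 11)                         # (batch, seq, vocab)
ce_loss = torch.tensor(2.3)                            # stand-in for the cross-entropy loss
z_loss_coefficient = 0.01
z_loss = logits.logsumexp(dim=-1).to(dtype=ce_loss.dtype).pow(2).mean()
loss = ce_loss + z_loss_coefficient * z_loss
# --------------------------------------------------------------------------------------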
+ # (we can't check exception 3 while compiling) + if not empty_past_kv: + if ( + inputs_embeds is not None # Exception 1 + or cache_position[-1] >= input_ids.shape[1] # Exception 3 + ): + input_ids = input_ids[:, -cache_position.shape[0] :] + elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2) + input_ids = input_ids[:, cache_position] + else: + past_key_values = HybridMambaAttentionDynamicCache( + self.config, input_ids.shape[0], self.dtype, device=self.device + ) + + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if not empty_past_kv: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and empty_past_kv: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids.contiguous()} # `contiguous()` needed for compilation use cases + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": use_cache, + "attention_mask": attention_mask, + "logits_to_keep": self.config.num_logits_to_keep, + "cache_position": cache_position, + } + ) + + # Forward ALL kwargs that are uninitialized (e.g. `use_cache`). + for key, value in kwargs.items(): + if key not in model_inputs: + model_inputs[key] = value + + return model_inputs + + +__all__ = ["BambaModel", "BambaForCausalLM", "BambaPreTrainedModel"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bamba/modular_bamba.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bamba/modular_bamba.py new file mode 100644 index 0000000000000000000000000000000000000000..5ae5313d21b83de27a95e2179906219d57779094 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bamba/modular_bamba.py @@ -0,0 +1,1222 @@ +# coding=utf-8 +# Copyright 2024 IBM and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
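# --- Illustrative aside (not part of the diff above) ---------------------------------
# `prepare_inputs_for_generation` above derives `position_ids` from a left-padded
# attention mask via a cumulative sum; padded positions receive a dummy value of 1.
# A small worked example with made-up masks:
import torch

attention_mask = torch.tensor([[0, 0, 1, 1, 1],
                               [1, 1, 1, 1, 1]])
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
# position_ids is now tensor([[1, 1, 0, 1, 2],
#                             [0, 1, 2, 3, 4]])
# --------------------------------------------------------------------------------------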
+"""PyTorch Bamba model.""" + +from typing import Optional, TypedDict, Union + +import torch +from torch import nn + +from transformers.activations import ACT2FN +from transformers.models.jamba.modeling_jamba import HybridMambaAttentionDynamicCache, JambaAttentionDecoderLayer +from transformers.models.llama.modeling_llama import ( + LlamaAttention, + LlamaForCausalLM, + LlamaMLP, + LlamaRMSNorm, + LlamaRotaryEmbedding, + rotate_half, +) +from transformers.models.mamba2.modeling_mamba2 import ( + MambaRMSNormGated, + pad_tensor_by_size, + reshape_into_chunks, + segment_sum, +) + +from ...modeling_attn_mask_utils import AttentionMaskConverter +from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast +from ...modeling_utils import PreTrainedModel +from ...processing_utils import Unpack +from ...utils import ( + auto_docstring, + can_return_tuple, + logging, +) +from ...utils.deprecation import deprecate_kwarg +from ...utils.import_utils import is_causal_conv1d_available, is_mamba_2_ssm_available +from .configuration_bamba import BambaConfig + + +if is_mamba_2_ssm_available(): + from mamba_ssm.ops.triton.selective_state_update import selective_state_update + from mamba_ssm.ops.triton.ssd_combined import mamba_chunk_scan_combined, mamba_split_conv1d_scan_combined +else: + selective_state_update = None + +if is_causal_conv1d_available(): + from causal_conv1d import causal_conv1d_fn, causal_conv1d_update +else: + causal_conv1d_update, causal_conv1d_fn = None, None + +is_fast_path_available = all((selective_state_update, causal_conv1d_fn, causal_conv1d_update)) + + +logger = logging.get_logger(__name__) + + +class BambaFlashAttentionKwargs(TypedDict, total=False): + """ + Keyword arguments for advanced Flash Attention, causal-conv1d, and mamba_ssm kernel usage. + Use cases include padding-free training and fewer `torch.compile` graph breaks. + + Attributes: + cu_seq_lens_q (`torch.LongTensor`) + Gets cumulative sequence length for query state. + cu_seq_lens_k (`torch.LongTensor`) + Gets cumulative sequence length for key state. + max_length_q (`int`): + Maximum sequence length for query state. + max_length_k (`int`): + Maximum sequence length for key state. + seq_idx (`torch.IntTensor): + Index of each packed sequence. + """ + + cu_seq_lens_q: torch.LongTensor + cu_seq_lens_k: torch.LongTensor + max_length_q: int + max_length_k: int + seq_idx: torch.IntTensor + + +# Adapted from transformers.models.jamba.modeling_jamba.HybridMambaAttentionDynamicCache for the v2 mixer +class HybridMambaAttentionDynamicCache(HybridMambaAttentionDynamicCache): + """ + A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache + (which has a constant shape regardless of seq_len). + + This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states` + and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors. The expected shape for each tensor + For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`, + while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors). + For mamba layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors), + while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`, + and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`. 
+ """ + + def __init__(self, config: BambaConfig, batch_size, dtype=torch.float16, device=None): + self.layers_block_type = config.layers_block_type + self.has_previous_state = False # only used by mamba + conv_kernel_size = config.mamba_d_conv + ssm_state_size = config.mamba_d_state + + self.conv_states = [] + self.ssm_states = [] + self.transformer_layers = [] + for i in range(config.num_hidden_layers): + if self.layers_block_type[i] == "mamba": + self.conv_states += [ + torch.zeros( + batch_size, + (config.mamba_expand * config.hidden_size + 2 * config.mamba_n_groups * ssm_state_size), + conv_kernel_size, + device=device, + dtype=dtype, + ) + ] + self.ssm_states += [ + torch.zeros( + batch_size, + config.mamba_n_heads, + config.mamba_d_head, + ssm_state_size, + device=device, + dtype=dtype, + ) + ] + else: + self.conv_states += [torch.tensor([[]] * batch_size, device=device)] + self.ssm_states += [torch.tensor([[]] * batch_size, device=device)] + self.transformer_layers.append(i) + + self.key_cache = [torch.tensor([[]] * batch_size, device=device) for _ in range(config.num_hidden_layers)] + self.value_cache = [torch.tensor([[]] * batch_size, device=device) for _ in range(config.num_hidden_layers)] + + +class BambaRotaryEmbedding(LlamaRotaryEmbedding): + pass + + +# Adapted from transformers.models.glm.modular_glm.apply_rotary_pos_emb +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Removes the interleaving of cos and sin from GLM + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
+ """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + + # Keep half or full tensor for later concatenation + rotary_dim = cos.shape[-1] + q_rot, q_pass = q[..., :rotary_dim], q[..., rotary_dim:] + k_rot, k_pass = k[..., :rotary_dim], k[..., rotary_dim:] + + # Apply rotary embeddings on the first half or full tensor + q_embed = (q_rot * cos) + (rotate_half(q_rot) * sin) + k_embed = (k_rot * cos) + (rotate_half(k_rot) * sin) + + # Concatenate back to full shape + q_embed = torch.cat([q_embed, q_pass], dim=-1) + k_embed = torch.cat([k_embed, k_pass], dim=-1) + return q_embed, k_embed + + +class BambaAttention(LlamaAttention): + pass + + +class BambaRMSNormGated(MambaRMSNormGated): + pass + + +def apply_mask_to_padding_states(hidden_states, attention_mask): + """ + Tunes out the hidden states for padding tokens, see https://github.com/state-spaces/mamba/issues/66 + """ + if attention_mask is not None and attention_mask.shape[1] > 1 and attention_mask.shape[0] > 1: + dtype = hidden_states.dtype + hidden_states = (hidden_states * attention_mask[:, :, None]).to(dtype) + + return hidden_states + + +# Adapted from transformers.models.mamba2.modeling_mamba2.Mamba2Mixer +class BambaMixer(nn.Module): + """ + Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`. + A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective) + ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4, + and is why Mamba is called **selective** state spaces) + + The are a few differences between this and Mamba2Mixer: + - The variable use_precomputed_states is slightly different due to the hybrid cache structure + - There's a few non-obvious bugs fixed with batching in the slow path that exist in main + - Some extra variables that our layer doesn't need have been removed + - We ported most of the refactors in https://github.com/huggingface/transformers/pull/35154, which is (as of Dec 18, 2024) unmerged + """ + + def __init__(self, config: BambaConfig, layer_idx: int): + super().__init__() + self.num_heads = config.mamba_n_heads + self.hidden_size = config.hidden_size + self.ssm_state_size = config.mamba_d_state + self.conv_kernel_size = config.mamba_d_conv + self.intermediate_size = int(config.mamba_expand * self.hidden_size) + self.layer_idx = layer_idx + self.use_conv_bias = config.mamba_conv_bias + self.activation = config.hidden_act + self.act = ACT2FN[config.hidden_act] + self.use_bias = config.mamba_proj_bias + + self.layer_norm_epsilon = config.rms_norm_eps + + self.n_groups = config.mamba_n_groups + self.head_dim = config.mamba_d_head + self.chunk_size = config.mamba_chunk_size + + # FIXME: + self.time_step_limit = (0.0, float("inf")) + self.time_step_min = 0.001 + self.time_step_max = 0.1 + + self.conv_dim = self.intermediate_size + 2 * self.n_groups * self.ssm_state_size + self.conv1d = nn.Conv1d( + in_channels=self.conv_dim, + out_channels=self.conv_dim, + bias=config.mamba_conv_bias, + kernel_size=self.conv_kernel_size, + groups=self.conv_dim, + padding=self.conv_kernel_size - 1, + ) + + # projection of the input hidden states + projection_size = self.intermediate_size + self.conv_dim + self.num_heads + self.in_proj = nn.Linear( + self.hidden_size, + projection_size, + bias=self.use_bias, + ) + # selective projection used to make dt, B and C input dependent + + # time step projection (discretization) + # instantiate once and copy inv_dt in init_weights 
of PretrainedModel + self.dt_bias = nn.Parameter(torch.ones(self.num_heads)) + + # S4D real initialization. These are not discretized! + # The core is to load them, compute the discrete states, then write the updated state. Keeps the memory bounded + A = torch.arange(1, self.num_heads + 1) + self.A_log = nn.Parameter(torch.log(A)) + self.norm = BambaRMSNormGated(self.intermediate_size, eps=self.layer_norm_epsilon) + self.D = nn.Parameter(torch.ones(self.num_heads)) + + self.out_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=self.use_bias) + + if not is_fast_path_available: + logger.warning_once( + "The fast path is not available because one of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)`" + " is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and" + " https://github.com/Dao-AILab/causal-conv1d" + ) + else: + logger.warning_once("The fast path for Bamba will be used when running the model on a GPU") + + def cuda_kernels_forward( + self, + hidden_states: torch.Tensor, + cache_params: Optional[HybridMambaAttentionDynamicCache] = None, + cache_position: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + seq_idx: Optional[torch.IntTensor] = None, + ): + # 1. Gated MLP's linear projection + hidden_states = apply_mask_to_padding_states(hidden_states, attention_mask) + projected_states = self.in_proj(hidden_states) + + # Set up dimensions for reshapes later + batch_size, seq_len, _ = hidden_states.shape + groups_time_state_size = self.n_groups * self.ssm_state_size + + use_precomputed_states = ( + cache_params is not None + and cache_params.has_previous_state + and seq_len == 1 + and cache_params.conv_states[self.layer_idx].shape[0] + == cache_params.ssm_states[self.layer_idx].shape[0] + == batch_size + and cache_position is not None + and cache_position[0] > 0 + ) + + # getting projected states from cache if it exists + if use_precomputed_states: + gate, hidden_states_B_C, dt = projected_states.squeeze(1).split( + [self.intermediate_size, self.conv_dim, self.num_heads], dim=-1 + ) + + # 2. Convolution sequence transformation + hidden_states_B_C = causal_conv1d_update( + hidden_states_B_C, + cache_params.conv_states[self.layer_idx], + self.conv1d.weight.squeeze(1), + self.conv1d.bias, + self.activation, + ) + + hidden_states, B, C = torch.split( + hidden_states_B_C, + [self.intermediate_size, groups_time_state_size, groups_time_state_size], + dim=-1, + ) + + # 3. SSM transformation + A = -torch.exp(self.A_log.float()) # (nheads,) + A = A[:, None, ...][:, :, None].expand(-1, self.head_dim, self.ssm_state_size).to(dtype=torch.float32) + dt = dt[:, :, None].expand(-1, -1, self.head_dim) + dt_bias = self.dt_bias[:, None, ...].expand(-1, self.head_dim) + D = self.D[:, None, ...].expand(-1, self.head_dim) + B = B.view(batch_size, self.n_groups, B.shape[1] // self.n_groups) + C = C.view(batch_size, self.n_groups, C.shape[1] // self.n_groups) + hidden_states_reshaped = hidden_states.view(batch_size, self.num_heads, self.head_dim) + hidden_states = selective_state_update( + cache_params.ssm_states[self.layer_idx], + hidden_states_reshaped, + dt, + A, + B, + C, + D, + z=None, + dt_bias=dt_bias, + dt_softplus=True, + ) + hidden_states = hidden_states.view(batch_size, self.num_heads * self.head_dim) + hidden_states = self.norm(hidden_states, gate) + + # 4. Final linear projection + out = self.out_proj(hidden_states)[:, None, ...] 
+ # Fused calculations or step by step if no initialized cache is found + else: + A = -torch.exp(self.A_log.float()) # (num_heads) or (intermediate_size, state_size) + dt_limit_kwargs = {} if self.time_step_limit == (0.0, float("inf")) else {"dt_limit": self.time_step_limit} + + # 2-4. Fused kernel for conv1d, SSM, and the final projection + if self.training and cache_params is None: + out = mamba_split_conv1d_scan_combined( + projected_states, + self.conv1d.weight.squeeze(1), + self.conv1d.bias, + self.dt_bias, + A, + D=self.D, + chunk_size=self.chunk_size, + seq_idx=seq_idx, + activation=self.activation, + rmsnorm_weight=self.norm.weight, + rmsnorm_eps=self.norm.variance_epsilon, + outproj_weight=self.out_proj.weight, + outproj_bias=self.out_proj.bias, + headdim=self.head_dim, + ngroups=self.n_groups, + norm_before_gate=False, + return_final_states=False, + **dt_limit_kwargs, + ) + + else: + gate, hidden_states_B_C, dt = projected_states.split( + [self.intermediate_size, self.conv_dim, self.num_heads], dim=-1 + ) + + # 2. Convolution sequence transformation + # Init cache + if cache_params is not None: + # storing the states + # If we just take xBC[:, :, -self.d_conv :], it will error if seqlen < self.d_conv + # Instead F.pad will pad with zeros if seqlen < self.d_conv, and truncate otherwise. + hidden_states_B_C_transposed = hidden_states_B_C.transpose(1, 2) + conv_states = nn.functional.pad( + hidden_states_B_C_transposed, + (self.conv_kernel_size - hidden_states_B_C_transposed.shape[-1], 0), + ) + cache_params.conv_states[self.layer_idx].copy_(conv_states) + + if self.activation not in ["silu", "swish"]: + hidden_states_B_C = self.act( + self.conv1d(hidden_states_B_C.transpose(1, 2))[..., :seq_len].transpose(1, 2) + ) + else: + hidden_states_B_C = causal_conv1d_fn( + x=hidden_states_B_C.transpose(1, 2), + weight=self.conv1d.weight.squeeze(1), + bias=self.conv1d.bias, + activation=self.activation, + seq_idx=seq_idx, + ).transpose(1, 2) + + hidden_states_B_C = apply_mask_to_padding_states(hidden_states_B_C, attention_mask) + hidden_states, B, C = torch.split( + hidden_states_B_C, + [self.intermediate_size, groups_time_state_size, groups_time_state_size], + dim=-1, + ) + + # 3. SSM transformation + scan_output, ssm_state = mamba_chunk_scan_combined( + hidden_states.view(batch_size, seq_len, -1, self.head_dim), + dt, + A, + B.view(batch_size, seq_len, self.n_groups, -1), + C.view(batch_size, seq_len, self.n_groups, -1), + chunk_size=self.chunk_size, + D=self.D, + z=None, + seq_idx=seq_idx, + return_final_states=True, + dt_bias=self.dt_bias, + dt_softplus=True, + **dt_limit_kwargs, + ) + + # Init cache + if ssm_state is not None and cache_params is not None: + cache_params.ssm_states[self.layer_idx].copy_(ssm_state) + + scan_output = scan_output.view(batch_size, seq_len, -1) + # Multiply "gate" branch and apply extra normalization layer + scan_output = self.norm(scan_output, gate) + + # 4. Final linear projection + out = self.out_proj(scan_output) + return out + + # fmt: off + def torch_forward( + self, + input_states, + cache_params: Optional[HybridMambaAttentionDynamicCache] = None, + cache_position: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + ): + batch_size, seq_len, _ = input_states.shape + dtype = input_states.dtype + + # 1. 
Gated MLP's linear projection + input_states = apply_mask_to_padding_states(input_states, attention_mask) + projected_states = self.in_proj(input_states) + gate, hidden_states_B_C, dt = projected_states.split( + [self.intermediate_size, self.conv_dim, self.num_heads], dim=-1 + ) + + use_precomputed_states = ( + cache_params is not None + and cache_params.has_previous_state + and seq_len == 1 + and cache_params.conv_states[self.layer_idx].shape[0] + == cache_params.ssm_states[self.layer_idx].shape[0] + == batch_size + and cache_position is not None + and cache_position[0] > 0 + ) + + # 2. Convolution sequence transformation + if use_precomputed_states: + cache_params.conv_states[self.layer_idx] = cache_params.conv_states[self.layer_idx].roll(shifts=-1, dims=-1) + cache_params.conv_states[self.layer_idx][:, :, -1] = hidden_states_B_C[:, 0, :].to(cache_params.conv_states[self.layer_idx].device) + + # We need to guarantee that anything regarding the cache is on the same device + conv_states = cache_params.conv_states[self.layer_idx].to(device=self.conv1d.weight.device) + + hidden_states_B_C = torch.sum( + conv_states * self.conv1d.weight.squeeze(1), dim=-1 + ) + if self.use_conv_bias: + hidden_states_B_C = hidden_states_B_C + self.conv1d.bias + hidden_states_B_C = self.act(hidden_states_B_C) + else: + # Init cache + if cache_params is not None: + hidden_states_B_C_transposed = hidden_states_B_C.transpose(1, 2) + conv_states = nn.functional.pad( + hidden_states_B_C_transposed, (self.conv_kernel_size - hidden_states_B_C_transposed.shape[-1], 0) + ) + cache_params.conv_states[self.layer_idx].copy_(conv_states) + + hidden_states_B_C = self.act(self.conv1d(hidden_states_B_C.transpose(1, 2))[..., :seq_len].transpose(1, 2)) + + hidden_states_B_C = apply_mask_to_padding_states(hidden_states_B_C, attention_mask) + hidden_states, B, C = torch.split( + hidden_states_B_C, + [self.intermediate_size, self.n_groups * self.ssm_state_size, self.n_groups * self.ssm_state_size], + dim=-1 + ) + + # 3. SSM transformation + A = -torch.exp(self.A_log.float()) # [num_heads] + if use_precomputed_states: + # We need to guarantee that anything regarding the cache is on the same device + cache_device = cache_params.ssm_states[self.layer_idx].device + + # Note: there is no need to pad parameter matrices here, as there is just one new token + # for batched generation + dt = dt[:, 0, :][:, None, ...] 
+ dt = dt.transpose(1, 2).expand(batch_size, dt.shape[-1], self.head_dim) + # [num_heads] -> [num_heads, head_dim] + dt_bias = self.dt_bias[..., None].expand(self.dt_bias.shape[0], self.head_dim) + + dt = torch.nn.functional.softplus(dt + dt_bias.to(dt.dtype)) + dt = torch.clamp(dt, self.time_step_limit[0], self.time_step_limit[1]) + A = A[..., None, None].expand(self.num_heads, self.head_dim, self.ssm_state_size).to(dtype=torch.float32) + # [bsz, num_heads, head_dim, state_size] + dA = (torch.exp(dt[..., None] * A)).to(device=cache_device) + + # Discretize B + # [bsz, n_groups * state_size] -> [bsz, n_groups, 1, state_size] -> + # -> [bsz, n_groups, group to head repetition factor, state_size] -> [bsz, num_heads, state_size] + B = B.reshape(batch_size, self.n_groups, -1)[..., None, :] + B = B.expand(batch_size, self.n_groups, self.num_heads // self.n_groups, B.shape[-1]).contiguous() + B = B.reshape(batch_size, -1, B.shape[-1]) + # [bsz, num_heads, head_dim, state_size] + dB = dt[..., None] * B[..., None, :] + + # Discretize x into dB + # [bsz, intermediate_size] -> [bsz, num_heads, head_dim] + hidden_states = hidden_states.reshape(batch_size, -1, self.head_dim) + dBx = (dB * hidden_states[..., None]).to(device=cache_device) + + # State calculation + cache_params.ssm_states[self.layer_idx].copy_( + cache_params.ssm_states[self.layer_idx] * dA + dBx + ) + + # Subsequent output + # [bsz, n_groups * state_size] -> [bsz, num_heads, state_size] + C = C.reshape(batch_size, self.n_groups, -1)[..., None, :] + C = C.expand(batch_size, self.n_groups, self.num_heads // self.n_groups, C.shape[-1]).contiguous() + C = C.reshape(batch_size, -1, C.shape[-1]) + # [bsz, num_heads, head_dim] + + ssm_states = cache_params.ssm_states[self.layer_idx].to(device=C.device, dtype=C.dtype) # Shape: [b, h, d, n] + # Reshape ssm_states to merge the first two dimensions + ssm_states_reshaped = ssm_states.view(batch_size * self.num_heads, self.head_dim, self.ssm_state_size) # Shape: [b*h, d, n] + C_reshaped = C.view(batch_size * self.num_heads, self.ssm_state_size, 1) # Shape: [b*h, n, 1] + y = torch.bmm(ssm_states_reshaped, C_reshaped) + y = y.view(batch_size, self.num_heads, self.head_dim) + + # D skip connection + # [num_heads] -> [num_heads, head_dim] + D = self.D[..., None].expand(self.D.shape[0], self.head_dim) + y = (y + hidden_states * D).to(y.dtype) + + # [bsz, num_heads, head_dim] -> [bsz, 1, intermediate_size] + y = y.reshape(batch_size, -1)[:, None, ...] 
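# --- Illustrative aside (not part of the diff above) ---------------------------------
# The cached single-token path above applies the discretised SSM recurrence
# h <- exp(dt*A) * h + (dt*B) * x and reads out y = h @ C + D * x. A minimal sketch for
# one head, with made-up sizes and values (dt_bias, clamping and grouping are omitted):
import torch

head_dim, state_size = 4, 8
h = torch.zeros(head_dim, state_size)                   # recurrent state for this head
x = torch.randn(head_dim)                               # current token's input slice
A = torch.tensor(-1.0)                                  # per-head scalar A (pre-discretisation)
B = torch.randn(state_size)
C = torch.randn(state_size)
D = torch.ones(head_dim)
dt = torch.nn.functional.softplus(torch.tensor(0.5))    # time step after softplus

dA = torch.exp(dt * A)                                  # decay applied to the old state
dBx = (dt * B)[None, :] * x[:, None]                    # input contribution, (head_dim, state_size)
h = h * dA + dBx                                        # state update
y = h @ C + D * x                                       # output plus the D skip connection
# --------------------------------------------------------------------------------------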
+ else: + # begin ssd naive implementation without einsums + dt = nn.functional.softplus(dt + self.dt_bias) + dt = torch.clamp(dt, self.time_step_limit[0], self.time_step_limit[1]) + hidden_states = hidden_states.reshape(batch_size, seq_len, -1, self.head_dim).float() + B = B.reshape(batch_size, seq_len, -1, self.ssm_state_size).float() + C = C.reshape(batch_size, seq_len, -1, self.ssm_state_size).float() + B = B.repeat_interleave(self.num_heads // self.n_groups, dim=2, output_size=self.num_heads) + C = C.repeat_interleave(self.num_heads // self.n_groups, dim=2, output_size=self.num_heads) + pad_size = (self.chunk_size - seq_len % self.chunk_size) % self.chunk_size + + D_residual = self.D[..., None] * pad_tensor_by_size(hidden_states, pad_size) + + # Discretize x and A + hidden_states = hidden_states * dt[..., None] + A = A.to(hidden_states.dtype) * dt + + # Rearrange into blocks/chunks + hidden_states, A, B, C = [reshape_into_chunks(t, pad_size, self.chunk_size) for t in (hidden_states, A, B, C)] + + # [bsz, -1, chunk_size, num_heads] -> [bsz, num_heads, -1, chunk_size] + A = A.permute(0, 3, 1, 2) + A_cumsum = torch.cumsum(A, dim=-1) + + # 1. Compute the output for each intra-chunk (diagonal blocks) + # This is the analog of a causal mask + L = torch.exp(segment_sum(A)) + + # Contraction of C and B to get G (attention-weights like) + G_intermediate = C[:, :, :, None, :, :] * B[:, :, None, :, :, :] # shape: (b, c, l, s, h, n) + G = G_intermediate.sum(dim=-1) # shape: (b, c, l, s, h) + + # Compute M, equivalent to applying attention mask to weights + M_intermediate = G[..., None] * L.permute(0, 2, 3, 4, 1)[..., None] + M = M_intermediate.sum(dim=-1) + + # Compute Y_diag (apply to values) + Y_diag = (M[..., None] * hidden_states[:, :, None]).sum(dim=3) + + # 2. Compute the state for each intra-chunk + # (right term of low-rank factorization of off-diagonal blocks; B terms) + decay_states = torch.exp(A_cumsum[:, :, :, -1:] - A_cumsum) + B_decay = B * decay_states.permute(0, -2, -1, 1)[..., None] + states = (B_decay[..., None, :] * hidden_states[..., None]).sum(dim=2) + + # 3. Compute the inter-chunk SSM recurrence; produces correct SSM states at chunk boundaries + # (middle term of factorization of off-diag blocks; A terms) + if use_precomputed_states: + previous_states = cache_params.ssm_states[self.layer_idx][:, None, ...].to(device=states.device) + else: + previous_states = torch.zeros_like(states[:, :1]) + states = torch.cat([previous_states, states], dim=1) + decay_chunk = torch.exp(segment_sum(nn.functional.pad(A_cumsum[:, :, :, -1], (1, 0)))) + decay_chunk = decay_chunk.transpose(1, 3) + new_states = (decay_chunk[..., None, None] * states[:, :, None, ...]).sum(dim=1) + states, ssm_state = new_states[:, :-1], new_states[:, -1] + + # 4. 
Compute state -> output conversion per chunk + # (left term of low-rank factorization of off-diagonal blocks; C terms) + state_decay_out = torch.exp(A_cumsum) + C_times_states = (C[..., None, :] * states[:, :, None, ...]) + state_decay_out_permuted = state_decay_out.permute(0, 2, 3, 1) + Y_off = (C_times_states.sum(-1) * state_decay_out_permuted[..., None]) + + # Add output of intra-chunk and inter-chunk terms (diagonal and off-diagonal blocks) + y = Y_diag + Y_off + # [bsz, -1, self.chunk_size, num_heads, head_dim] -> [bsz, (padded) seq_len, num_heads, head_dim] + y = y.reshape(batch_size, -1, self.num_heads, self.head_dim) + + y = y + D_residual + # Cutting off padded chunks + if pad_size > 0: + y = y[:, :seq_len, :, :] + y = y.reshape(batch_size, seq_len, -1) + + # Init cache + if ssm_state is not None and cache_params is not None: + cache_params.ssm_states[self.layer_idx].copy_(ssm_state) + + scan_output = self.norm(y, gate) + + # end ssd naive + + # 4. Final linear projection + contextualized_states = self.out_proj(scan_output.to(dtype)) # [batch, seq_len, hidden_size] + return contextualized_states + # fmt: on + + def forward( + self, + hidden_states, + cache_params: Optional[HybridMambaAttentionDynamicCache] = None, + cache_position: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + seq_idx: Optional[torch.IntTensor] = None, + **kwargs, + ): + if is_fast_path_available and "cuda" in self.in_proj.weight.device.type: + return self.cuda_kernels_forward(hidden_states, cache_params, cache_position, attention_mask, seq_idx) + if seq_idx is not None: + raise NotImplementedError( + "`seq_idx` support requires fast path support. Please install `mamba_ssm` and `causal_conv1d`" + ) + dtype = hidden_states.dtype + if attention_mask is not None and attention_mask.shape[1] > 1 and attention_mask.shape[0] > 1: + # tune out hidden states for pad tokens, see https://github.com/state-spaces/mamba/issues/66 + hidden_states = (hidden_states * attention_mask[:, :, None]).to(dtype) + + return self.torch_forward(hidden_states, cache_params, cache_position, attention_mask) + + +class BambaMLP(LlamaMLP): + pass + + +class BambaRMSNorm(LlamaRMSNorm): + pass + + +class BambaDecoderLayer(JambaAttentionDecoderLayer): + def __init__(self, config: BambaConfig, layer_idx: int, layer_type: str = "mamba"): + super().__init__(config, layer_idx) + + del self.self_attn + + num_experts = 1 + ffn_layer_class = BambaMLP if num_experts == 1 else None + self.feed_forward = ffn_layer_class(config) + + self.layer_type = layer_type + if layer_type == "mamba": + self.mamba = BambaMixer(config=config, layer_idx=layer_idx) + elif layer_type == "attention": + self.self_attn = BambaAttention(config, layer_idx) + else: + raise ValueError("Invalid layer_type") + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[HybridMambaAttentionDynamicCache] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + **kwargs: Unpack[BambaFlashAttentionKwargs], + ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the 
layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, sequence_length)` where padding elements are indicated by 0. + past_key_values (`HybridMambaAttentionDynamicCache`, *optional*): cached past key and value projection states + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. + position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*): + Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`, + with `head_dim` being the embedding dimension of each attention head. + kwargs (`dict`, *optional*): + Arbitrary kwargs. Can be used to provide `BambaFlashAttentionKwargs` for + padding-free training and/or improve torch.compile performance. + """ + + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # this is a hybrid decoder layer + if self.layer_type == "mamba": + hidden_states = self.mamba( + hidden_states=hidden_states, + cache_params=past_key_values, + cache_position=cache_position, + attention_mask=attention_mask, + **kwargs, + ) + self_attn_weights = None + elif self.layer_type == "attention": + hidden_states, self_attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + **kwargs, + ) + + # residual connection after attention + hidden_states = residual + hidden_states + + # feed-forward + residual = hidden_states + hidden_states = self.pre_ff_layernorm(hidden_states) + hidden_states = self.feed_forward(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + return outputs + + +@auto_docstring +class BambaPreTrainedModel(PreTrainedModel): + config: BambaConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["BambaDecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn = True + _supports_sdpa = True + # Note: only supports HybridMambaAttentionDynamicCache + _is_stateful = True + + def _init_weights(self, module): + super()._init_weights(module) + if isinstance(module, BambaMixer): + module.dt_bias.data.fill_(1.0) + module.A_log.data = torch.log(torch.arange(1, module.num_heads + 1)) + module.D.data.fill_(1.0) + + +@auto_docstring +class BambaModel(BambaPreTrainedModel): + def __init__(self, config: BambaConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + decoder_layers = [] + for i in range(config.num_hidden_layers): + decoder_layers.append(BambaDecoderLayer(config, layer_idx=i, layer_type=config.layers_block_type[i])) + self.layers = nn.ModuleList(decoder_layers) + + self._attn_implementation = 
config._attn_implementation + self.final_layernorm = BambaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.rotary_emb = BambaRotaryEmbedding(config=config) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[HybridMambaAttentionDynamicCache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[BambaFlashAttentionKwargs], + ) -> BaseModelOutputWithPast: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if self.gradient_checkpointing and self.training and use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`." + ) + use_cache = False + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + hidden_states = inputs_embeds + + if use_cache and past_key_values is None: + logger.warning_once( + "Bamba requires an initialized `HybridMambaAttentionDynamicCache` to return a cache. None was " + "provided, so no cache will be returned." + ) + + if cache_position is None: + cache_position = torch.arange(hidden_states.shape[1], device=hidden_states.device) + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + causal_mask = self._update_causal_mask( + attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions + ) + mamba_mask = self._update_mamba_mask(attention_mask, cache_position) + + # create position embeddings to be shared across the decoder layers + position_embeddings = self.rotary_emb(hidden_states, position_ids) + + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + + for decoder_layer in self.layers: + # Depending on the layer type we opt for 2D base attention mask (Mamba) or 4D causal mask (Attention) + layer_mask = mamba_mask if decoder_layer.layer_type == "mamba" else causal_mask + + if output_hidden_states: + all_hidden_states += (hidden_states,) + + layer_outputs = decoder_layer( + hidden_states, + attention_mask=layer_mask, + position_ids=position_ids, + past_key_values=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + **kwargs, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + if layer_outputs[1] is not None: + # append attentions only of attention layers. 
Mamba layers return `None` as the attention weights + all_self_attns += (layer_outputs[1],) + + hidden_states = self.final_layernorm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if past_key_values and not past_key_values.has_previous_state: + past_key_values.has_previous_state = True + + next_cache = None if not use_cache else past_key_values + + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + def _update_causal_mask( + self, + attention_mask: torch.Tensor, + input_tensor: torch.Tensor, + cache_position: torch.Tensor, + past_key_values: HybridMambaAttentionDynamicCache, + output_attentions: bool, + ): + if self.config._attn_implementation == "flash_attention_2": + if attention_mask is not None and 0.0 in attention_mask: + return attention_mask + return None + + # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in + # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail + # to infer the attention mask. + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + + # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward + if self.config._attn_implementation == "sdpa" and not output_attentions: + if AttentionMaskConverter._ignore_causal_mask_sdpa( + attention_mask, + inputs_embeds=input_tensor, + past_key_values_length=past_seen_tokens, + is_training=self.training, + ): + return None + + dtype = input_tensor.dtype + sequence_length = input_tensor.shape[1] + target_length = ( + attention_mask.shape[-1] + if isinstance(attention_mask, torch.Tensor) + else past_seen_tokens + sequence_length + 1 + ) + + # In case the provided `attention` mask is 2D, we generate a causal mask here (4D). + causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position( + attention_mask, + sequence_length=sequence_length, + target_length=target_length, + dtype=dtype, + cache_position=cache_position, + batch_size=input_tensor.shape[0], + ) + + if ( + self.config._attn_implementation == "sdpa" + and attention_mask is not None + and attention_mask.device.type in ["cuda", "xpu", "npu"] + and not output_attentions + ): + # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when + # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. + # Details: https://github.com/pytorch/pytorch/issues/110213 + min_dtype = torch.finfo(dtype).min + causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) + + return causal_mask + + @staticmethod + def _prepare_4d_causal_attention_mask_with_cache_position( + attention_mask: torch.Tensor, + sequence_length: int, + target_length: int, + dtype: torch.dtype, + cache_position: torch.Tensor, + batch_size: int, + **kwargs, + ): + """ + Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape + `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. + + Args: + attention_mask (`torch.Tensor`): + A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape + `(batch_size, 1, query_length, key_value_length)`. 
+ sequence_length (`int`): + The sequence length being processed. + target_length (`int`): + The target length: when generating with static cache, the mask should be as long as the static cache, + to account for the 0 padding, the part of the cache that is not filled yet. + dtype (`torch.dtype`): + The dtype to use for the 4D attention mask. + cache_position (`torch.Tensor`): + Indices depicting the position of the input sequence tokens in the sequence. + batch_size (`torch.Tensor`): + Batch size. + """ + if attention_mask is not None and attention_mask.dim() == 4: + # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. + causal_mask = attention_mask + else: + min_dtype = torch.finfo(dtype).min + causal_mask = torch.full( + (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device + ) + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + mask_length = attention_mask.shape[-1] + padding_attention_mask = (attention_mask[:, None, None, :] == attention_mask[:, None, :, None])[ + :, :, -sequence_length:, : + ].to(dtype) + padding_mask = causal_mask[:, :, :, :mask_length] + padding_attention_mask + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + + return causal_mask + + def _update_mamba_mask(self, attention_mask, cache_position): + """ + No need for zeroing states when + 1. Cached forward + 2. Attending to all inputs + """ + mamba_mask = attention_mask + if cache_position[0] > 0 or (attention_mask is not None and torch.all(attention_mask == 1)): + mamba_mask = None + return mamba_mask + + +class BambaForCausalLM(LlamaForCausalLM): + def __init__(self, config): + super().__init__(config) + self.z_loss_coefficient = config.z_loss_coefficient + + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[HybridMambaAttentionDynamicCache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **kwargs, + ) -> CausalLMOutputWithPast: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Example: + + ```python + >>> from transformers import AutoTokenizer, BambaForCausalLM + + >>> model = BambaForCausalLM.from_pretrained("...") + >>> tokenizer = AutoTokenizer.from_pretrained("...") + + >>> prompt = "Hey, are you conscious? Can you talk to me?" 
+ >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs: BaseModelOutputWithPast = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs.last_hidden_state + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) + + loss = None + if labels is not None: + loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs) + if self.z_loss_coefficient > 0: + # Type-match loss, but avoid upcasting large logits tensor until after it's been reduced on dim -1 + z_loss = logits.logsumexp(dim=-1).to(dtype=loss.dtype).pow(2).mean() + loss = loss + self.z_loss_coefficient * z_loss + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + attention_mask=None, + inputs_embeds=None, + cache_position=None, + position_ids=None, + use_cache=True, + **kwargs, + ): + # Overwritten -- has a unique cache type, `HybridMambaAttentionDynamicCache` + + empty_past_kv = past_key_values is None + + # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens + # Exception 1: when passing input_embeds, input_ids may be missing entries + # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here + # Exception 3: with synced GPUs cache_position may go out of bounds, but we only want dummy token in that case. 
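The z-loss term computed in `BambaForCausalLM.forward` above penalizes the squared log-normalizer of the logits, which keeps the softmax denominator from drifting during training. A minimal sketch of how that auxiliary term combines with the usual cross-entropy; the shapes, the coefficient value, and the omitted label shift are illustrative simplifications, not taken from the model:

```python
import torch
import torch.nn.functional as F

# Toy shapes: batch of 2 sequences, 4 positions, vocabulary of 10 tokens.
logits = torch.randn(2, 4, 10)
labels = torch.randint(0, 10, (2, 4))

# Plain token-level cross-entropy (the real loss_function also shifts labels by one).
ce_loss = F.cross_entropy(logits.reshape(-1, 10), labels.reshape(-1))

# z-loss: mean squared log-sum-exp of the logits, as in forward() above.
z_loss = logits.logsumexp(dim=-1).pow(2).mean()

z_loss_coefficient = 1e-4  # illustrative value; normally read from the config
total_loss = ce_loss + z_loss_coefficient * z_loss
print(float(total_loss))
```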
+ # (we can't check exception 3 while compiling) + if not empty_past_kv: + if ( + inputs_embeds is not None # Exception 1 + or cache_position[-1] >= input_ids.shape[1] # Exception 3 + ): + input_ids = input_ids[:, -cache_position.shape[0] :] + elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2) + input_ids = input_ids[:, cache_position] + else: + past_key_values = HybridMambaAttentionDynamicCache( + self.config, input_ids.shape[0], self.dtype, device=self.device + ) + + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if not empty_past_kv: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and empty_past_kv: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids.contiguous()} # `contiguous()` needed for compilation use cases + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": use_cache, + "attention_mask": attention_mask, + "logits_to_keep": self.config.num_logits_to_keep, + "cache_position": cache_position, + } + ) + + # Forward ALL kwargs that are uninitialized (e.g. `use_cache`). + for key, value in kwargs.items(): + if key not in model_inputs: + model_inputs[key] = value + + return model_inputs + + +__all__ = ["BambaModel", "BambaForCausalLM", "BambaPreTrainedModel"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bart/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bart/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8f4c713f4698d7c901ed06738efc8983e317805c --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bart/__init__.py @@ -0,0 +1,31 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
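When no `position_ids` are supplied, `prepare_inputs_for_generation` above rebuilds them from the 2D `attention_mask` with a cumulative sum, so left-padded batches still number their real tokens from 0. A small self-contained illustration of that recipe (the mask values are made up for the example):

```python
import torch

# Two sequences: the first is left-padded (0 = padding, 1 = real token).
attention_mask = torch.tensor([[0, 0, 1, 1, 1],
                               [1, 1, 1, 1, 1]])

# Same two lines as in prepare_inputs_for_generation above.
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)

print(position_ids)
# tensor([[1, 1, 0, 1, 2],
#         [0, 1, 2, 3, 4]])
```

The padding positions receive a dummy value of 1, which is harmless because they are masked out of the attention anyway.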
+from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_bart import * + from .modeling_bart import * + from .modeling_flax_bart import * + from .modeling_tf_bart import * + from .tokenization_bart import * + from .tokenization_bart_fast import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bart/configuration_bart.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bart/configuration_bart.py new file mode 100644 index 0000000000000000000000000000000000000000..90781feab3b51e6fc30198794c47254f16222c60 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bart/configuration_bart.py @@ -0,0 +1,406 @@ +# coding=utf-8 +# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""BART model configuration""" + +import warnings +from collections import OrderedDict +from collections.abc import Mapping +from typing import Any, Optional + +from ... import PreTrainedTokenizer +from ...configuration_utils import PretrainedConfig +from ...onnx import OnnxConfig, OnnxConfigWithPast, OnnxSeq2SeqConfigWithPast +from ...onnx.utils import compute_effective_axis_dimension +from ...utils import TensorType, is_torch_available, logging + + +logger = logging.get_logger(__name__) + + +class BartConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`BartModel`]. It is used to instantiate a BART + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the BART + [facebook/bart-large](https://huggingface.co/facebook/bart-large) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 50265): + Vocabulary size of the BART model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`BartModel`] or [`TFBartModel`]. + d_model (`int`, *optional*, defaults to 1024): + Dimensionality of the layers and the pooler layer. + encoder_layers (`int`, *optional*, defaults to 12): + Number of encoder layers. + decoder_layers (`int`, *optional*, defaults to 12): + Number of decoder layers. + encoder_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. + decoder_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer decoder. 
+ decoder_ffn_dim (`int`, *optional*, defaults to 4096): + Dimensionality of the "intermediate" (often named feed-forward) layer in decoder. + encoder_ffn_dim (`int`, *optional*, defaults to 4096): + Dimensionality of the "intermediate" (often named feed-forward) layer in decoder. + activation_function (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + activation_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for activations inside the fully connected layer. + classifier_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for classifier. + max_position_embeddings (`int`, *optional*, defaults to 1024): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + init_std (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + encoder_layerdrop (`float`, *optional*, defaults to 0.0): + The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://huggingface.co/papers/1909.11556) + for more details. + decoder_layerdrop (`float`, *optional*, defaults to 0.0): + The LayerDrop probability for the decoder. See the [LayerDrop paper](see https://huggingface.co/papers/1909.11556) + for more details. + scale_embedding (`bool`, *optional*, defaults to `False`): + Scale embeddings by diving by sqrt(d_model). + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). + num_labels (`int`, *optional*, defaults to 3): + The number of labels to use in [`BartForSequenceClassification`]. + forced_eos_token_id (`int`, *optional*, defaults to 2): + The id of the token to force as the last generated token when `max_length` is reached. Usually set to + `eos_token_id`. 
+ + Example: + + ```python + >>> from transformers import BartConfig, BartModel + + >>> # Initializing a BART facebook/bart-large style configuration + >>> configuration = BartConfig() + + >>> # Initializing a model (with random weights) from the facebook/bart-large style configuration + >>> model = BartModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "bart" + keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"} + + def __init__( + self, + vocab_size=50265, + max_position_embeddings=1024, + encoder_layers=12, + encoder_ffn_dim=4096, + encoder_attention_heads=16, + decoder_layers=12, + decoder_ffn_dim=4096, + decoder_attention_heads=16, + encoder_layerdrop=0.0, + decoder_layerdrop=0.0, + activation_function="gelu", + d_model=1024, + dropout=0.1, + attention_dropout=0.0, + activation_dropout=0.0, + init_std=0.02, + classifier_dropout=0.0, + scale_embedding=False, + use_cache=True, + num_labels=3, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + is_encoder_decoder=True, + decoder_start_token_id=2, + forced_eos_token_id=2, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.d_model = d_model + self.encoder_ffn_dim = encoder_ffn_dim + self.encoder_layers = encoder_layers + self.encoder_attention_heads = encoder_attention_heads + self.decoder_ffn_dim = decoder_ffn_dim + self.decoder_layers = decoder_layers + self.decoder_attention_heads = decoder_attention_heads + self.dropout = dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.activation_function = activation_function + self.init_std = init_std + self.encoder_layerdrop = encoder_layerdrop + self.decoder_layerdrop = decoder_layerdrop + self.classifier_dropout = classifier_dropout + self.use_cache = use_cache + self.num_hidden_layers = encoder_layers + self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True + + super().__init__( + num_labels=num_labels, + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + is_encoder_decoder=is_encoder_decoder, + decoder_start_token_id=decoder_start_token_id, + forced_eos_token_id=forced_eos_token_id, + **kwargs, + ) + + # ensure backward compatibility for BART CNN models + if self.forced_bos_token_id is None and kwargs.get("force_bos_token_to_be_generated", False): + self.forced_bos_token_id = self.bos_token_id + warnings.warn( + f"Please make sure the config includes `forced_bos_token_id={self.bos_token_id}` in future versions. " + "The config can simply be saved and uploaded again to be fixed." 
+ ) + + +class BartOnnxConfig(OnnxSeq2SeqConfigWithPast): + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + if self.task in ["default", "seq2seq-lm"]: + common_inputs = OrderedDict( + [ + ("input_ids", {0: "batch", 1: "encoder_sequence"}), + ("attention_mask", {0: "batch", 1: "encoder_sequence"}), + ] + ) + + if self.use_past: + common_inputs["decoder_input_ids"] = {0: "batch"} + common_inputs["decoder_attention_mask"] = {0: "batch", 1: "past_decoder_sequence + sequence"} + else: + common_inputs["decoder_input_ids"] = {0: "batch", 1: "decoder_sequence"} + common_inputs["decoder_attention_mask"] = {0: "batch", 1: "decoder_sequence"} + + if self.use_past: + self.fill_with_past_key_values_(common_inputs, direction="inputs") + elif self.task == "causal-lm": + # TODO: figure this case out. + common_inputs = OrderedDict( + [ + ("input_ids", {0: "batch", 1: "encoder_sequence"}), + ("attention_mask", {0: "batch", 1: "encoder_sequence"}), + ] + ) + if self.use_past: + num_encoder_layers, _ = self.num_layers + for i in range(num_encoder_layers): + common_inputs[f"past_key_values.{i}.key"] = {0: "batch", 2: "past_sequence + sequence"} + common_inputs[f"past_key_values.{i}.value"] = {0: "batch", 2: "past_sequence + sequence"} + else: + common_inputs = OrderedDict( + [ + ("input_ids", {0: "batch", 1: "encoder_sequence"}), + ("attention_mask", {0: "batch", 1: "encoder_sequence"}), + ("decoder_input_ids", {0: "batch", 1: "decoder_sequence"}), + ("decoder_attention_mask", {0: "batch", 1: "decoder_sequence"}), + ] + ) + + return common_inputs + + @property + def outputs(self) -> Mapping[str, Mapping[int, str]]: + if self.task in ["default", "seq2seq-lm"]: + common_outputs = super().outputs + else: + common_outputs = super(OnnxConfigWithPast, self).outputs + if self.use_past: + num_encoder_layers, _ = self.num_layers + for i in range(num_encoder_layers): + common_outputs[f"present.{i}.key"] = {0: "batch", 2: "past_sequence + sequence"} + common_outputs[f"present.{i}.value"] = {0: "batch", 2: "past_sequence + sequence"} + return common_outputs + + def _generate_dummy_inputs_for_default_and_seq2seq_lm( + self, + tokenizer: PreTrainedTokenizer, + batch_size: int = -1, + seq_length: int = -1, + is_pair: bool = False, + framework: Optional[TensorType] = None, + ) -> Mapping[str, Any]: + encoder_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering( + tokenizer, batch_size, seq_length, is_pair, framework + ) + + # Generate decoder inputs + decoder_seq_length = seq_length if not self.use_past else 1 + decoder_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering( + tokenizer, batch_size, decoder_seq_length, is_pair, framework + ) + decoder_inputs = {f"decoder_{name}": tensor for name, tensor in decoder_inputs.items()} + common_inputs = dict(**encoder_inputs, **decoder_inputs) + + if self.use_past: + if not is_torch_available(): + raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.") + else: + import torch + batch, encoder_seq_length = common_inputs["input_ids"].shape + decoder_seq_length = common_inputs["decoder_input_ids"].shape[1] + num_encoder_attention_heads, num_decoder_attention_heads = self.num_attention_heads + encoder_shape = ( + batch, + num_encoder_attention_heads, + encoder_seq_length, + self._config.hidden_size // num_encoder_attention_heads, + ) + decoder_past_length = decoder_seq_length + 3 + decoder_shape = ( + batch, + num_decoder_attention_heads, + decoder_past_length, + 
self._config.hidden_size // num_decoder_attention_heads, + ) + + common_inputs["decoder_attention_mask"] = torch.cat( + [common_inputs["decoder_attention_mask"], torch.ones(batch, decoder_past_length)], dim=1 + ) + + common_inputs["past_key_values"] = [] + # If the number of encoder and decoder layers are present in the model configuration, both are considered + num_encoder_layers, num_decoder_layers = self.num_layers + min_num_layers = min(num_encoder_layers, num_decoder_layers) + max_num_layers = max(num_encoder_layers, num_decoder_layers) - min_num_layers + remaining_side_name = "encoder" if num_encoder_layers > num_decoder_layers else "decoder" + + for _ in range(min_num_layers): + common_inputs["past_key_values"].append( + ( + torch.zeros(decoder_shape), + torch.zeros(decoder_shape), + torch.zeros(encoder_shape), + torch.zeros(encoder_shape), + ) + ) + # TODO: test this. + shape = encoder_shape if remaining_side_name == "encoder" else decoder_shape + for _ in range(min_num_layers, max_num_layers): + common_inputs["past_key_values"].append((torch.zeros(shape), torch.zeros(shape))) + return common_inputs + + def _generate_dummy_inputs_for_causal_lm( + self, + tokenizer: PreTrainedTokenizer, + batch_size: int = -1, + seq_length: int = -1, + is_pair: bool = False, + framework: Optional[TensorType] = None, + ) -> Mapping[str, Any]: + common_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering( + tokenizer, batch_size, seq_length, is_pair, framework + ) + + if self.use_past: + if not is_torch_available(): + raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.") + else: + import torch + batch, seqlen = common_inputs["input_ids"].shape + # Not using the same length for past_key_values + past_key_values_length = seqlen + 2 + num_encoder_layers, _ = self.num_layers + num_encoder_attention_heads, _ = self.num_attention_heads + past_shape = ( + batch, + num_encoder_attention_heads, + past_key_values_length, + self._config.hidden_size // num_encoder_attention_heads, + ) + + mask_dtype = common_inputs["attention_mask"].dtype + common_inputs["attention_mask"] = torch.cat( + [common_inputs["attention_mask"], torch.ones(batch, past_key_values_length, dtype=mask_dtype)], dim=1 + ) + common_inputs["past_key_values"] = [ + (torch.zeros(past_shape), torch.zeros(past_shape)) for _ in range(num_encoder_layers) + ] + return common_inputs + + def _generate_dummy_inputs_for_sequence_classification_and_question_answering( + self, + tokenizer: PreTrainedTokenizer, + batch_size: int = -1, + seq_length: int = -1, + is_pair: bool = False, + framework: Optional[TensorType] = None, + ) -> Mapping[str, Any]: + # Copied from OnnxConfig.generate_dummy_inputs + # Did not use super(OnnxConfigWithPast, self).generate_dummy_inputs for code clarity. 
+ # If dynamic axis (-1) we forward with a fixed dimension of 2 samples to avoid optimizations made by ONNX + batch_size = compute_effective_axis_dimension( + batch_size, fixed_dimension=OnnxConfig.default_fixed_batch, num_token_to_add=0 + ) + + # If dynamic axis (-1) we forward with a fixed dimension of 8 tokens to avoid optimizations made by ONNX + token_to_add = tokenizer.num_special_tokens_to_add(is_pair) + seq_length = compute_effective_axis_dimension( + seq_length, fixed_dimension=OnnxConfig.default_fixed_sequence, num_token_to_add=token_to_add + ) + + # Generate dummy inputs according to compute batch and sequence + dummy_input = [" ".join([tokenizer.unk_token]) * seq_length] * batch_size + common_inputs = dict(tokenizer(dummy_input, return_tensors=framework)) + return common_inputs + + def generate_dummy_inputs( + self, + tokenizer: PreTrainedTokenizer, + batch_size: int = -1, + seq_length: int = -1, + is_pair: bool = False, + framework: Optional[TensorType] = None, + ) -> Mapping[str, Any]: + if self.task in ["default", "seq2seq-lm"]: + common_inputs = self._generate_dummy_inputs_for_default_and_seq2seq_lm( + tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework + ) + + elif self.task == "causal-lm": + common_inputs = self._generate_dummy_inputs_for_causal_lm( + tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework + ) + else: + common_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering( + tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework + ) + + return common_inputs + + def _flatten_past_key_values_(self, flattened_output, name, idx, t): + if self.task in ["default", "seq2seq-lm"]: + flattened_output = super()._flatten_past_key_values_(flattened_output, name, idx, t) + else: + flattened_output = super(OnnxSeq2SeqConfigWithPast, self)._flatten_past_key_values_( + flattened_output, name, idx, t + ) + + +__all__ = ["BartConfig", "BartOnnxConfig"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bart/modeling_bart.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bart/modeling_bart.py new file mode 100644 index 0000000000000000000000000000000000000000..0a1f2451cff12042ff4de266523a9a27ece5fed7 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bart/modeling_bart.py @@ -0,0 +1,1949 @@ +# coding=utf-8 +# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
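Looking back at the `BartOnnxConfig` dummy-input helpers above: the fabricated `past_key_values` are per-layer pairs (or quadruples, in the seq2seq case) of zero tensors shaped `(batch, num_heads, past_seq_len, hidden_size // num_heads)`. A rough sketch with made-up sizes, just to show the layout the exporter expects (none of the numbers come from a real config):

```python
import torch

# Illustrative sizes; a real export derives these from BartConfig.
batch, num_heads, d_model, num_layers = 2, 16, 1024, 2
head_dim = d_model // num_heads
past_seq_len = 8  # sequence length plus a few extra cached positions

# One (key, value) pair per layer, mirroring _generate_dummy_inputs_for_causal_lm above.
past_key_values = [
    (torch.zeros(batch, num_heads, past_seq_len, head_dim),
     torch.zeros(batch, num_heads, past_seq_len, head_dim))
    for _ in range(num_layers)
]
print(past_key_values[0][0].shape)  # torch.Size([2, 16, 8, 64])
```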
+"""PyTorch BART model.""" + +import math +import warnings +from typing import Callable, Optional, Union + +import torch +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache +from ...generation import GenerationMixin +from ...modeling_attn_mask_utils import ( + AttentionMaskConverter, + _prepare_4d_attention_mask, + _prepare_4d_attention_mask_for_sdpa, +) +from ...modeling_flash_attention_utils import FlashAttentionKwargs +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + Seq2SeqLMOutput, + Seq2SeqModelOutput, + Seq2SeqQuestionAnsweringModelOutput, + Seq2SeqSequenceClassifierOutput, +) +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...processing_utils import Unpack +from ...utils import ( + auto_docstring, + is_torch_flex_attn_available, + is_torchdynamo_compiling, + logging, +) +from ...utils.deprecation import deprecate_kwarg +from .configuration_bart import BartConfig + + +if is_torch_flex_attn_available(): + from ...integrations.flex_attention import BlockMask, make_flex_block_causal_mask + + +logger = logging.get_logger(__name__) + + +def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int): + """ + Shift input ids one token to the right. + """ + shifted_input_ids = input_ids.new_zeros(input_ids.shape) + shifted_input_ids[:, 1:] = input_ids[:, :-1].clone() + shifted_input_ids[:, 0] = decoder_start_token_id + + if pad_token_id is None: + raise ValueError("self.model.config.pad_token_id has to be defined.") + # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) + + return shifted_input_ids + + +class BartLearnedPositionalEmbedding(nn.Embedding): + """ + This module learns positional embeddings up to a fixed maximum size. + """ + + def __init__(self, num_embeddings: int, embedding_dim: int): + # Bart is set up so that if padding_idx is specified then offset the embedding ids by 2 + # and adjust num_embeddings appropriately. Other models don't have this hack + self.offset = 2 + super().__init__(num_embeddings + self.offset, embedding_dim) + + def forward( + self, input_ids: torch.Tensor, past_key_values_length: int = 0, position_ids: Optional[torch.Tensor] = None + ): + """`input_ids' shape is expected to be [bsz x seqlen].""" + + if position_ids is None: + bsz, seq_len = input_ids.shape[:2] + position_ids = torch.arange( + past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device + ).expand(bsz, -1) + else: + position_ids = position_ids.unsqueeze(0) + + return super().forward(position_ids + self.offset) + + +class BartScaledWordEmbedding(nn.Embedding): + """ + This module overrides nn.Embeddings' forward by multiplying with embeddings scale. 
+ """ + + def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, embed_scale: Optional[float] = 1.0): + super().__init__(num_embeddings, embedding_dim, padding_idx) + self.embed_scale = embed_scale + + def forward(self, input_ids: torch.Tensor): + return super().forward(input_ids) * self.embed_scale + + +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: Optional[float] = None, + dropout: float = 0.0, + head_mask: Optional[torch.Tensor] = None, + **kwargs, +): + if scaling is None: + scaling = query.size(-1) ** -0.5 + + attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling + if attention_mask is not None: + attn_weights = attn_weights + attention_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if head_mask is not None: + attn_weights = attn_weights * head_mask.view(1, -1, 1, 1) + + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + attn_output = torch.matmul(attn_weights, value) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +class BartAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + is_decoder: bool = False, + bias: bool = True, + is_causal: bool = False, + config: Optional[BartConfig] = None, + layer_idx: Optional[int] = None, + ): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + self.config = config + + if (self.head_dim * num_heads) != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" + f" and `num_heads`: {num_heads})." + ) + self.scaling = self.head_dim**-0.5 + self.is_decoder = is_decoder + self.is_causal = is_causal + self.layer_idx = layer_idx + if layer_idx is None and self.is_decoder: + logger.warning_once( + f"Instantiating a decoder {self.__class__.__name__} without passing `layer_idx` is not recommended and " + "will lead to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." 
+ ) + + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + key_value_states: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + cache_position: Optional[torch.Tensor] = None, + # TODO: we need a refactor so that the different attention modules can get their specific kwargs + # ATM, we have mixed things encoder, decoder, and encoder-decoder attn + **kwargs: Unpack[FlashAttentionKwargs], + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + + # determine input shapes + bsz, tgt_len = hidden_states.shape[:-1] + src_len = key_value_states.shape[1] if is_cross_attention else tgt_len + + q_input_shape = (bsz, tgt_len, -1, self.head_dim) + kv_input_shape = (bsz, src_len, -1, self.head_dim) + + # get query proj + query_states = self.q_proj(hidden_states).view(*q_input_shape).transpose(1, 2) + + is_updated = False + if past_key_values is not None: + if isinstance(past_key_values, EncoderDecoderCache): + is_updated = past_key_values.is_updated.get(self.layer_idx) + if is_cross_attention: + # after the first generated id, we can subsequently re-use all key/value_states from cache + curr_past_key_value = past_key_values.cross_attention_cache + else: + curr_past_key_value = past_key_values.self_attention_cache + else: + curr_past_key_value = past_key_values + + current_states = key_value_states if is_cross_attention else hidden_states + if is_cross_attention and past_key_values is not None and is_updated: + # reuse k,v, cross_attentions + key_states = curr_past_key_value.layers[self.layer_idx].keys + value_states = curr_past_key_value.layers[self.layer_idx].values + else: + key_states = self.k_proj(current_states) + value_states = self.v_proj(current_states) + key_states = key_states.view(*kv_input_shape).transpose(1, 2) + value_states = value_states.view(*kv_input_shape).transpose(1, 2) + + if past_key_values is not None: + # save all key/value_states to cache to be re-used for fast auto-regressive generation + cache_position = cache_position if not is_cross_attention else None + key_states, value_states = curr_past_key_value.update( + key_states, value_states, self.layer_idx, {"cache_position": cache_position} + ) + # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls + if is_cross_attention and isinstance(past_key_values, EncoderDecoderCache): + past_key_values.is_updated[self.layer_idx] = True + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=0.0 if not self.training else self.dropout, + scaling=self.scaling, + output_attentions=output_attentions, + head_mask=layer_head_mask, + 
**kwargs, + ) + + attn_output = attn_output.reshape(bsz, tgt_len, -1).contiguous() + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights + + +class BartEncoderLayer(GradientCheckpointingLayer): + def __init__(self, config: BartConfig, layer_idx: Optional[int] = None): + super().__init__() + self.embed_dim = config.d_model + + self.self_attn = BartAttention( + embed_dim=self.embed_dim, + num_heads=config.encoder_attention_heads, + dropout=config.attention_dropout, + config=config, + layer_idx=layer_idx, + ) + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim) + self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.FloatTensor, + attention_mask: torch.FloatTensor, + layer_head_mask: torch.FloatTensor, + output_attentions: Optional[bool] = False, + ) -> tuple[torch.FloatTensor, Optional[torch.FloatTensor]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size + `(encoder_attention_heads,)`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + output_attentions=output_attentions, + ) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + if hidden_states.dtype == torch.float16 and ( + torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any() + ): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class BartDecoderLayer(GradientCheckpointingLayer): + def __init__(self, config: BartConfig, layer_idx: Optional[int] = None): + super().__init__() + self.embed_dim = config.d_model + + self.self_attn = BartAttention( + embed_dim=self.embed_dim, + num_heads=config.decoder_attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + is_causal=True, + config=config, + layer_idx=layer_idx, + ) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + + self.self_attn_layer_norm = 
nn.LayerNorm(self.embed_dim) + self.encoder_attn = BartAttention( + self.embed_dim, + config.decoder_attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + config=config, + layer_idx=layer_idx, + ) + self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) + self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + cross_attn_layer_head_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = True, + cache_position: Optional[torch.Tensor] = None, + ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + encoder_hidden_states (`torch.FloatTensor`): + cross attention input to the layer of shape `(batch, seq_len, embed_dim)` + encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size + `(encoder_attention_heads,)`. + cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of + size `(decoder_attention_heads,)`. + past_key_values (`Cache`): cached past key and value projection states + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. It is used to update the + cache in the correct position and to infer the complete sequence length. 
+ """ + residual = hidden_states + + # Self Attention + hidden_states, self_attn_weights = self.self_attn( + hidden_states=hidden_states, + past_key_values=past_key_values, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + output_attentions=output_attentions, + cache_position=cache_position, + ) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Cross-Attention Block + cross_attn_weights = None + if encoder_hidden_states is not None: + residual = hidden_states + + hidden_states, cross_attn_weights = self.encoder_attn( + hidden_states=hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + layer_head_mask=cross_attn_layer_head_mask, + past_key_values=past_key_values, + output_attentions=output_attentions, + cache_position=cache_position, + ) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + # Fully Connected + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights, cross_attn_weights) + + return outputs + + +class BartClassificationHead(nn.Module): + """Head for sentence-level classification tasks.""" + + def __init__( + self, + input_dim: int, + inner_dim: int, + num_classes: int, + pooler_dropout: float, + ): + super().__init__() + self.dense = nn.Linear(input_dim, inner_dim) + self.dropout = nn.Dropout(p=pooler_dropout) + self.out_proj = nn.Linear(inner_dim, num_classes) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dropout(hidden_states) + hidden_states = self.dense(hidden_states) + hidden_states = torch.tanh(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.out_proj(hidden_states) + return hidden_states + + +@auto_docstring +class BartPreTrainedModel(PreTrainedModel): + config: BartConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_unexpected = ["encoder.version", "decoder.version"] + _no_split_modules = [r"BartEncoderLayer", r"BartDecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn = True + _supports_sdpa = True + _supports_flex_attn = True + + _can_compile_fullgraph = True + + def _init_weights(self, module): + std = self.config.init_std + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.weight.data.fill_(1.0) + module.bias.data.zero_() + + @property + def dummy_inputs(self): + pad_token = self.config.pad_token_id + input_ids = torch.tensor([[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]], 
device=self.device) + dummy_inputs = { + "attention_mask": input_ids.ne(pad_token), + "input_ids": input_ids, + } + return dummy_inputs + + def _update_full_mask( + self, + attention_mask: Union[torch.Tensor, None], + inputs_embeds: torch.Tensor, + ): + if attention_mask is not None: + if self.config._attn_implementation == "flash_attention_2": + attention_mask = attention_mask if 0 in attention_mask else None + elif self.config._attn_implementation == "sdpa": + # output_attentions=True & head_mask can not be supported when using SDPA, fall back to + # the manual implementation that requires a 4D causal mask in all cases. + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _prepare_4d_attention_mask_for_sdpa(attention_mask, inputs_embeds.dtype) + elif self.config._attn_implementation == "flex_attention": + if isinstance(attention_mask, torch.Tensor): + attention_mask = make_flex_block_causal_mask(attention_mask, is_causal=False) + else: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype) + + return attention_mask + + def _update_causal_mask( + self, + attention_mask: Optional[Union[torch.Tensor, "BlockMask"]], + input_tensor: torch.Tensor, + cache_position: torch.Tensor, + past_key_values: Cache, + ): + if self.config._attn_implementation == "flex_attention": + if isinstance(attention_mask, torch.Tensor): + attention_mask = make_flex_block_causal_mask(attention_mask) + # Other attention flavors support in-built causal (when `mask is None`) + # while we need to create our specific block mask regardless + elif attention_mask is None: + attention_mask = make_flex_block_causal_mask( + torch.ones( + size=(input_tensor.shape[0], input_tensor.shape[1]), + device=attention_mask.device, + ) + ) + return attention_mask + + if self.config._attn_implementation == "flash_attention_2": + if attention_mask is not None and (attention_mask == 0.0).any(): + return attention_mask + return None + + # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in + # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail + # to infer the attention mask. + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + using_compilable_cache = past_key_values.is_compileable if past_key_values is not None else False + + # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward + if self.config._attn_implementation == "sdpa" and not using_compilable_cache: + if AttentionMaskConverter._ignore_causal_mask_sdpa( + attention_mask, + inputs_embeds=input_tensor, + past_key_values_length=past_seen_tokens, + is_training=self.training, + ): + return None + + dtype = input_tensor.dtype + sequence_length = input_tensor.shape[1] + if using_compilable_cache: + target_length = past_key_values.get_max_cache_shape() + else: + target_length = ( + attention_mask.shape[-1] + if isinstance(attention_mask, torch.Tensor) + else past_seen_tokens + sequence_length + 1 + ) + + # In case the provided `attention` mask is 2D, we generate a causal mask here (4D). 
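Before the helper invoked on the next line, a toy rendition of what it builds: a `(sequence_length, target_length)` grid filled with the dtype minimum, restricted to the strict upper triangle, then additionally masked wherever a key index lies beyond the query's `cache_position`. All sizes below are illustrative:

```python
import torch

# Toy sizes: 3 new tokens attending over a key/value length of 5
# (e.g. 2 cached tokens plus the 3 new ones).
sequence_length, target_length = 3, 5
cache_position = torch.arange(2, 2 + sequence_length)  # positions 2, 3, 4
dtype = torch.float32
min_dtype = torch.finfo(dtype).min

# Same recipe as _prepare_4d_causal_attention_mask_with_cache_position below.
causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype)
causal_mask = torch.triu(causal_mask, diagonal=1)
causal_mask = causal_mask * (torch.arange(target_length) > cache_position.reshape(-1, 1))

print((causal_mask == 0).int())
# tensor([[1, 1, 1, 0, 0],
#         [1, 1, 1, 1, 0],
#         [1, 1, 1, 1, 1]])   # 1 = position the query may attend to
```

Zero entries are the positions a query may attend to; the large negative entries are added to the attention scores before the softmax, effectively removing those keys.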
+ causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position( + attention_mask, + sequence_length=sequence_length, + target_length=target_length, + dtype=dtype, + cache_position=cache_position, + batch_size=input_tensor.shape[0], + ) + + if ( + self.config._attn_implementation == "sdpa" + and attention_mask is not None + and attention_mask.device.type in ["cuda", "xpu", "npu"] + ): + # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when + # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. + # Details: https://github.com/pytorch/pytorch/issues/110213 + min_dtype = torch.finfo(dtype).min + causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) + + return causal_mask + + @staticmethod + # Copied from transformers.models.gptj.modeling_gptj.GPTJModel._prepare_4d_causal_attention_mask_with_cache_position + def _prepare_4d_causal_attention_mask_with_cache_position( + attention_mask: torch.Tensor, + sequence_length: int, + target_length: int, + dtype: torch.dtype, + cache_position: torch.Tensor, + batch_size: int, + **kwargs, + ): + """ + Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape + `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. + + Args: + attention_mask (`torch.Tensor`): + A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape + `(batch_size, 1, query_length, key_value_length)`. + sequence_length (`int`): + The sequence length being processed. + target_length (`int`): + The target length: when generating with static cache, the mask should be as long as the static cache, + to account for the 0 padding, the part of the cache that is not filled yet. + dtype (`torch.dtype`): + The dtype to use for the 4D attention mask. + cache_position (`torch.Tensor`): + Indices depicting the position of the input sequence tokens in the sequence. + batch_size (`torch.Tensor`): + Batch size. + """ + if attention_mask is not None and attention_mask.dim() == 4: + # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. 
+ causal_mask = attention_mask + else: + min_dtype = torch.finfo(dtype).min + causal_mask = torch.full( + (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device + ) + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to( + causal_mask.device + ) + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + + return causal_mask + + def _update_cross_attn_mask( + self, + encoder_hidden_states: Union[torch.Tensor, None], + encoder_attention_mask: Union[torch.Tensor, None], + input_shape: torch.Size, + inputs_embeds: torch.Tensor, + ): + # expand encoder attention mask + if encoder_hidden_states is not None and encoder_attention_mask is not None: + if self.config._attn_implementation == "flash_attention_2": + encoder_attention_mask = encoder_attention_mask if 0 in encoder_attention_mask else None + elif self.config._attn_implementation == "sdpa": + # output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on + # the manual implementation that requires a 4D causal mask in all cases. + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + encoder_attention_mask = _prepare_4d_attention_mask_for_sdpa( + encoder_attention_mask, + inputs_embeds.dtype, + tgt_len=input_shape[-1], + ) + elif self.config._attn_implementation == "flex_attention": + if isinstance(encoder_attention_mask, torch.Tensor): + encoder_attention_mask = make_flex_block_causal_mask( + encoder_attention_mask, + query_length=input_shape[-1], + is_causal=False, + ) + else: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + encoder_attention_mask = _prepare_4d_attention_mask( + encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1] + ) + + return encoder_attention_mask + + +class PretrainedBartModel(BartPreTrainedModel): + def __init_subclass__(self): + warnings.warn( + "The class `PretrainedBartModel` has been depreciated, please use `BartPreTrainedModel` instead.", + FutureWarning, + ) + + +class BartPretrainedModel(BartPreTrainedModel): + def __init_subclass__(self): + warnings.warn( + "The class `PretrainedBartModel` has been depreciated, please use `BartPreTrainedModel` instead.", + FutureWarning, + ) + + +class BartEncoder(BartPreTrainedModel): + """ + Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a + [`BartEncoderLayer`]. 
+ + Args: + config: BartConfig + embed_tokens (nn.Embedding): output embedding + """ + + def __init__(self, config: BartConfig, embed_tokens: Optional[nn.Embedding] = None): + super().__init__(config) + + self.dropout = config.dropout + self.layerdrop = config.encoder_layerdrop + + embed_dim = config.d_model + self.padding_idx = config.pad_token_id + self.max_source_positions = config.max_position_embeddings + embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0 + + self.embed_tokens = BartScaledWordEmbedding( + config.vocab_size, embed_dim, self.padding_idx, embed_scale=embed_scale + ) + + if embed_tokens is not None: + self.embed_tokens.weight = embed_tokens.weight + + self.embed_positions = BartLearnedPositionalEmbedding( + config.max_position_embeddings, + embed_dim, + ) + self.layers = nn.ModuleList([BartEncoderLayer(config, layer_idx=i) for i in range(config.encoder_layers)]) + self.layernorm_embedding = nn.LayerNorm(embed_dim) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, BaseModelOutput]: + r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input = input_ids + input_ids = input_ids.view(-1, input_ids.shape[-1]) + elif inputs_embeds is not None: + input = inputs_embeds[:, :, -1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + embed_pos = self.embed_positions(input) + embed_pos = embed_pos.to(inputs_embeds.device) + + hidden_states = inputs_embeds + embed_pos + hidden_states = self.layernorm_embedding(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + attention_mask = self._update_full_mask( + attention_mask, + inputs_embeds, + ) + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + # check if head_mask has a correct number of layers specified if desired + if head_mask is not None: + if head_mask.size()[0] != (len(self.layers)): + raise ValueError( + f"The head_mask should be specified for {len(self.layers)} layers, but it is for" + f" {head_mask.size()[0]}." + ) + + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description) + to_drop = False + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: # skip the layer + to_drop = True + + if to_drop: + layer_outputs = (None, None) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + layer_head_mask=(head_mask[idx] if head_mask is not None else None), + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +class BartDecoder(BartPreTrainedModel): + """ + Transformer decoder consisting of *config.decoder_layers* layers. 
Each layer is a [`BartDecoderLayer`] + + Args: + config: BartConfig + embed_tokens (nn.Embedding): output embedding + """ + + def __init__(self, config: BartConfig, embed_tokens: Optional[nn.Embedding] = None): + super().__init__(config) + self.dropout = config.dropout + self.layerdrop = config.decoder_layerdrop + self.padding_idx = config.pad_token_id + self.max_target_positions = config.max_position_embeddings + embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 + + self.embed_tokens = BartScaledWordEmbedding( + config.vocab_size, config.d_model, self.padding_idx, embed_scale=embed_scale + ) + + if embed_tokens is not None: + self.embed_tokens.weight = embed_tokens.weight + + self.embed_positions = BartLearnedPositionalEmbedding( + config.max_position_embeddings, + config.d_model, + ) + self.layers = nn.ModuleList([BartDecoderLayer(config, layer_idx=i) for i in range(config.decoder_layers)]) + + self.layernorm_embedding = nn.LayerNorm(config.d_model) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]: + r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + of the decoder. + encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*): + Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values + selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. 
+ + cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing + cross-attention on hidden heads. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache). + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the + cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those + that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of + all `decoder_input_ids` of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. It is used to update the + cache in the correct position and to infer the complete sequence length. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + # retrieve input_ids and inputs_embeds + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + input = input_ids + input_shape = input.shape + input_ids = input_ids.view(-1, input_shape[-1]) + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + input = inputs_embeds[:, :, -1] + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input) + + # initialize `past_key_values` + if use_cache and past_key_values is None: + past_key_values = ( + EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config)) + if encoder_hidden_states is not None + else DynamicCache(config=self.config) + ) + if use_cache and isinstance(past_key_values, tuple): + logger.warning_once( + "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. " + "You should pass an instance of `EncoderDecoderCache` instead, e.g. " + "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`." + ) + past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values) + + batch_size, seq_length = inputs_embeds.size()[:-1] + past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0 + if cache_position is None: + cache_position = torch.arange( + past_key_values_length, past_key_values_length + seq_length, device=inputs_embeds.device + ) + + if attention_mask is None and not is_torchdynamo_compiling(): + # required mask seq length can be calculated via length of past cache + mask_seq_length = past_key_values_length + seq_length + attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device) + + self_attn_cache = ( + past_key_values.self_attention_cache + if isinstance(past_key_values, EncoderDecoderCache) + else past_key_values + ) + + attention_mask = self._update_causal_mask( + attention_mask, + inputs_embeds, + cache_position, + self_attn_cache, + ) + encoder_attention_mask = self._update_cross_attn_mask( + encoder_hidden_states, + encoder_attention_mask, + input_shape, + inputs_embeds, + ) + + # embed positions + positions = self.embed_positions(input, past_key_values_length, position_ids=cache_position) + positions = positions.to(inputs_embeds.device) + + hidden_states = inputs_embeds + positions + hidden_states = self.layernorm_embedding(hidden_states) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + + # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired + for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]): + if attn_mask is not None: + if attn_mask.size()[0] != (len(self.layers)): + raise ValueError( + f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for" + f" {head_mask.size()[0]}." 
+ ) + + for idx, decoder_layer in enumerate(self.layers): + # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description) + if output_hidden_states: + all_hidden_states += (hidden_states,) + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: + continue + + layer_outputs = decoder_layer( + hidden_states, + attention_mask, + encoder_hidden_states, # as a positional argument for gradient checkpointing + encoder_attention_mask=encoder_attention_mask, + layer_head_mask=(head_mask[idx] if head_mask is not None else None), + cross_attn_layer_head_mask=(cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None), + past_key_values=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + ) + hidden_states = layer_outputs[0] + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if encoder_hidden_states is not None: + all_cross_attentions += (layer_outputs[2],) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [hidden_states, past_key_values, all_hidden_states, all_self_attns, all_cross_attentions] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=past_key_values, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attentions, + ) + + +@auto_docstring +class BartModel(BartPreTrainedModel): + _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] + + def __init__(self, config: BartConfig): + super().__init__(config) + + padding_idx, vocab_size = config.pad_token_id, config.vocab_size + embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 + self.shared = BartScaledWordEmbedding(vocab_size, config.d_model, padding_idx, embed_scale=embed_scale) + + self.encoder = BartEncoder(config, self.shared) + self.decoder = BartDecoder(config, self.shared) + + # Initialize weights and apply final processing + self.post_init() + + def _tie_weights(self): + if self.config.tie_word_embeddings: + # Some model checkpoints like "facebook/bart-large-cnn"'s embedding weight is in decoder.embed_tokens, need check here, see issue #36247 + if self.shared.weight.device == torch.device( + "meta" + ) and self.decoder.embed_tokens.weight.device != torch.device("meta"): + self._tie_or_clone_weights(self.encoder.embed_tokens, self.decoder.embed_tokens) + self._tie_or_clone_weights(self.shared, self.decoder.embed_tokens) + else: + self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared) + self._tie_or_clone_weights(self.decoder.embed_tokens, self.shared) + + def get_input_embeddings(self): + return self.shared + + def set_input_embeddings(self, value): + self.shared = value + self.encoder.embed_tokens = self.shared + self.decoder.embed_tokens = self.shared + + def get_encoder(self): + return self.encoder + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.Tensor] = None, + decoder_head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[list[torch.FloatTensor]] = None, + past_key_values: 
Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + ) -> Union[tuple, Seq2SeqModelOutput]: + r""" + decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Indices of decoder input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are decoder input IDs?](../glossary#decoder-input-ids) + + Bart uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values` + is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`). + + For translation and summarization training, `decoder_input_ids` should be provided. If no + `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right + for denoising pre-training following the paper. + decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also + be used by default. + + If you want to change padding behavior, you should read [`modeling_bart._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more + information on the default strategy. + cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0, + 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + """ + # different to other models, Bart automatically creates decoder_input_ids from + # input_ids if no decoder_input_ids are provided + if decoder_input_ids is None and decoder_inputs_embeds is None: + if input_ids is None: + raise ValueError( + "If no `decoder_input_ids` or `decoder_inputs_embeds` are " + "passed, `input_ids` cannot be `None`. Please pass either " + "`input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`." 
+ ) + + decoder_input_ids = shift_tokens_right( + input_ids, self.config.pad_token_id, self.config.decoder_start_token_id + ) + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if encoder_outputs is None: + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + # decoder outputs consists of (dec_features, past_key_values, dec_hidden, dec_attn) + decoder_outputs = self.decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=attention_mask, + head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + ) + + if not return_dict: + return decoder_outputs + encoder_outputs + + return Seq2SeqModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + +@auto_docstring( + custom_intro=""" + The BART Model with a language modeling head. Can be used for summarization. 
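+
+    A minimal fine-tuning sketch (illustrative only; the checkpoint name, the example texts and the
+    absence of padding/truncation handling are assumptions, not requirements of this class):
+
+    ```python
+    >>> from transformers import AutoTokenizer, BartForConditionalGeneration
+
+    >>> tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")
+    >>> model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")
+
+    >>> inputs = tokenizer("UN Chief Says There Is No Military Solution in Syria", return_tensors="pt")
+    >>> labels = tokenizer("UN Chief Says There Is No Military Solution", return_tensors="pt").input_ids
+
+    >>> # When only `labels` is passed, `decoder_input_ids` is built internally by shifting the
+    >>> # labels one position to the right (see `shift_tokens_right`).
+    >>> loss = model(input_ids=inputs.input_ids, labels=labels).loss
+    ```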
+ """ +) +class BartForConditionalGeneration(BartPreTrainedModel, GenerationMixin): + base_model_prefix = "model" + _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] + _keys_to_ignore_on_load_missing = ["final_logits_bias"] + + def __init__(self, config: BartConfig): + super().__init__(config) + self.model = BartModel(config) + self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings))) + self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_encoder(self): + return self.model.get_encoder() + + def get_decoder(self): + return self.model.get_decoder() + + def resize_token_embeddings( + self, new_num_tokens: int, pad_to_multiple_of: Optional[int] = None, mean_resizing: bool = True + ) -> nn.Embedding: + new_embeddings = super().resize_token_embeddings(new_num_tokens, pad_to_multiple_of, mean_resizing) + self._resize_final_logits_bias(new_embeddings.weight.shape[0]) + return new_embeddings + + def _resize_final_logits_bias(self, new_num_tokens: int) -> None: + old_num_tokens = self.final_logits_bias.shape[-1] + if new_num_tokens <= old_num_tokens: + new_bias = self.final_logits_bias[:, :new_num_tokens] + else: + extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device) + new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1) + self.register_buffer("final_logits_bias", new_bias) + + def _tie_weights(self): + if self.config.tie_word_embeddings: + self.model._tie_weights() + self._tie_or_clone_weights(self.lm_head, self.model.shared) + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.Tensor] = None, + decoder_head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[list[torch.FloatTensor]] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + ) -> Union[tuple, Seq2SeqLMOutput]: + r""" + decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Indices of decoder input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are decoder input IDs?](../glossary#decoder-input-ids) + + Bart uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values` + is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`). + + For translation and summarization training, `decoder_input_ids` should be provided. If no + `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right + for denoising pre-training following the paper. 
+        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
+            be used by default.
+
+            If you want to change padding behavior, you should read [`modeling_bart._prepare_decoder_attention_mask`]
+            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
+            information on the default strategy.
+        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
+            1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Example summarization:
+
+        ```python
+        >>> from transformers import AutoTokenizer, BartForConditionalGeneration
+
+        >>> model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
+        >>> tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
+
+        >>> ARTICLE_TO_SUMMARIZE = (
+        ...     "PG&E stated it scheduled the blackouts in response to forecasts for high winds "
+        ...     "amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were "
+        ...     "scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow."
+        ... )
+        >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors="pt")
+
+        >>> # Generate Summary
+        >>> summary_ids = model.generate(inputs["input_ids"], num_beams=2, min_length=0, max_length=20)
+        >>> tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        'PG&E scheduled the blackouts in response to forecasts for high winds amid dry conditions'
+        ```
+
+        Mask filling example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, BartForConditionalGeneration
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")
+        >>> model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")
+
+        >>> TXT = "My friends are <mask> but they eat too many carbs."
+ >>> input_ids = tokenizer([TXT], return_tensors="pt")["input_ids"] + >>> logits = model(input_ids).logits + + >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item() + >>> probs = logits[0, masked_index].softmax(dim=0) + >>> values, predictions = probs.topk(5) + + >>> tokenizer.decode(predictions).split() + ['not', 'good', 'healthy', 'great', 'very'] + ``` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if labels is not None: + if use_cache: + logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.") + use_cache = False + if decoder_input_ids is None and decoder_inputs_embeds is None: + decoder_input_ids = shift_tokens_right( + labels, self.config.pad_token_id, self.config.decoder_start_token_id + ) + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + encoder_outputs=encoder_outputs, + decoder_attention_mask=decoder_attention_mask, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + ) + + lm_logits = self.lm_head(outputs[0]) + lm_logits = lm_logits + self.final_logits_bias.to(lm_logits.device) + + masked_lm_loss = None + if labels is not None: + labels = labels.to(lm_logits.device) + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (lm_logits,) + outputs[1:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return Seq2SeqLMOutput( + loss=masked_lm_loss, + logits=lm_logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): + return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id) + + +@auto_docstring( + custom_intro=""" + Bart model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE + tasks. 
+ """ +) +class BartForSequenceClassification(BartPreTrainedModel): + _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] + + def __init__(self, config: BartConfig, **kwargs): + super().__init__(config, **kwargs) + self.model = BartModel(config) + self.classification_head = BartClassificationHead( + config.d_model, + config.d_model, + config.num_labels, + config.classifier_dropout, + ) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.Tensor] = None, + decoder_head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[list[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + ) -> Union[tuple, Seq2SeqSequenceClassifierOutput]: + r""" + decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Indices of decoder input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are decoder input IDs?](../glossary#decoder-input-ids) + + Bart uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values` + is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`). + + For translation and summarization training, `decoder_input_ids` should be provided. If no + `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right + for denoising pre-training following the paper. + decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also + be used by default. + + If you want to change padding behavior, you should read [`modeling_bart._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more + information on the default strategy. + cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0, + 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + if input_ids is None and inputs_embeds is not None: + raise NotImplementedError( + f"Passing input embeddings is currently not supported for {self.__class__.__name__}" + ) + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + encoder_outputs=encoder_outputs, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + ) + hidden_states = outputs[0] # last hidden state + + eos_mask = input_ids.eq(self.config.eos_token_id).to(hidden_states.device) + + if len(torch.unique_consecutive(eos_mask.sum(1))) > 1: + raise ValueError("All examples must have the same number of tokens.") + sentence_representation = hidden_states[eos_mask, :].view(hidden_states.size(0), -1, hidden_states.size(-1))[ + :, -1, : + ] + logits = self.classification_head(sentence_representation) + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.config.num_labels == 1: + self.config.problem_type = "regression" + elif self.config.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.config.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return Seq2SeqSequenceClassifierOutput( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + +@auto_docstring +class BartForQuestionAnswering(BartPreTrainedModel): + _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] + + def __init__(self, config): + super().__init__(config) + + config.num_labels = 2 + self.num_labels = config.num_labels + + self.model = BartModel(config) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.Tensor] = None, + 
decoder_head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[list[torch.FloatTensor]] = None, + start_positions: Optional[torch.LongTensor] = None, + end_positions: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + ) -> Union[tuple, Seq2SeqQuestionAnsweringModelOutput]: + r""" + decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Indices of decoder input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are decoder input IDs?](../glossary#decoder-input-ids) + + Bart uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values` + is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`). + + For translation and summarization training, `decoder_input_ids` should be provided. If no + `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right + for denoising pre-training following the paper. + decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also + be used by default. + + If you want to change padding behavior, you should read [`modeling_bart._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more + information on the default strategy. + cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0, + 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. 
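+
+        Example (an illustrative sketch; "valhalla/bart-large-finetuned-squadv1" is assumed here as a
+        BART checkpoint fine-tuned for extractive question answering):
+
+        ```python
+        >>> import torch
+        >>> from transformers import AutoTokenizer, BartForQuestionAnswering
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("valhalla/bart-large-finetuned-squadv1")
+        >>> model = BartForQuestionAnswering.from_pretrained("valhalla/bart-large-finetuned-squadv1")
+
+        >>> question = "Where do the blackouts take place?"
+        >>> context = "PG&E scheduled the blackouts in California in response to forecasts for high winds."
+        >>> inputs = tokenizer(question, context, return_tensors="pt")
+        >>> with torch.no_grad():
+        ...     outputs = model(**inputs)
+
+        >>> start = outputs.start_logits.argmax(dim=-1).item()
+        >>> end = outputs.end_logits.argmax(dim=-1).item()
+        >>> answer = tokenizer.decode(inputs.input_ids[0, start : end + 1])
+        ```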
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if start_positions is not None and end_positions is not None: + use_cache = False + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + encoder_outputs=encoder_outputs, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = ( + start_logits, + end_logits, + ) + outputs[1:] + return ((total_loss,) + output) if total_loss is not None else output + + return Seq2SeqQuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + +class BartDecoderWrapper(BartPreTrainedModel): + """ + This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is + used in combination with the [`EncoderDecoderModel`] framework. + """ + + def __init__(self, config): + super().__init__(config) + self.decoder = BartDecoder(config) + + def forward(self, *args, **kwargs): + return self.decoder(*args, **kwargs) + + +@auto_docstring( + custom_intro=""" + BART decoder with a language modeling head on top (linear layer with weights tied to the input embeddings). 
+ """ +) +class BartForCausalLM(BartPreTrainedModel, GenerationMixin): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + config.is_decoder = True + config.is_encoder_decoder = False + super().__init__(config) + self.model = BartDecoderWrapper(config) + + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.decoder.embed_tokens + + def set_input_embeddings(self, value): + self.model.decoder.embed_tokens = value + + def set_decoder(self, decoder): + self.model.decoder = decoder + + def get_decoder(self): + return self.model.decoder + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + ) -> Union[tuple, CausalLMOutputWithCrossAttentions]: + r""" + cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Example: + + ```python + >>> from transformers import AutoTokenizer, BartForCausalLM + + >>> tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base") + >>> model = BartForCausalLM.from_pretrained("facebook/bart-base", add_cross_attention=False) + >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder." 
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> logits = outputs.logits + >>> expected_shape = [1, inputs.input_ids.shape[-1], model.config.vocab_size] + >>> list(logits.shape) == expected_shape + True + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model.decoder( + input_ids=input_ids, + attention_mask=attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + head_mask=head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + ) + + logits = self.lm_head(outputs[0]) + + loss = None + if labels is not None: + labels = labels.to(logits.device) + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + +__all__ = [ + "BartForCausalLM", + "BartForConditionalGeneration", + "BartForQuestionAnswering", + "BartForSequenceClassification", + "BartModel", + "BartPreTrainedModel", + "BartPretrainedModel", + "PretrainedBartModel", +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bart/modeling_flax_bart.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bart/modeling_flax_bart.py new file mode 100644 index 0000000000000000000000000000000000000000..818254f3bfa1e95685fdcde8af8f074edb69761f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bart/modeling_flax_bart.py @@ -0,0 +1,2006 @@ +# coding=utf-8 +# Copyright 2021 The Fairseq Authors and The Google Flax Team Authors And The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Flax Bart model.""" + +import math +import random +from functools import partial +from typing import Callable, Optional + +import flax.linen as nn +import jax +import jax.numpy as jnp +from flax.core.frozen_dict import FrozenDict, freeze, unfreeze +from flax.linen import combine_masks, make_causal_mask +from flax.linen.attention import dot_product_attention_weights +from flax.traverse_util import flatten_dict, unflatten_dict +from jax import lax +from jax.random import PRNGKey + +from ...modeling_flax_outputs import ( + FlaxBaseModelOutput, + FlaxBaseModelOutputWithPastAndCrossAttentions, + FlaxCausalLMOutputWithCrossAttentions, + FlaxSeq2SeqLMOutput, + FlaxSeq2SeqModelOutput, + FlaxSeq2SeqQuestionAnsweringModelOutput, + FlaxSeq2SeqSequenceClassifierOutput, +) +from ...modeling_flax_utils import ( + ACT2FN, + FlaxPreTrainedModel, + append_call_sample_docstring, + append_replace_return_docstrings, + overwrite_call_docstring, +) +from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings +from .configuration_bart import BartConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "facebook/bart-base" +_CONFIG_FOR_DOC = "BartConfig" + + +BART_START_DOCSTRING = r""" + This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a Flax Linen + [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a + regular Flax Module and refer to the Flax documentation for all matter related to general usage and behavior. + + Finally, this model supports inherent JAX features such as: + + - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit) + - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation) + - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap) + - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap) + + Parameters: + config ([`BartConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights. + dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`): + The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and + `jax.numpy.bfloat16` (on TPUs). + + This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If + specified all the computation will be performed with the given `dtype`. + + **Note that this only specifies the dtype of the computation and does not influence the dtype of model + parameters.** + + If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and + [`~FlaxPreTrainedModel.to_bf16`]. +""" + +BART_INPUTS_DOCSTRING = r""" + Args: + input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. 
+ + [What are input IDs?](../glossary#input-ids) + attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + decoder_input_ids (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*): + Indices of decoder input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are decoder input IDs?](../glossary#decoder-input-ids) + + For translation and summarization training, `decoder_input_ids` should be provided. If no + `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right + for denoising pre-training following the paper. + decoder_attention_mask (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*): + Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also + be used by default. + + If you want to change padding behavior, you should modify to your needs. See diagram 1 in [the + paper](https://huggingface.co/papers/1910.13461) for more information on the default strategy. + position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + decoder_position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the + range `[0, config.max_position_embeddings - 1]`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +BART_ENCODE_INPUTS_DOCSTRING = r""" + Args: + input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. 
+ output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + +BART_DECODE_INPUTS_DOCSTRING = r""" + Args: + decoder_input_ids (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`): + Indices of decoder input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are decoder input IDs?](../glossary#decoder-input-ids) + + For translation and summarization training, `decoder_input_ids` should be provided. If no + `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right + for denoising pre-training following the paper. + encoder_outputs (`tuple(tuple(jnp.ndarray)`): + Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`) + `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of + hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. + encoder_attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + decoder_attention_mask (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*): + Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also + be used by default. + + If you want to change padding behavior, you should modify to your needs. See diagram 1 in [the + paper](https://huggingface.co/papers/1910.13461) for more information on the default strategy. + decoder_position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the + range `[0, config.max_position_embeddings - 1]`. + past_key_values (`dict[str, np.ndarray]`, *optional*, returned by `init_cache` or when passing previous `past_key_values`): + Dictionary of pre-computed hidden-states (key and values in the attention blocks) that can be used for fast + auto-regressive decoding. Pre-computed key and value hidden-states are of shape *[batch_size, max_length]*. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +def shift_tokens_right(input_ids: jnp.ndarray, pad_token_id: int, decoder_start_token_id: int) -> jnp.ndarray: + """ + Shift input ids one token to the right. 
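+
+    For example (an illustrative trace, assuming `pad_token_id=1` and `decoder_start_token_id=2`):
+    the row `[0, 31414, 232, 2]` becomes `[2, 0, 31414, 232]`; any `-100` positions in the shifted
+    result are then replaced by the pad token, so ignored label positions never reach the embedding.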
+ """ + shifted_input_ids = jnp.zeros_like(input_ids) + shifted_input_ids = shifted_input_ids.at[:, 1:].set(input_ids[:, :-1]) + shifted_input_ids = shifted_input_ids.at[:, 0].set(decoder_start_token_id) + + shifted_input_ids = jnp.where(shifted_input_ids == -100, pad_token_id, shifted_input_ids) + return shifted_input_ids + + +class FlaxBartAttention(nn.Module): + config: BartConfig + embed_dim: int + num_heads: int + dropout: float = 0.0 + causal: bool = False + bias: bool = True + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self) -> None: + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" + f" and `num_heads`: {self.num_heads})." + ) + + dense = partial( + nn.Dense, + self.embed_dim, + use_bias=self.bias, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(self.config.init_std), + ) + + self.q_proj, self.k_proj, self.v_proj = dense(), dense(), dense() + self.out_proj = dense() + + self.dropout_layer = nn.Dropout(rate=self.dropout) + + if self.causal: + self.causal_mask = make_causal_mask( + jnp.ones((1, self.config.max_position_embeddings), dtype="bool"), dtype="bool" + ) + + def _split_heads(self, hidden_states): + return hidden_states.reshape(hidden_states.shape[:2] + (self.num_heads, self.head_dim)) + + def _merge_heads(self, hidden_states): + return hidden_states.reshape(hidden_states.shape[:2] + (self.embed_dim,)) + + @nn.compact + def _concatenate_to_cache(self, key, value, query, attention_mask): + """ + This function takes projected key, value states from a single input token and concatenates the states to cached + states from previous steps. This function is slightly adapted from the official Flax repository: + https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252 + """ + # detect if we're initializing by absence of existing cache data. + is_initialized = self.has_variable("cache", "cached_key") + cached_key = self.variable("cache", "cached_key", jnp.zeros, key.shape, key.dtype) + cached_value = self.variable("cache", "cached_value", jnp.zeros, value.shape, value.dtype) + cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32)) + + if is_initialized: + *batch_dims, max_length, num_heads, depth_per_head = cached_key.value.shape + # update key, value caches with our new 1d spatial slices + cur_index = cache_index.value + indices = (0,) * len(batch_dims) + (cur_index, 0, 0) + key = lax.dynamic_update_slice(cached_key.value, key, indices) + value = lax.dynamic_update_slice(cached_value.value, value, indices) + cached_key.value = key + cached_value.value = value + num_updated_cache_vectors = query.shape[1] + cache_index.value = cache_index.value + num_updated_cache_vectors + # causal mask for cached decoder self-attention: our single query position should only attend to those key positions that have already been generated and cached, not the remaining zero elements. 
+ pad_mask = jnp.broadcast_to( + jnp.arange(max_length) < cur_index + num_updated_cache_vectors, + tuple(batch_dims) + (1, num_updated_cache_vectors, max_length), + ) + attention_mask = combine_masks(pad_mask, attention_mask) + return key, value, attention_mask + + def __call__( + self, + hidden_states: jnp.ndarray, + key_value_states: Optional[jnp.ndarray] = None, + attention_mask: Optional[jnp.ndarray] = None, + init_cache: bool = False, + deterministic: bool = True, + ) -> tuple[jnp.ndarray]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + batch_size = hidden_states.shape[0] + + # get query proj + query_states = self.q_proj(hidden_states) + # get key, value proj + if is_cross_attention: + # cross_attentions + key_states = self.k_proj(key_value_states) + value_states = self.v_proj(key_value_states) + else: + # self_attention + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = self._split_heads(query_states) + key_states = self._split_heads(key_states) + value_states = self._split_heads(value_states) + + # handle cache prepare causal attention mask + if self.causal: + query_length, key_length = query_states.shape[1], key_states.shape[1] + if self.has_variable("cache", "cached_key"): + mask_shift = self.variables["cache"]["cache_index"] + max_decoder_length = self.variables["cache"]["cached_key"].shape[1] + causal_mask = lax.dynamic_slice( + self.causal_mask, (0, 0, mask_shift, 0), (1, 1, query_length, max_decoder_length) + ) + else: + causal_mask = self.causal_mask[:, :, :query_length, :key_length] + causal_mask = jnp.broadcast_to(causal_mask, (batch_size,) + causal_mask.shape[1:]) + + # combine masks if needed + if attention_mask is not None and self.causal: + attention_mask = jnp.broadcast_to(jnp.expand_dims(attention_mask, axis=(-3, -2)), causal_mask.shape) + attention_mask = combine_masks(attention_mask, causal_mask) + elif self.causal: + attention_mask = causal_mask + elif attention_mask is not None: + attention_mask = jnp.expand_dims(attention_mask, axis=(-3, -2)) + + # During fast autoregressive decoding, we feed one position at a time, + # and cache the keys and values step by step. + if self.causal and (self.has_variable("cache", "cached_key") or init_cache): + key_states, value_states, attention_mask = self._concatenate_to_cache( + key_states, value_states, query_states, attention_mask + ) + + # Convert the boolean attention mask to an attention bias. 
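+        # Allowed positions (mask value > 0) receive a bias of 0.0, masked positions receive the most
+        # negative finite value of the dtype, so their weight is effectively zero after the softmax
+        # inside `dot_product_attention_weights`.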
+ if attention_mask is not None: + # attention mask in the form of attention bias + attention_bias = lax.select( + attention_mask > 0, + jnp.full(attention_mask.shape, 0.0).astype(self.dtype), + jnp.full(attention_mask.shape, jnp.finfo(self.dtype).min).astype(self.dtype), + ) + else: + attention_bias = None + + dropout_rng = None + if not deterministic and self.dropout > 0.0: + dropout_rng = self.make_rng("dropout") + + attn_weights = dot_product_attention_weights( + query_states, + key_states, + bias=attention_bias, + dropout_rng=dropout_rng, + dropout_rate=self.dropout, + broadcast_dropout=True, + deterministic=deterministic, + dtype=self.dtype, + precision=None, + ) + + attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value_states) + attn_output = self._merge_heads(attn_output) + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights + + +class FlaxBartEncoderLayer(nn.Module): + config: BartConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self) -> None: + self.embed_dim = self.config.d_model + self.self_attn = FlaxBartAttention( + config=self.config, + embed_dim=self.embed_dim, + num_heads=self.config.encoder_attention_heads, + dropout=self.config.attention_dropout, + dtype=self.dtype, + ) + self.self_attn_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05) + self.dropout_layer = nn.Dropout(rate=self.config.dropout) + self.activation_fn = ACT2FN[self.config.activation_function] + self.activation_dropout_layer = nn.Dropout(rate=self.config.activation_dropout) + self.fc1 = nn.Dense( + self.config.encoder_ffn_dim, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(self.config.init_std), + ) + self.fc2 = nn.Dense( + self.embed_dim, dtype=self.dtype, kernel_init=jax.nn.initializers.normal(self.config.init_std) + ) + self.final_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05) + + def __call__( + self, + hidden_states: jnp.ndarray, + attention_mask: jnp.ndarray, + output_attentions: bool = True, + deterministic: bool = True, + ) -> tuple[jnp.ndarray]: + residual = hidden_states + hidden_states, attn_weights = self.self_attn(hidden_states=hidden_states, attention_mask=attention_mask) + + hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = self.activation_dropout_layer(hidden_states, deterministic=deterministic) + hidden_states = self.fc2(hidden_states) + hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class FlaxBartEncoderLayerCollection(nn.Module): + config: BartConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.layers = [ + FlaxBartEncoderLayer(self.config, name=str(i), dtype=self.dtype) for i in range(self.config.encoder_layers) + ] + self.layerdrop = self.config.encoder_layerdrop + + def __call__( + self, + hidden_states, + attention_mask, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + all_attentions = () if output_attentions else None + all_hidden_states = () if output_hidden_states else None + + for encoder_layer in 
self.layers: + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description) + dropout_probability = random.uniform(0, 1) + if not deterministic and (dropout_probability < self.layerdrop): # skip the layer + layer_outputs = (None, None) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + output_attentions, + deterministic, + ) + hidden_states = layer_outputs[0] + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states += (hidden_states,) + + outputs = (hidden_states, all_hidden_states, all_attentions) + + if not return_dict: + return tuple(v for v in outputs if v is not None) + + return FlaxBaseModelOutput( + last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions + ) + + +class FlaxBartDecoderLayer(nn.Module): + config: BartConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self) -> None: + self.embed_dim = self.config.d_model + self.self_attn = FlaxBartAttention( + config=self.config, + embed_dim=self.embed_dim, + num_heads=self.config.decoder_attention_heads, + dropout=self.config.attention_dropout, + causal=True, + dtype=self.dtype, + ) + self.dropout_layer = nn.Dropout(rate=self.config.dropout) + self.activation_fn = ACT2FN[self.config.activation_function] + self.activation_dropout_layer = nn.Dropout(rate=self.config.activation_dropout) + + self.self_attn_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05) + self.encoder_attn = FlaxBartAttention( + config=self.config, + embed_dim=self.embed_dim, + num_heads=self.config.decoder_attention_heads, + dropout=self.config.attention_dropout, + dtype=self.dtype, + ) + self.encoder_attn_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05) + self.fc1 = nn.Dense( + self.config.decoder_ffn_dim, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(self.config.init_std), + ) + self.fc2 = nn.Dense( + self.embed_dim, dtype=self.dtype, kernel_init=jax.nn.initializers.normal(self.config.init_std) + ) + self.final_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05) + + def __call__( + self, + hidden_states: jnp.ndarray, + attention_mask: jnp.ndarray, + encoder_hidden_states: Optional[jnp.ndarray] = None, + encoder_attention_mask: Optional[jnp.ndarray] = None, + init_cache: bool = False, + output_attentions: bool = True, + deterministic: bool = True, + ) -> tuple[jnp.ndarray]: + residual = hidden_states + + # Self Attention + hidden_states, self_attn_weights = self.self_attn( + hidden_states=hidden_states, attention_mask=attention_mask, init_cache=init_cache + ) + hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Cross-Attention Block + cross_attn_weights = None + if encoder_hidden_states is not None: + residual = hidden_states + + hidden_states, cross_attn_weights = self.encoder_attn( + hidden_states=hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + ) + hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic) + hidden_states = residual + hidden_states + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + # Fully Connected + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = self.activation_dropout_layer(hidden_states, 
deterministic=deterministic) + hidden_states = self.fc2(hidden_states) + hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights, cross_attn_weights) + + return outputs + + +class FlaxBartDecoderLayerCollection(nn.Module): + config: BartConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.layers = [ + FlaxBartDecoderLayer(self.config, name=str(i), dtype=self.dtype) for i in range(self.config.decoder_layers) + ] + self.layerdrop = self.config.decoder_layerdrop + + def __call__( + self, + hidden_states, + attention_mask, + encoder_hidden_states: Optional[jnp.ndarray] = None, + encoder_attention_mask: Optional[jnp.ndarray] = None, + deterministic: bool = True, + init_cache: bool = False, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + + for decoder_layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description) + dropout_probability = random.uniform(0, 1) + if not deterministic and (dropout_probability < self.layerdrop): + layer_outputs = (None, None, None) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + init_cache=init_cache, + output_attentions=output_attentions, + deterministic=deterministic, + ) + + hidden_states = layer_outputs[0] + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if encoder_hidden_states is not None: + all_cross_attentions += (layer_outputs[2],) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + outputs = [hidden_states, all_hidden_states, all_self_attns, all_cross_attentions] + + if not return_dict: + return tuple(v for v in outputs if v is not None) + + return FlaxBaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attentions, + ) + + +class FlaxBartClassificationHead(nn.Module): + """Head for sentence-level classification tasks.""" + + config: BartConfig + inner_dim: int + num_classes: int + pooler_dropout: float + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.dense = nn.Dense( + self.inner_dim, dtype=self.dtype, kernel_init=jax.nn.initializers.normal(self.config.init_std) + ) + self.dropout = nn.Dropout(rate=self.pooler_dropout) + self.out_proj = nn.Dense( + self.num_classes, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(self.config.init_std), + ) + + def __call__(self, hidden_states: jnp.ndarray, deterministic: bool): + hidden_states = self.dropout(hidden_states, deterministic=deterministic) + hidden_states = self.dense(hidden_states) + hidden_states = jnp.tanh(hidden_states) + hidden_states = self.dropout(hidden_states, deterministic=deterministic) + hidden_states = self.out_proj(hidden_states) + return hidden_states + + +class FlaxBartEncoder(nn.Module): 
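+ # Encoder stack: token embeddings (scaled by sqrt(d_model) when config.scale_embedding is set) plus learned positional embeddings with a fixed offset of 2, followed by embedding layernorm, dropout and config.encoder_layers FlaxBartEncoderLayer blocks.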
+ config: BartConfig + embed_tokens: nn.Embed + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.dropout_layer = nn.Dropout(rate=self.config.dropout) + + embed_dim = self.config.d_model + self.padding_idx = self.config.pad_token_id + self.max_source_positions = self.config.max_position_embeddings + self.embed_scale = math.sqrt(embed_dim) if self.config.scale_embedding else 1.0 + + # Bart is set up so that if padding_idx is specified then offset the embedding ids by 2 + # and adjust num_embeddings appropriately. Other models don't have this hack + self.offset = 2 + self.embed_positions = nn.Embed( + self.config.max_position_embeddings + self.offset, + embed_dim, + embedding_init=jax.nn.initializers.normal(self.config.init_std), + dtype=self.dtype, + ) + self.layers = FlaxBartEncoderLayerCollection(self.config, self.dtype) + self.layernorm_embedding = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05) + + def __call__( + self, + input_ids, + attention_mask, + position_ids, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + deterministic: bool = True, + ): + input_shape = input_ids.shape + input_ids = input_ids.reshape(-1, input_shape[-1]) + + inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale + + embed_pos = self.embed_positions(position_ids + self.offset) + + hidden_states = inputs_embeds + embed_pos + hidden_states = self.layernorm_embedding(hidden_states) + hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic) + + outputs = self.layers( + hidden_states, + attention_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + return outputs + + return FlaxBaseModelOutput( + last_hidden_state=outputs.last_hidden_state, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class FlaxBartDecoder(nn.Module): + config: BartConfig + embed_tokens: nn.Embed + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.dropout_layer = nn.Dropout(rate=self.config.dropout) + + embed_dim = self.config.d_model + self.padding_idx = self.config.pad_token_id + self.max_target_positions = self.config.max_position_embeddings + self.embed_scale = math.sqrt(self.config.d_model) if self.config.scale_embedding else 1.0 + + # Bart is set up so that if padding_idx is specified then offset the embedding ids by 2 + # and adjust num_embeddings appropriately. 
Other models don't have this hack + self.offset = 2 + self.embed_positions = nn.Embed( + self.config.max_position_embeddings + self.offset, + embed_dim, + embedding_init=jax.nn.initializers.normal(self.config.init_std), + dtype=self.dtype, + ) + + self.layers = FlaxBartDecoderLayerCollection(self.config, self.dtype) + self.layernorm_embedding = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05) + + def __call__( + self, + input_ids, + attention_mask, + position_ids, + encoder_hidden_states: Optional[jnp.ndarray] = None, + encoder_attention_mask: Optional[jnp.ndarray] = None, + init_cache: bool = False, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + deterministic: bool = True, + ): + input_shape = input_ids.shape + input_ids = input_ids.reshape(-1, input_shape[-1]) + + inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale + + # embed positions + positions = self.embed_positions(position_ids + self.offset) + + hidden_states = inputs_embeds + positions + hidden_states = self.layernorm_embedding(hidden_states) + + hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic) + + outputs = self.layers( + hidden_states, + attention_mask, + encoder_hidden_states, + encoder_attention_mask, + deterministic=deterministic, + init_cache=init_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + return outputs + + return FlaxBaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=outputs.last_hidden_state, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + +class FlaxBartModule(nn.Module): + config: BartConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.shared = nn.Embed( + self.config.vocab_size, + self.config.d_model, + embedding_init=jax.nn.initializers.normal(self.config.init_std), + dtype=self.dtype, + ) + + self.encoder = FlaxBartEncoder(self.config, dtype=self.dtype, embed_tokens=self.shared) + self.decoder = FlaxBartDecoder(self.config, dtype=self.dtype, embed_tokens=self.shared) + + def _get_encoder_module(self): + return self.encoder + + def _get_decoder_module(self): + return self.decoder + + def __call__( + self, + input_ids, + attention_mask, + decoder_input_ids, + decoder_attention_mask, + position_ids, + decoder_position_ids, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + deterministic: bool = True, + ): + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + deterministic=deterministic, + ) + + decoder_outputs = self.decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + position_ids=decoder_position_ids, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + deterministic=deterministic, + ) + + if not return_dict: + return decoder_outputs + encoder_outputs + + return FlaxSeq2SeqModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + 
cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + +class FlaxBartPreTrainedModel(FlaxPreTrainedModel): + config_class = BartConfig + base_model_prefix: str = "model" + module_class: nn.Module = None + + def __init__( + self, + config: BartConfig, + input_shape: tuple[int] = (1, 1), + seed: int = 0, + dtype: jnp.dtype = jnp.float32, + _do_init: bool = True, + **kwargs, + ): + module = self.module_class(config=config, dtype=dtype, **kwargs) + super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) + + def init_weights(self, rng: jax.random.PRNGKey, input_shape: tuple, params: FrozenDict = None) -> FrozenDict: + # init input tensors + input_ids = jnp.zeros(input_shape, dtype="i4") + # make sure initialization pass will work for FlaxBartForSequenceClassificationModule + input_ids = input_ids.at[(..., -1)].set(self.config.eos_token_id) + attention_mask = jnp.ones_like(input_ids) + decoder_input_ids = input_ids + decoder_attention_mask = jnp.ones_like(input_ids) + + batch_size, sequence_length = input_ids.shape + position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)) + decoder_position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)) + + params_rng, dropout_rng = jax.random.split(rng) + rngs = {"params": params_rng, "dropout": dropout_rng} + + random_params = self.module.init( + rngs, + input_ids, + attention_mask, + decoder_input_ids, + decoder_attention_mask, + position_ids, + decoder_position_ids, + )["params"] + + if params is not None: + random_params = flatten_dict(unfreeze(random_params)) + params = flatten_dict(unfreeze(params)) + for missing_key in self._missing_keys: + params[missing_key] = random_params[missing_key] + self._missing_keys = set() + return freeze(unflatten_dict(params)) + else: + return random_params + + def init_cache(self, batch_size, max_length, encoder_outputs): + r""" + Args: + batch_size (`int`): + batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache. + max_length (`int`): + maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized + cache. + encoder_outputs (`Union[FlaxBaseModelOutput, tuple(tuple(jnp.ndarray)]`): + `encoder_outputs` consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: + `attentions`). `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) + is a sequence of hidden-states at the output of the last layer of the encoder. Used in the + cross-attention of the decoder. 
+ """ + # init input variables to retrieve cache + decoder_input_ids = jnp.ones((batch_size, max_length), dtype="i4") + decoder_attention_mask = jnp.ones_like(decoder_input_ids) + decoder_position_ids = jnp.broadcast_to( + jnp.arange(jnp.atleast_2d(decoder_input_ids).shape[-1]), decoder_input_ids.shape + ) + + def _decoder_forward(module, decoder_input_ids, decoder_attention_mask, decoder_position_ids, **kwargs): + decoder_module = module._get_decoder_module() + return decoder_module( + decoder_input_ids, + decoder_attention_mask, + decoder_position_ids, + **kwargs, + ) + + init_variables = self.module.init( + jax.random.PRNGKey(0), + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + decoder_position_ids=decoder_position_ids, + encoder_hidden_states=encoder_outputs[0], + init_cache=True, + method=_decoder_forward, # we only need to call the decoder to init the cache + ) + return unfreeze(init_variables["cache"]) + + @add_start_docstrings(BART_ENCODE_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=FlaxBaseModelOutput, config_class=BartConfig) + def encode( + self, + input_ids: jnp.ndarray, + attention_mask: Optional[jnp.ndarray] = None, + position_ids: Optional[jnp.ndarray] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + train: bool = False, + params: Optional[dict] = None, + dropout_rng: PRNGKey = None, + ): + r""" + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, FlaxBartForConditionalGeneration + + >>> model = FlaxBartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn") + >>> tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn") + + >>> text = "My friends are cool but they eat too many carbs." 
+ >>> inputs = tokenizer(text, max_length=1024, return_tensors="jax") + >>> encoder_outputs = model.encode(**inputs) + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + if attention_mask is None: + attention_mask = jnp.ones_like(input_ids) + if position_ids is None: + batch_size, sequence_length = input_ids.shape + position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)) + + # Handle any PRNG if needed + rngs = {} + if dropout_rng is not None: + rngs["dropout"] = dropout_rng + + def _encoder_forward(module, input_ids, attention_mask, position_ids, **kwargs): + encode_module = module._get_encoder_module() + return encode_module(input_ids, attention_mask, position_ids, **kwargs) + + return self.module.apply( + {"params": params or self.params}, + input_ids=jnp.array(input_ids, dtype="i4"), + attention_mask=jnp.array(attention_mask, dtype="i4"), + position_ids=jnp.array(position_ids, dtype="i4"), + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + deterministic=not train, + rngs=rngs, + method=_encoder_forward, + ) + + @add_start_docstrings(BART_DECODE_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=FlaxBaseModelOutputWithPastAndCrossAttentions, config_class=BartConfig) + def decode( + self, + decoder_input_ids, + encoder_outputs, + encoder_attention_mask: Optional[jnp.ndarray] = None, + decoder_attention_mask: Optional[jnp.ndarray] = None, + decoder_position_ids: Optional[jnp.ndarray] = None, + past_key_values: Optional[dict] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + train: bool = False, + params: Optional[dict] = None, + dropout_rng: PRNGKey = None, + ): + r""" + Returns: + + Example: + + ```python + >>> import jax.numpy as jnp + >>> from transformers import AutoTokenizer, FlaxBartForConditionalGeneration + + >>> model = FlaxBartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn") + >>> tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn") + + >>> text = "My friends are cool but they eat too many carbs." 
+ >>> inputs = tokenizer(text, max_length=1024, return_tensors="jax") + >>> encoder_outputs = model.encode(**inputs) + + >>> decoder_start_token_id = model.config.decoder_start_token_id + >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id + + >>> outputs = model.decode(decoder_input_ids, encoder_outputs) + >>> last_decoder_hidden_states = outputs.last_hidden_state + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + encoder_hidden_states = encoder_outputs[0] + if encoder_attention_mask is None: + batch_size, sequence_length = encoder_hidden_states.shape[:2] + encoder_attention_mask = jnp.ones((batch_size, sequence_length)) + + batch_size, sequence_length = decoder_input_ids.shape + if decoder_attention_mask is None: + decoder_attention_mask = jnp.ones((batch_size, sequence_length)) + + if decoder_position_ids is None: + if past_key_values is not None: + raise ValueError("Make sure to provide `decoder_position_ids` when passing `past_key_values`.") + + decoder_position_ids = jnp.broadcast_to( + jnp.arange(sequence_length)[None, :], (batch_size, sequence_length) + ) + + # Handle any PRNG if needed + rngs = {} + if dropout_rng is not None: + rngs["dropout"] = dropout_rng + + inputs = {"params": params or self.params} + + # if past_key_values are passed then cache is already initialized a private flag init_cache has to be + # passed down to ensure cache is used. It has to be made sure that cache is marked as mutable so that + # it can be changed by FlaxBartAttention module + if past_key_values: + inputs["cache"] = past_key_values + mutable = ["cache"] + else: + mutable = False + + def _decoder_forward(module, decoder_input_ids, decoder_attention_mask, decoder_position_ids, **kwargs): + decoder_module = module._get_decoder_module() + return decoder_module( + decoder_input_ids, + decoder_attention_mask, + decoder_position_ids, + **kwargs, + ) + + outputs = self.module.apply( + inputs, + decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"), + decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"), + decoder_position_ids=jnp.array(decoder_position_ids, dtype="i4"), + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=jnp.array(encoder_attention_mask, dtype="i4"), + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + deterministic=not train, + rngs=rngs, + mutable=mutable, + method=_decoder_forward, + ) + + # add updated cache to model output + if past_key_values is not None and return_dict: + outputs, past = outputs + outputs["past_key_values"] = unfreeze(past["cache"]) + return outputs + elif past_key_values is not None and not return_dict: + outputs, past = outputs + outputs = outputs[:1] + (unfreeze(past["cache"]),) + outputs[1:] + + return outputs + + @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING) + def __call__( + self, + input_ids: jnp.ndarray, + attention_mask: Optional[jnp.ndarray] = None, + decoder_input_ids: Optional[jnp.ndarray] = None, + decoder_attention_mask: Optional[jnp.ndarray] = None, + position_ids: Optional[jnp.ndarray] = None, + decoder_position_ids: Optional[jnp.ndarray] = None, + output_attentions: Optional[bool] = None, + 
output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + train: bool = False, + params: Optional[dict] = None, + dropout_rng: PRNGKey = None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + # prepare encoder inputs + if attention_mask is None: + attention_mask = jnp.ones_like(input_ids) + if position_ids is None: + batch_size, sequence_length = input_ids.shape + position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)) + + # prepare decoder inputs + if decoder_input_ids is None: + decoder_input_ids = shift_tokens_right( + input_ids, self.config.pad_token_id, decoder_start_token_id=self.config.decoder_start_token_id + ) + if decoder_attention_mask is None: + decoder_attention_mask = jnp.ones_like(decoder_input_ids) + if decoder_position_ids is None: + batch_size, sequence_length = decoder_input_ids.shape + decoder_position_ids = jnp.broadcast_to( + jnp.arange(sequence_length)[None, :], (batch_size, sequence_length) + ) + + # Handle any PRNG if needed + rngs = {"dropout": dropout_rng} if dropout_rng is not None else {} + + return self.module.apply( + {"params": params or self.params}, + input_ids=jnp.array(input_ids, dtype="i4"), + attention_mask=jnp.array(attention_mask, dtype="i4"), + position_ids=jnp.array(position_ids, dtype="i4"), + decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"), + decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"), + decoder_position_ids=jnp.array(decoder_position_ids, dtype="i4"), + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + deterministic=not train, + rngs=rngs, + ) + + +@add_start_docstrings( + "The bare Bart Model transformer outputting raw hidden-states without any specific head on top.", + BART_START_DOCSTRING, +) +class FlaxBartModel(FlaxBartPreTrainedModel): + config: BartConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + module_class = FlaxBartModule + + +append_call_sample_docstring(FlaxBartModel, _CHECKPOINT_FOR_DOC, FlaxSeq2SeqModelOutput, _CONFIG_FOR_DOC) + + +class FlaxBartForConditionalGenerationModule(nn.Module): + config: BartConfig + dtype: jnp.dtype = jnp.float32 + bias_init: Callable[..., jnp.ndarray] = jax.nn.initializers.zeros + + def setup(self): + self.model = FlaxBartModule(config=self.config, dtype=self.dtype) + self.lm_head = nn.Dense( + self.model.shared.num_embeddings, + use_bias=False, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(self.config.init_std), + ) + self.final_logits_bias = self.param("final_logits_bias", self.bias_init, (1, self.model.shared.num_embeddings)) + + def _get_encoder_module(self): + return self.model.encoder + + def _get_decoder_module(self): + return self.model.decoder + + def __call__( + self, + input_ids, + attention_mask, + decoder_input_ids, + decoder_attention_mask, + position_ids, + decoder_position_ids, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + deterministic: bool = True, + ): + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + position_ids=position_ids, + 
decoder_position_ids=decoder_position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + deterministic=deterministic, + ) + + hidden_states = outputs[0] + + if self.config.tie_word_embeddings: + shared_embedding = self.model.variables["params"]["shared"]["embedding"] + lm_logits = self.lm_head.apply({"params": {"kernel": shared_embedding.T}}, hidden_states) + else: + lm_logits = self.lm_head(hidden_states) + + lm_logits += jax.lax.stop_gradient(self.final_logits_bias.astype(self.dtype)) + + if not return_dict: + output = (lm_logits,) + outputs[1:] + return output + + return FlaxSeq2SeqLMOutput( + logits=lm_logits, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + +@add_start_docstrings( + "The BART Model with a language modeling head. Can be used for summarization.", BART_START_DOCSTRING +) +class FlaxBartForConditionalGeneration(FlaxBartPreTrainedModel): + module_class = FlaxBartForConditionalGenerationModule + dtype: jnp.dtype = jnp.float32 + + @add_start_docstrings(BART_DECODE_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=FlaxCausalLMOutputWithCrossAttentions, config_class=BartConfig) + def decode( + self, + decoder_input_ids, + encoder_outputs, + encoder_attention_mask: Optional[jnp.ndarray] = None, + decoder_attention_mask: Optional[jnp.ndarray] = None, + decoder_position_ids: Optional[jnp.ndarray] = None, + past_key_values: Optional[dict] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + train: bool = False, + params: Optional[dict] = None, + dropout_rng: PRNGKey = None, + ): + r""" + Returns: + + Example: + + ```python + >>> import jax.numpy as jnp + >>> from transformers import AutoTokenizer, FlaxBartForConditionalGeneration + + >>> model = FlaxBartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn") + >>> tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn") + + >>> text = "My friends are cool but they eat too many carbs." 
+ >>> inputs = tokenizer(text, max_length=1024, return_tensors="jax") + >>> encoder_outputs = model.encode(**inputs) + + >>> decoder_start_token_id = model.config.decoder_start_token_id + >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id + + >>> outputs = model.decode(decoder_input_ids, encoder_outputs) + >>> logits = outputs.logits + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + encoder_hidden_states = encoder_outputs[0] + if encoder_attention_mask is None: + batch_size, sequence_length = encoder_hidden_states.shape[:2] + encoder_attention_mask = jnp.ones((batch_size, sequence_length)) + + batch_size, sequence_length = decoder_input_ids.shape + if decoder_attention_mask is None: + decoder_attention_mask = jnp.ones((batch_size, sequence_length)) + + if decoder_position_ids is None: + if past_key_values is not None: + raise ValueError("Make sure to provide `decoder_position_ids` when passing `past_key_values`.") + + decoder_position_ids = jnp.broadcast_to( + jnp.arange(sequence_length)[None, :], (batch_size, sequence_length) + ) + + # Handle any PRNG if needed + rngs = {} + if dropout_rng is not None: + rngs["dropout"] = dropout_rng + + inputs = {"params": params or self.params} + + # if past_key_values are passed then cache is already initialized a private flag init_cache has to be + # passed down to ensure cache is used. It has to be made sure that cache is marked as mutable so that + # it can be changed by FlaxBartAttention module + if past_key_values: + inputs["cache"] = past_key_values + mutable = ["cache"] + else: + mutable = False + + def _decoder_forward(module, decoder_input_ids, decoder_attention_mask, decoder_position_ids, **kwargs): + decoder_module = module._get_decoder_module() + outputs = decoder_module( + decoder_input_ids, + decoder_attention_mask, + decoder_position_ids, + **kwargs, + ) + hidden_states = outputs[0] + + if self.config.tie_word_embeddings: + shared_embedding = module.model.variables["params"]["shared"]["embedding"] + lm_logits = module.lm_head.apply({"params": {"kernel": shared_embedding.T}}, hidden_states) + else: + lm_logits = module.lm_head(hidden_states) + + lm_logits += module.final_logits_bias.astype(self.dtype) + return lm_logits, outputs + + outputs = self.module.apply( + inputs, + decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"), + decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"), + decoder_position_ids=jnp.array(decoder_position_ids, dtype="i4"), + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=jnp.array(encoder_attention_mask, dtype="i4"), + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + deterministic=not train, + rngs=rngs, + mutable=mutable, + method=_decoder_forward, + ) + + if past_key_values is None: + lm_logits, decoder_outputs = outputs + else: + (lm_logits, decoder_outputs), past = outputs + + if return_dict: + outputs = FlaxCausalLMOutputWithCrossAttentions( + logits=lm_logits, + hidden_states=decoder_outputs.hidden_states, + attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + ) + else: + outputs = (lm_logits,) + decoder_outputs[1:] + + # add updated 
cache to model output + if past_key_values is not None and return_dict: + outputs["past_key_values"] = unfreeze(past["cache"]) + return outputs + elif past_key_values is not None and not return_dict: + outputs = outputs[:1] + (unfreeze(past["cache"]),) + outputs[1:] + + return outputs + + def prepare_inputs_for_generation( + self, + decoder_input_ids, + max_length, + attention_mask: Optional[jax.Array] = None, + decoder_attention_mask: Optional[jax.Array] = None, + encoder_outputs=None, + **kwargs, + ): + # initializing the cache + batch_size, seq_length = decoder_input_ids.shape + + past_key_values = self.init_cache(batch_size, max_length, encoder_outputs) + # Note that usually one would have to put 0's in the attention_mask for x > input_ids.shape[-1] and x < cache_length. + # But since the decoder uses a causal mask, those positions are masked anyways. + # Thus we can create a single static attention_mask here, which is more efficient for compilation + extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4") + if decoder_attention_mask is not None: + position_ids = decoder_attention_mask.cumsum(axis=-1) - 1 + extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, decoder_attention_mask, (0, 0)) + else: + position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length)) + + return { + "past_key_values": past_key_values, + "encoder_outputs": encoder_outputs, + "encoder_attention_mask": attention_mask, + "decoder_attention_mask": extended_attention_mask, + "decoder_position_ids": position_ids, + } + + def update_inputs_for_generation(self, model_outputs, model_kwargs): + model_kwargs["past_key_values"] = model_outputs.past_key_values + model_kwargs["decoder_position_ids"] = model_kwargs["decoder_position_ids"][:, -1:] + 1 + return model_kwargs + + +FLAX_BART_CONDITIONAL_GENERATION_DOCSTRING = """ + Returns: + + Summarization example: + + ```python + >>> from transformers import AutoTokenizer, FlaxBartForConditionalGeneration + + >>> model = FlaxBartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn") + >>> tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn") + + >>> ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs." + >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors="np") + + >>> # Generate Summary + >>> summary_ids = model.generate(inputs["input_ids"]).sequences + >>> print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)) + ``` + + Mask filling example: + + ```python + >>> import jax + >>> from transformers import AutoTokenizer, FlaxBartForConditionalGeneration + + >>> model = FlaxBartForConditionalGeneration.from_pretrained("facebook/bart-large") + >>> tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large") + + >>> TXT = "My friends are <mask> but they eat too many carbs."
+ >>> input_ids = tokenizer([TXT], return_tensors="jax")["input_ids"] + + >>> logits = model(input_ids).logits + >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero()[0].item() + >>> probs = jax.nn.softmax(logits[0, masked_index], axis=0) + >>> values, predictions = jax.lax.top_k(probs, k=1) + + >>> tokenizer.decode(predictions).split() + ``` +""" + +overwrite_call_docstring( + FlaxBartForConditionalGeneration, BART_INPUTS_DOCSTRING + FLAX_BART_CONDITIONAL_GENERATION_DOCSTRING +) +append_replace_return_docstrings( + FlaxBartForConditionalGeneration, output_type=FlaxSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC +) + + +class FlaxBartForSequenceClassificationModule(nn.Module): + config: BartConfig + dtype: jnp.dtype = jnp.float32 + num_labels: Optional[int] = None + + def setup(self): + self.model = FlaxBartModule(config=self.config, dtype=self.dtype) + self.classification_head = FlaxBartClassificationHead( + config=self.config, + inner_dim=self.config.d_model, + num_classes=self.num_labels if self.num_labels is not None else self.config.num_labels, + pooler_dropout=self.config.classifier_dropout, + ) + + def _get_encoder_module(self): + return self.model.encoder + + def _get_decoder_module(self): + return self.model.decoder + + def __call__( + self, + input_ids, + attention_mask, + decoder_input_ids, + decoder_attention_mask, + position_ids, + decoder_position_ids, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + deterministic: bool = True, + ): + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + position_ids=position_ids, + decoder_position_ids=decoder_position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + deterministic=deterministic, + ) + + hidden_states = outputs[0] # last hidden state + + eos_mask = jnp.where(input_ids == self.config.eos_token_id, 1, 0) + + # The first condition is necessary to overcome jax._src.errors.ConcretizationTypeError during JIT compilation + if not isinstance(eos_mask, jax.interpreters.partial_eval.DynamicJaxprTracer): + if len(jnp.unique(eos_mask.sum(1))) > 1: + raise ValueError("All examples must have the same number of tokens.") + + if any(eos_mask.sum(1) == 0): + raise ValueError("There are missing tokens in input_ids") + + # Ensure to keep 1 only for the last token for each example + eos_mask_noised = eos_mask + jnp.arange(eos_mask.shape[1]) * 1e-6 + eos_mask = jnp.where(eos_mask_noised == eos_mask_noised.max(1).reshape(-1, 1), 1, 0) + + sentence_representation = jnp.einsum("ijk, ij -> ijk", hidden_states, eos_mask).sum(1) + logits = self.classification_head(sentence_representation, deterministic=deterministic) + + if not return_dict: + output = (logits,) + outputs[1:] + return output + + return FlaxSeq2SeqSequenceClassifierOutput( + logits=logits, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + +@add_start_docstrings( + """ + Bart model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE + tasks. 
+ """, + BART_START_DOCSTRING, +) +class FlaxBartForSequenceClassification(FlaxBartPreTrainedModel): + module_class = FlaxBartForSequenceClassificationModule + dtype = jnp.float32 + + +append_call_sample_docstring( + FlaxBartForSequenceClassification, + _CHECKPOINT_FOR_DOC, + FlaxSeq2SeqSequenceClassifierOutput, + _CONFIG_FOR_DOC, +) + + +class FlaxBartForQuestionAnsweringModule(nn.Module): + config: BartConfig + dtype: jnp.dtype = jnp.float32 + num_labels = 2 + + def setup(self): + self.model = FlaxBartModule(config=self.config, dtype=self.dtype) + self.qa_outputs = nn.Dense( + self.num_labels, dtype=self.dtype, kernel_init=jax.nn.initializers.normal(self.config.init_std) + ) + + def _get_encoder_module(self): + return self.model.encoder + + def _get_decoder_module(self): + return self.model.decoder + + def __call__( + self, + input_ids, + attention_mask, + decoder_input_ids, + decoder_attention_mask, + position_ids, + decoder_position_ids, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + deterministic: bool = True, + ): + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + position_ids=position_ids, + decoder_position_ids=decoder_position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + deterministic=deterministic, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = jnp.split(logits, logits.shape[-1], axis=-1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) + + if not return_dict: + output = (start_logits, end_logits) + outputs[1:] + return output + + return FlaxSeq2SeqQuestionAnsweringModelOutput( + start_logits=start_logits, + end_logits=end_logits, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + +@add_start_docstrings( + """ + BART Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layer on top of the hidden-states output to compute `span start logits` and `span end logits`). 
+ """, + BART_START_DOCSTRING, +) +class FlaxBartForQuestionAnswering(FlaxBartPreTrainedModel): + module_class = FlaxBartForQuestionAnsweringModule + dtype = jnp.float32 + + +append_call_sample_docstring( + FlaxBartForQuestionAnswering, + _CHECKPOINT_FOR_DOC, + FlaxSeq2SeqQuestionAnsweringModelOutput, + _CONFIG_FOR_DOC, +) + + +class FlaxBartDecoderPreTrainedModel(FlaxPreTrainedModel): + config_class = BartConfig + base_model_prefix: str = "model" + module_class: nn.Module = None + + def __init__( + self, + config: BartConfig, + input_shape: tuple[int] = (1, 1), + seed: int = 0, + dtype: jnp.dtype = jnp.float32, + _do_init: bool = True, + **kwargs, + ): + config.is_decoder = True + config.is_encoder_decoder = False + module = self.module_class(config=config, dtype=dtype, **kwargs) + super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) + + def init_weights(self, rng: jax.random.PRNGKey, input_shape: tuple, params: FrozenDict = None) -> FrozenDict: + # init input tensors + input_ids = jnp.zeros(input_shape, dtype="i4") + attention_mask = jnp.ones_like(input_ids) + + batch_size, sequence_length = input_ids.shape + position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)) + + params_rng, dropout_rng = jax.random.split(rng) + rngs = {"params": params_rng, "dropout": dropout_rng} + encoder_hidden_states = jnp.zeros(input_shape + (self.config.d_model,)) + encoder_attention_mask = attention_mask + module_init_outputs = self.module.init( + rngs, + input_ids, + attention_mask, + position_ids, + encoder_hidden_states, + encoder_attention_mask, + return_dict=False, + ) + return module_init_outputs["params"] + + def init_cache(self, batch_size, max_length): + r""" + Args: + batch_size (`int`): + batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache. + max_length (`int`): + maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized + cache. 
+ """ + # init input variables to retrieve cache + input_ids = jnp.ones((batch_size, max_length), dtype="i4") + attention_mask = jnp.ones_like(input_ids, dtype="i4") + position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape) + + init_variables = self.module.init( + jax.random.PRNGKey(0), input_ids, attention_mask, position_ids, return_dict=False, init_cache=True + ) + return unfreeze(init_variables["cache"]) + + @add_start_docstrings_to_model_forward(BART_DECODE_INPUTS_DOCSTRING) + def __call__( + self, + input_ids: jnp.ndarray, + attention_mask: Optional[jnp.ndarray] = None, + position_ids: Optional[jnp.ndarray] = None, + encoder_hidden_states: Optional[jnp.ndarray] = None, + encoder_attention_mask: Optional[jnp.ndarray] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + train: bool = False, + params: Optional[dict] = None, + past_key_values: Optional[dict] = None, + dropout_rng: PRNGKey = None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + if encoder_hidden_states is not None and encoder_attention_mask is None: + batch_size, sequence_length = encoder_hidden_states.shape[:2] + encoder_attention_mask = jnp.ones((batch_size, sequence_length)) + + # prepare decoder inputs + if attention_mask is None: + attention_mask = jnp.ones_like(input_ids) + if position_ids is None: + batch_size, sequence_length = input_ids.shape + position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)) + + # Handle any PRNG if needed + rngs = {"dropout": dropout_rng} if dropout_rng is not None else {} + + inputs = {"params": params or self.params} + + # if past_key_values are passed then cache is already initialized a private flag init_cache has to be passed + # down to ensure cache is used. It has to be made sure that cache is marked as mutable so that it can be + # changed by FlaxBartAttention module + if past_key_values: + inputs["cache"] = past_key_values + mutable = ["cache"] + else: + mutable = False + + outputs = self.module.apply( + inputs, + input_ids=jnp.array(input_ids, dtype="i4"), + attention_mask=jnp.array(attention_mask, dtype="i4"), + position_ids=jnp.array(position_ids, dtype="i4"), + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + deterministic=not train, + rngs=rngs, + mutable=mutable, + ) + + # add updated cache to model output + if past_key_values is not None and return_dict: + outputs, past_key_values = outputs + outputs["past_key_values"] = unfreeze(past_key_values["cache"]) + return outputs + elif past_key_values is not None and not return_dict: + outputs, past_key_values = outputs + outputs = outputs[:1] + (unfreeze(past_key_values["cache"]),) + outputs[1:] + + return outputs + + +class FlaxBartDecoderWrapper(nn.Module): + """ + This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is + used in combination with the [`EncoderDecoderModel`] framework. 
+ """ + + config: BartConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + embed_dim = self.config.d_model + embed_tokens = nn.Embed( + self.config.vocab_size, + embed_dim, + embedding_init=jax.nn.initializers.normal(self.config.init_std), + dtype=self.dtype, + ) + self.decoder = FlaxBartDecoder(config=self.config, embed_tokens=embed_tokens, dtype=self.dtype) + + def __call__(self, *args, **kwargs): + return self.decoder(*args, **kwargs) + + +class FlaxBartForCausalLMModule(nn.Module): + config: BartConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.model = FlaxBartDecoderWrapper(config=self.config, dtype=self.dtype) + self.lm_head = nn.Dense( + self.config.vocab_size, + use_bias=False, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(self.config.init_std), + ) + + def __call__( + self, + input_ids, + attention_mask, + position_ids, + encoder_hidden_states: Optional[jnp.ndarray] = None, + encoder_attention_mask: Optional[jnp.ndarray] = None, + init_cache: bool = False, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + deterministic: bool = True, + ): + outputs = self.model( + input_ids, + attention_mask, + position_ids, + encoder_hidden_states, + encoder_attention_mask, + deterministic=deterministic, + init_cache=init_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + + if self.config.tie_word_embeddings: + shared_embedding = self.model.variables["params"]["decoder"]["embed_tokens"]["embedding"] + lm_logits = self.lm_head.apply({"params": {"kernel": shared_embedding.T}}, hidden_states) + else: + lm_logits = self.lm_head(hidden_states) + + if not return_dict: + return (lm_logits,) + outputs[1:] + + return FlaxCausalLMOutputWithCrossAttentions( + logits=lm_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + +@add_start_docstrings( + """ + Bart Decoder Model with a language modeling head on top (linear layer with weights tied to the input embeddings) + e.g for autoregressive tasks. + """, + BART_START_DOCSTRING, +) +class FlaxBartForCausalLM(FlaxBartDecoderPreTrainedModel): + module_class = FlaxBartForCausalLMModule + + def prepare_inputs_for_generation(self, input_ids, max_length, attention_mask: Optional[jax.Array] = None): + # initializing the cache + batch_size, seq_length = input_ids.shape + + past_key_values = self.init_cache(batch_size, max_length) + # Note that usually one would have to put 0's in the attention_mask for x > input_ids.shape[-1] and x < cache_length. + # But since the decoder uses a causal mask, those positions are masked anyway. 
+ # Thus, we can create a single static attention_mask here, which is more efficient for compilation + extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4") + if attention_mask is not None: + position_ids = attention_mask.cumsum(axis=-1) - 1 + extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, attention_mask, (0, 0)) + else: + position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length)) + + return { + "past_key_values": past_key_values, + "attention_mask": extended_attention_mask, + "position_ids": position_ids, + } + + def update_inputs_for_generation(self, model_outputs, model_kwargs): + model_kwargs["past_key_values"] = model_outputs.past_key_values + model_kwargs["position_ids"] = model_kwargs["position_ids"][:, -1:] + 1 + return model_kwargs + + +append_call_sample_docstring( + FlaxBartForCausalLM, + _CHECKPOINT_FOR_DOC, + FlaxCausalLMOutputWithCrossAttentions, + _CONFIG_FOR_DOC, +) + + +__all__ = [ + "FlaxBartDecoderPreTrainedModel", + "FlaxBartForCausalLM", + "FlaxBartForConditionalGeneration", + "FlaxBartForQuestionAnswering", + "FlaxBartForSequenceClassification", + "FlaxBartModel", + "FlaxBartPreTrainedModel", +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bart/modeling_tf_bart.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bart/modeling_tf_bart.py new file mode 100644 index 0000000000000000000000000000000000000000..0a6d2317d696f9a9d1edce17f256ceccc8134e49 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bart/modeling_tf_bart.py @@ -0,0 +1,1713 @@ +# coding=utf-8 +# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
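+# Note: this TF 2.0 port mirrors the Flax model above, but it caches decoder state as explicit per-layer past_key_value tuples inside TFBartAttention rather than using Flax-style mutable "cache" variables.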
+"""TF 2.0 Bart model.""" + +from __future__ import annotations + +import random + +import numpy as np +import tensorflow as tf + +from ...activations_tf import get_tf_activation +from ...modeling_tf_outputs import ( + TFBaseModelOutput, + TFBaseModelOutputWithPastAndCrossAttentions, + TFSeq2SeqLMOutput, + TFSeq2SeqModelOutput, + TFSeq2SeqSequenceClassifierOutput, +) + +# Public API +from ...modeling_tf_utils import ( + TFCausalLanguageModelingLoss, + TFModelInputType, + TFPreTrainedModel, + TFSequenceClassificationLoss, + keras, + keras_serializable, + unpack_inputs, +) +from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax +from ...utils import ( + add_code_sample_docstrings, + add_end_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from .configuration_bart import BartConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "facebook/bart-large" +_CONFIG_FOR_DOC = "BartConfig" + + +LARGE_NEGATIVE = -1e8 + + +def shift_tokens_right(input_ids: tf.Tensor, pad_token_id: int, decoder_start_token_id: int): + pad_token_id = tf.cast(pad_token_id, input_ids.dtype) + decoder_start_token_id = tf.cast(decoder_start_token_id, input_ids.dtype) + start_tokens = tf.fill( + (shape_list(input_ids)[0], 1), tf.convert_to_tensor(decoder_start_token_id, input_ids.dtype) + ) + shifted_input_ids = tf.concat([start_tokens, input_ids[:, :-1]], -1) + # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids = tf.where( + shifted_input_ids == -100, + tf.fill(shape_list(shifted_input_ids), tf.convert_to_tensor(pad_token_id, input_ids.dtype)), + shifted_input_ids, + ) + + # "Verify that `labels` has only positive values and -100" + assert_gte0 = tf.debugging.assert_greater_equal(shifted_input_ids, tf.constant(0, dtype=input_ids.dtype)) + + # Make sure the assertion op is called by wrapping the result in an identity no-op + with tf.control_dependencies([assert_gte0]): + shifted_input_ids = tf.identity(shifted_input_ids) + + return shifted_input_ids + + +def _make_causal_mask(input_ids_shape: tf.TensorShape, past_key_values_length: int = 0): + """ + Make causal mask used for bi-directional self-attention. + """ + bsz = input_ids_shape[0] + tgt_len = input_ids_shape[1] + mask = tf.ones((tgt_len, tgt_len)) * LARGE_NEGATIVE + mask_cond = tf.range(shape_list(mask)[-1]) + + mask = tf.where(mask_cond < tf.reshape(mask_cond + 1, (shape_list(mask)[-1], 1)), 0.0, mask) + + if past_key_values_length > 0: + mask = tf.concat([tf.zeros((tgt_len, past_key_values_length)), mask], axis=-1) + + return tf.tile(mask[None, None, :, :], (bsz, 1, 1, 1)) + + +def _expand_mask(mask: tf.Tensor, tgt_len: int | None = None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. + """ + src_len = shape_list(mask)[1] + tgt_len = tgt_len if tgt_len is not None else src_len + one_cst = tf.constant(1.0) + mask = tf.cast(mask, dtype=one_cst.dtype) + expanded_mask = tf.tile(mask[:, None, None, :], (1, 1, tgt_len, 1)) + + return (one_cst - expanded_mask) * LARGE_NEGATIVE + + +class TFBartLearnedPositionalEmbedding(keras.layers.Embedding): + """ + This module learns positional embeddings up to a fixed maximum size. + """ + + def __init__(self, num_embeddings: int, embedding_dim: int, **kwargs): + # Bart is set up so that if padding_idx is specified then offset the embedding ids by 2 + # and adjust num_embeddings appropriately. 
Other models don't have this hack + self.offset = 2 + super().__init__(num_embeddings + self.offset, embedding_dim, **kwargs) + + def call( + self, + input_shape: tf.TensorShape | None = None, + past_key_values_length: int = 0, + position_ids: tf.Tensor | None = None, + ): + """Input is expected to be of size [bsz x seqlen].""" + if position_ids is None: + seq_len = input_shape[1] + position_ids = tf.range(seq_len, delta=1, name="range") + position_ids += past_key_values_length + + offset_dtype = position_ids.dtype if isinstance(position_ids, tf.Tensor) else tf.int32 + return super().call(position_ids + tf.constant(self.offset, dtype=offset_dtype)) + + +class TFBartAttention(keras.layers.Layer): + """Multi-headed attention from "Attention Is All You Need""" + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + is_decoder: bool = False, + bias: bool = True, + **kwargs, + ): + super().__init__(**kwargs) + self.embed_dim = embed_dim + + self.num_heads = num_heads + self.dropout = keras.layers.Dropout(dropout) + self.head_dim = embed_dim // num_heads + if (self.head_dim * num_heads) != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" + f" and `num_heads`: {num_heads})." + ) + self.scaling = self.head_dim**-0.5 + self.is_decoder = is_decoder + + self.k_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj") + self.q_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj") + self.v_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj") + self.out_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj") + + def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int): + return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3)) + + def call( + self, + hidden_states: tf.Tensor, + key_value_states: tf.Tensor | None = None, + past_key_value: tuple[tuple[tf.Tensor]] | None = None, + attention_mask: tf.Tensor | None = None, + layer_head_mask: tf.Tensor | None = None, + training: bool | None = False, + ) -> tuple[tf.Tensor, tf.Tensor | None]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + bsz, tgt_len, embed_dim = shape_list(hidden_states) + + # get query proj + query_states = self.q_proj(hidden_states) * self.scaling + # get key, value proj + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_states = past_key_value[0] + value_states = past_key_value[1] + elif is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states), -1, bsz) + elif past_key_value is not None: + # reuse k, v, self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + key_states = tf.concat([past_key_value[0], key_states], axis=2) + value_states = tf.concat([past_key_value[1], value_states], axis=2) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + if self.is_decoder: + # if cross_attention save Tuple(tf.Tensor, tf.Tensor) of all cross attention key/value_states. 
+ # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(tf.Tensor, tf.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_states, value_states) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = tf.reshape(self._shape(query_states, tgt_len, bsz), proj_shape) + key_states = tf.reshape(key_states, proj_shape) + value_states = tf.reshape(value_states, proj_shape) + + src_len = shape_list(key_states)[1] + attn_weights = tf.matmul(query_states, key_states, transpose_b=True) + + tf.debugging.assert_equal( + shape_list(attn_weights), + [bsz * self.num_heads, tgt_len, src_len], + message=( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f" {shape_list(attn_weights)}" + ), + ) + + if attention_mask is not None: + tf.debugging.assert_equal( + shape_list(attention_mask), + [bsz, 1, tgt_len, src_len], + message=( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is" + f" {shape_list(attention_mask)}" + ), + ) + + attention_mask = tf.cast(attention_mask, dtype=attn_weights.dtype) + attn_weights = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len)) + attention_mask + attn_weights = tf.reshape(attn_weights, (bsz * self.num_heads, tgt_len, src_len)) + + attn_weights = stable_softmax(attn_weights, axis=-1) + + if layer_head_mask is not None: + tf.debugging.assert_equal( + shape_list(layer_head_mask), + [self.num_heads], + message=( + f"Head mask for a single layer should be of size {(self.num_heads)}, but is" + f" {shape_list(layer_head_mask)}" + ), + ) + + attn_weights = tf.reshape(layer_head_mask, (1, -1, 1, 1)) * tf.reshape( + attn_weights, (bsz, self.num_heads, tgt_len, src_len) + ) + attn_weights = tf.reshape(attn_weights, (bsz * self.num_heads, tgt_len, src_len)) + + attn_probs = self.dropout(attn_weights, training=training) + attn_output = tf.matmul(attn_probs, value_states) + + tf.debugging.assert_equal( + shape_list(attn_output), + [bsz * self.num_heads, tgt_len, self.head_dim], + message=( + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is" + f" {shape_list(attn_output)}" + ), + ) + + attn_output = tf.transpose( + tf.reshape(attn_output, (bsz, self.num_heads, tgt_len, self.head_dim)), (0, 2, 1, 3) + ) + attn_output = tf.reshape(attn_output, (bsz, tgt_len, embed_dim)) + + attn_output = self.out_proj(attn_output) + attn_weights: tf.Tensor = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len)) + + return attn_output, attn_weights, past_key_value + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "k_proj", None) is not None: + with tf.name_scope(self.k_proj.name): + self.k_proj.build([None, None, self.embed_dim]) + if getattr(self, "q_proj", None) is not None: + with tf.name_scope(self.q_proj.name): + self.q_proj.build([None, None, self.embed_dim]) + if getattr(self, "v_proj", None) is not None: + with tf.name_scope(self.v_proj.name): + self.v_proj.build([None, None, self.embed_dim]) + if getattr(self, "out_proj", None) is not None: + with tf.name_scope(self.out_proj.name): + self.out_proj.build([None, None, 
self.embed_dim]) + + +class TFBartEncoderLayer(keras.layers.Layer): + def __init__(self, config: BartConfig, **kwargs): + super().__init__(**kwargs) + self.embed_dim = config.d_model + self.self_attn = TFBartAttention( + self.embed_dim, config.encoder_attention_heads, dropout=config.attention_dropout, name="self_attn" + ) + self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") + self.dropout = keras.layers.Dropout(config.dropout) + self.activation_fn = get_tf_activation(config.activation_function) + self.activation_dropout = keras.layers.Dropout(config.activation_dropout) + self.fc1 = keras.layers.Dense(config.encoder_ffn_dim, name="fc1") + self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2") + self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") + self.config = config + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: np.ndarray | tf.Tensor | None, + layer_head_mask: tf.Tensor | None, + training: bool | None = False, + ) -> tf.Tensor: + """ + Args: + hidden_states (`tf.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`tf.Tensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size + `(encoder_attention_heads,)` + """ + residual = hidden_states + hidden_states, self_attn_weights, _ = self.self_attn( + hidden_states=hidden_states, attention_mask=attention_mask, layer_head_mask=layer_head_mask + ) + + tf.debugging.assert_equal( + shape_list(hidden_states), + shape_list(residual), + message=f"Self attn modified the shape of query {shape_list(residual)} to {shape_list(hidden_states)}", + ) + + hidden_states = self.dropout(hidden_states, training=training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = self.activation_dropout(hidden_states, training=training) + hidden_states = self.fc2(hidden_states) + hidden_states = self.dropout(hidden_states, training=training) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + return hidden_states, self_attn_weights + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attn", None) is not None: + with tf.name_scope(self.self_attn.name): + self.self_attn.build(None) + if getattr(self, "self_attn_layer_norm", None) is not None: + with tf.name_scope(self.self_attn_layer_norm.name): + self.self_attn_layer_norm.build([None, None, self.embed_dim]) + if getattr(self, "fc1", None) is not None: + with tf.name_scope(self.fc1.name): + self.fc1.build([None, None, self.embed_dim]) + if getattr(self, "fc2", None) is not None: + with tf.name_scope(self.fc2.name): + self.fc2.build([None, None, self.config.encoder_ffn_dim]) + if getattr(self, "final_layer_norm", None) is not None: + with tf.name_scope(self.final_layer_norm.name): + self.final_layer_norm.build([None, None, self.embed_dim]) + + +class TFBartDecoderLayer(keras.layers.Layer): + def __init__(self, config: BartConfig, **kwargs): + super().__init__(**kwargs) + self.embed_dim = config.d_model + self.self_attn = TFBartAttention( + embed_dim=self.embed_dim, + num_heads=config.decoder_attention_heads, + dropout=config.attention_dropout, + 
name="self_attn", + is_decoder=True, + ) + self.dropout = keras.layers.Dropout(config.dropout) + self.activation_fn = get_tf_activation(config.activation_function) + self.activation_dropout = keras.layers.Dropout(config.activation_dropout) + + self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") + self.encoder_attn = TFBartAttention( + self.embed_dim, + config.decoder_attention_heads, + dropout=config.attention_dropout, + name="encoder_attn", + is_decoder=True, + ) + self.encoder_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm") + self.fc1 = keras.layers.Dense(config.decoder_ffn_dim, name="fc1") + self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2") + self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") + self.config = config + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: np.ndarray | tf.Tensor | None = None, + encoder_hidden_states: np.ndarray | tf.Tensor | None = None, + encoder_attention_mask: np.ndarray | tf.Tensor | None = None, + layer_head_mask: tf.Tensor | None = None, + cross_attn_layer_head_mask: tf.Tensor | None = None, + past_key_value: tuple[tuple[np.ndarray | tf.Tensor]] | None = None, + training: bool | None = False, + ) -> tuple[tf.Tensor, tf.Tensor, tuple[tuple[tf.Tensor]]]: + """ + Args: + hidden_states (`tf.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`tf.Tensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + encoder_hidden_states (`tf.Tensor`): + cross attention input to the layer of shape `(batch, seq_len, embed_dim)` + encoder_attention_mask (`tf.Tensor`): encoder attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size + `(decoder_attention_heads,)` + cross_attn_layer_head_mask (`tf.Tensor`): mask for heads of the cross-attention module. 
+ `(decoder_attention_heads,)` + past_key_value (`Tuple(tf.Tensor)`): cached past key and value projection states + """ + residual = hidden_states + + # Self Attention + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + # add present self-attn cache to positions 1,2 of present_key_value tuple + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + past_key_value=self_attn_past_key_value, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + ) + hidden_states = self.dropout(hidden_states, training=training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Cross-Attention Block + cross_attn_present_key_value = None + cross_attn_weights = None + if encoder_hidden_states is not None: + residual = hidden_states + + # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn( + hidden_states=hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + layer_head_mask=cross_attn_layer_head_mask, + past_key_value=cross_attn_past_key_value, + ) + hidden_states = self.dropout(hidden_states, training=training) + hidden_states = residual + hidden_states + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + # add cross-attn to positions 3,4 of present_key_value tuple + present_key_value = present_key_value + cross_attn_present_key_value + + # Fully Connected + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = self.activation_dropout(hidden_states, training=training) + hidden_states = self.fc2(hidden_states) + hidden_states = self.dropout(hidden_states, training=training) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + return ( + hidden_states, + self_attn_weights, + cross_attn_weights, + present_key_value, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attn", None) is not None: + with tf.name_scope(self.self_attn.name): + self.self_attn.build(None) + if getattr(self, "self_attn_layer_norm", None) is not None: + with tf.name_scope(self.self_attn_layer_norm.name): + self.self_attn_layer_norm.build([None, None, self.embed_dim]) + if getattr(self, "encoder_attn", None) is not None: + with tf.name_scope(self.encoder_attn.name): + self.encoder_attn.build(None) + if getattr(self, "encoder_attn_layer_norm", None) is not None: + with tf.name_scope(self.encoder_attn_layer_norm.name): + self.encoder_attn_layer_norm.build([None, None, self.embed_dim]) + if getattr(self, "fc1", None) is not None: + with tf.name_scope(self.fc1.name): + self.fc1.build([None, None, self.embed_dim]) + if getattr(self, "fc2", None) is not None: + with tf.name_scope(self.fc2.name): + self.fc2.build([None, None, self.config.decoder_ffn_dim]) + if getattr(self, "final_layer_norm", None) is not None: + with tf.name_scope(self.final_layer_norm.name): + self.final_layer_norm.build([None, None, self.embed_dim]) + + +class TFBartClassificationHead(keras.layers.Layer): + """Head for sentence-level classification tasks.""" + + def __init__(self, inner_dim: int, num_classes: int, 
pooler_dropout: float, name: str, **kwargs): + super().__init__(name=name, **kwargs) + self.dense = keras.layers.Dense(inner_dim, name="dense") + self.dropout = keras.layers.Dropout(pooler_dropout) + self.out_proj = keras.layers.Dense(num_classes, name="out_proj") + self.input_dim = inner_dim + self.inner_dim = inner_dim + + def call(self, inputs): + hidden_states = self.dropout(inputs) + hidden_states = self.dense(hidden_states) + hidden_states = keras.activations.tanh(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.out_proj(hidden_states) + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.input_dim]) + if getattr(self, "out_proj", None) is not None: + with tf.name_scope(self.out_proj.name): + self.out_proj.build([None, None, self.inner_dim]) + + +class TFBartPretrainedModel(TFPreTrainedModel): + config_class = BartConfig + base_model_prefix = "model" + + @property + def dummy_inputs(self): + dummy_inputs = super().dummy_inputs + # Dummy inputs should not contain the default val of 1 + # as this is the padding token and some assertions check it + dummy_inputs["input_ids"] = dummy_inputs["input_ids"] * 2 + if "decoder_input_ids" in dummy_inputs: + dummy_inputs["decoder_input_ids"] = dummy_inputs["decoder_input_ids"] * 2 + return dummy_inputs + + def tf_to_pt_weight_rename(self, tf_weight): + if tf_weight == "model.shared.weight": + return tf_weight, "model.decoder.embed_tokens.weight" + else: + return (tf_weight,) + + +BART_START_DOCSTRING = r""" + This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and + behavior. + + + + TensorFlow models and layers in `transformers` accept two formats as input: + + - having all inputs as keyword arguments (like PyTorch models), or + - having all inputs as a list, tuple or dict in the first positional argument. + + The reason the second format is supported is that Keras methods prefer this format when passing inputs to models + and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just + pass your inputs and labels in any format that `model.fit()` supports! 
If, however, you want to use the second + format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with + the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first + positional argument: + + - a single Tensor with `input_ids` only and nothing else: `model(input_ids)` + - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: + `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])` + - a dictionary with one or several input Tensors associated to the input names given in the docstring: + `model({"input_ids": input_ids, "token_type_ids": token_type_ids})` + + Note that when creating models and layers with + [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry + about any of this, as you can just pass inputs like you would to any other Python function! + + + + Args: + config ([`BartConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +BART_GENERATION_EXAMPLE = r""" + Summarization example: + + ```python + >>> from transformers import AutoTokenizer, TFBartForConditionalGeneration + + >>> model = TFBartForConditionalGeneration.from_pretrained("facebook/bart-large") + >>> tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large") + + >>> ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs." + >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors="tf") + + >>> # Generate Summary + >>> summary_ids = model.generate(inputs["input_ids"], num_beams=4, max_length=5) + >>> print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)) + ``` + + Mask filling example: + + ```python + >>> from transformers import AutoTokenizer, TFBartForConditionalGeneration + + >>> tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large") + >>> TXT = "My friends are but they eat too many carbs." + + >>> model = TFBartForConditionalGeneration.from_pretrained("facebook/bart-large") + >>> input_ids = tokenizer([TXT], return_tensors="tf")["input_ids"] + >>> logits = model(input_ids).logits + >>> probs = tf.nn.softmax(logits[0]) + >>> # probs[5] is associated with the mask token + ``` +""" + + +BART_INPUTS_DOCSTRING = r""" + Args: + input_ids (`tf.Tensor` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`tf.Tensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + decoder_input_ids (`tf.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Indices of decoder input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. 
+ + [What are decoder input IDs?](../glossary#decoder-input-ids) + + Bart uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values` + is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`). + + For translation and summarization training, `decoder_input_ids` should be provided. If no + `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right + for denoising pre-training following the paper. + decoder_attention_mask (`tf.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*): + will be made by default and ignore pad tokens. It is not recommended to set this for most use cases. + decoder_position_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the + range `[0, config.max_position_embeddings - 1]`. + head_mask (`tf.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + decoder_head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + cross_attn_head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + encoder_outputs (`tf.FloatTensor`, *optional*): + hidden states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. + of shape `(batch_size, sequence_length, hidden_size)` is a sequence of + past_key_values (`tuple[tuple[tf.Tensor]]` of length `config.n_layers`) + contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + use_cache (`bool`, *optional*, defaults to `True`): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). Set to `False` during training, `True` during generation + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the + config will be used instead. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. 
See `hidden_states` under returned tensors for + more detail. This argument can be used only in eager mode, in graph mode the value in the config will be + used instead. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in + eager mode, in graph mode the value will always be set to True. + training (`bool`, *optional*, defaults to `False`): + Whether or not to use the model in training mode (some modules like dropout modules have different + behaviors between training and evaluation). +""" + + +@keras_serializable +class TFBartEncoder(keras.layers.Layer): + config_class = BartConfig + """ + Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a + [`TFBartEncoderLayer`]. + + Args: + config: BartConfig + """ + + def __init__(self, config: BartConfig, embed_tokens: keras.layers.Embedding | None = None, **kwargs): + super().__init__(**kwargs) + self.config = config + self.dropout = keras.layers.Dropout(config.dropout) + self.layerdrop = config.encoder_layerdrop + self.padding_idx = config.pad_token_id + self.max_source_positions = config.max_position_embeddings + self.embed_scale = tf.math.sqrt(float(config.d_model)) if config.scale_embedding else 1.0 + + self.embed_tokens = embed_tokens + self.embed_positions = TFBartLearnedPositionalEmbedding( + config.max_position_embeddings, + config.d_model, + name="embed_positions", + ) + self.layers = [TFBartEncoderLayer(config, name=f"layers.{i}") for i in range(config.encoder_layers)] + self.layernorm_embedding = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding") + self.embed_dim = config.d_model + + @unpack_inputs + def call( + self, + input_ids: TFModelInputType | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool | None = False, + ) -> TFBaseModelOutput | tuple[tf.Tensor]: + """ + Args: + input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + head_mask (`tf.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, `optional): + Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. 
+ output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = shape_list(input_ids) + elif inputs_embeds is not None: + input_shape = shape_list(inputs_embeds)[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if inputs_embeds is None: + check_embeddings_within_bounds(input_ids, self.embed_tokens.input_dim) + inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale + + embed_pos = self.embed_positions(input_shape) + hidden_states = inputs_embeds + embed_pos + hidden_states = self.layernorm_embedding(hidden_states) + hidden_states = self.dropout(hidden_states, training=training) + + # check attention mask and invert + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _expand_mask(attention_mask) + else: + attention_mask = None + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + # check if head_mask has a correct number of layers specified if desired + if head_mask is not None: + tf.debugging.assert_equal( + shape_list(head_mask)[0], + len(self.layers), + message=( + f"The head_mask should be specified for {len(self.layers)} layers, but it is for" + f" {shape_list(head_mask)[0]}." + ), + ) + + # encoder layers + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description) + dropout_probability = random.uniform(0, 1) + if training and (dropout_probability < self.layerdrop): # skip the layer + continue + + hidden_states, attn = encoder_layer( + hidden_states, + attention_mask, + head_mask[idx] if head_mask is not None else None, + ) + + if output_attentions: + all_attentions += (attn,) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return TFBaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embed_positions", None) is not None: + with tf.name_scope(self.embed_positions.name): + self.embed_positions.build(None) + if getattr(self, "layernorm_embedding", None) is not None: + with tf.name_scope(self.layernorm_embedding.name): + self.layernorm_embedding.build([None, None, self.embed_dim]) + if getattr(self, "layers", None) is not None: + for layer in self.layers: + with tf.name_scope(layer.name): + layer.build(None) + + +@keras_serializable +class TFBartDecoder(keras.layers.Layer): + config_class = BartConfig + """ + Transformer decoder consisting of *config.decoder_layers* layers. 
Each layer is a [`TFBartDecoderLayer`] + + Args: + config: BartConfig + embed_tokens: output embedding + """ + + def __init__(self, config: BartConfig, embed_tokens: keras.layers.Embedding | None = None, **kwargs): + super().__init__(**kwargs) + self.config = config + self.padding_idx = config.pad_token_id + self.embed_tokens = embed_tokens + self.layerdrop = config.decoder_layerdrop + self.embed_positions = TFBartLearnedPositionalEmbedding( + config.max_position_embeddings, + config.d_model, + name="embed_positions", + ) + self.embed_scale = tf.math.sqrt(float(config.d_model)) if config.scale_embedding else 1.0 + self.layers = [TFBartDecoderLayer(config, name=f"layers.{i}") for i in range(config.decoder_layers)] + self.layernorm_embedding = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding") + + self.dropout = keras.layers.Dropout(config.dropout) + + @unpack_inputs + def call( + self, + input_ids: TFModelInputType | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + encoder_hidden_states: np.ndarray | tf.Tensor | None = None, + encoder_attention_mask: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + cross_attn_head_mask: np.ndarray | tf.Tensor | None = None, + past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None, + use_cache: bool | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool | None = False, + ) -> TFBaseModelOutputWithPastAndCrossAttentions | tuple[tf.Tensor]: + r""" + Args: + input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + position_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the + range `[0, config.max_position_embeddings - 1]`. + encoder_hidden_states (`tf.Tensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + of the decoder. + encoder_attention_mask (`tf.Tensor` of shape `(batch_size, encoder_sequence_length)`, *optional*): + Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values + selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. 
+ + cross_attn_head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + past_key_values (`tuple[tuple[tf.Tensor]]` of length `config.n_layers` with each tuple having 2 tuples each of which has 2 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up + decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those + that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of + all `decoder_input_ids` of shape `(batch_size, sequence_length)`. + inputs_embeds (`tf.tTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + input_shape = shape_list(input_ids) + elif inputs_embeds is not None: + input_shape = shape_list(inputs_embeds)[:-1] + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + past_key_values_length = shape_list(past_key_values[0][0])[2] if past_key_values is not None else 0 + + # embed positions + if position_ids is None: + positions = self.embed_positions(input_shape, past_key_values_length) + else: + positions = self.embed_positions(input_shape, position_ids=position_ids) + + if inputs_embeds is None: + check_embeddings_within_bounds(input_ids, self.embed_tokens.input_dim) + inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale + + hidden_states = inputs_embeds + + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + if input_shape[-1] > 1: + combined_attention_mask = _make_causal_mask(input_shape, past_key_values_length=past_key_values_length) + else: + combined_attention_mask = _expand_mask( + tf.ones((input_shape[0], input_shape[1] + past_key_values_length)), tgt_len=input_shape[-1] + ) + + if attention_mask is not None: + combined_attention_mask = combined_attention_mask + _expand_mask(attention_mask, tgt_len=input_shape[-1]) + + if encoder_hidden_states is not None and encoder_attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + encoder_attention_mask = _expand_mask(encoder_attention_mask, tgt_len=input_shape[-1]) + + hidden_states = self.layernorm_embedding(hidden_states + positions) + hidden_states = self.dropout(hidden_states, training=training) + + # decoder layers + all_hidden_states = () if 
output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attns = () if (output_attentions and encoder_hidden_states is not None) else None + present_key_values = () if use_cache else None + + # check if head_mask and cross_attn_head_mask have a correct number of layers specified if desired + for attn_mask_name, attn_mask in [("head_mask", head_mask), ("cross_attn_head_mask", cross_attn_head_mask)]: + if attn_mask is not None: + tf.debugging.assert_equal( + shape_list(attn_mask)[0], + len(self.layers), + message=( + f"The {attn_mask_name} should be specified for {len(self.layers)} layers, but it is for" + f" {shape_list(attn_mask)[0]}." + ), + ) + + for idx, decoder_layer in enumerate(self.layers): + # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description) + if output_hidden_states: + all_hidden_states += (hidden_states,) + + dropout_probability = random.uniform(0, 1) + + if training and (dropout_probability < self.layerdrop): + continue + + past_key_value = past_key_values[idx] if past_key_values is not None else None + + hidden_states, layer_self_attn, layer_cross_attn, present_key_value = decoder_layer( + hidden_states, + attention_mask=combined_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + layer_head_mask=head_mask[idx] if head_mask is not None else None, + cross_attn_layer_head_mask=cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None, + past_key_value=past_key_value, + ) + + if use_cache: + present_key_values += (present_key_value,) + + if output_attentions: + all_self_attns += (layer_self_attn,) + + if encoder_hidden_states is not None: + all_cross_attns += (layer_cross_attn,) + + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if not return_dict: + return hidden_states, present_key_values, all_hidden_states, all_self_attns, all_cross_attns + else: + return TFBaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=present_key_values, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attns, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embed_positions", None) is not None: + with tf.name_scope(self.embed_positions.name): + self.embed_positions.build(None) + if getattr(self, "layernorm_embedding", None) is not None: + with tf.name_scope(self.layernorm_embedding.name): + self.layernorm_embedding.build([None, None, self.config.d_model]) + if getattr(self, "layers", None) is not None: + for layer in self.layers: + with tf.name_scope(layer.name): + layer.build(None) + + +@keras_serializable +class TFBartMainLayer(keras.layers.Layer): + config_class = BartConfig + + def __init__(self, config: BartConfig, load_weight_prefix=None, **kwargs): + super().__init__(**kwargs) + self.config = config + self.shared = keras.layers.Embedding( + input_dim=config.vocab_size, + output_dim=config.d_model, + embeddings_initializer=keras.initializers.TruncatedNormal(stddev=self.config.init_std), + name="model.shared", + ) + # Additional attribute to specify the expected name scope of the layer (for loading/storing weights) + self.shared.load_weight_prefix = "model.shared" if load_weight_prefix is None else load_weight_prefix + + self.encoder = TFBartEncoder(config, self.shared, name="encoder") + self.decoder = TFBartDecoder(config, self.shared, name="decoder") + + def get_input_embeddings(self): + 
return self.shared + + def set_input_embeddings(self, new_embeddings): + self.shared = new_embeddings + self.encoder.embed_tokens = self.shared + self.decoder.embed_tokens = self.shared + + @unpack_inputs + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + decoder_input_ids: np.ndarray | tf.Tensor | None = None, + decoder_attention_mask: np.ndarray | tf.Tensor | None = None, + decoder_position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + decoder_head_mask: np.ndarray | tf.Tensor | None = None, + cross_attn_head_mask: np.ndarray | tf.Tensor | None = None, + encoder_outputs: tuple | TFBaseModelOutput | None = None, + past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + decoder_inputs_embeds: np.ndarray | tf.Tensor | None = None, + use_cache: bool | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool | None = False, + **kwargs, + ) -> TFSeq2SeqModelOutput | tuple[tf.Tensor]: + # different to other models, Bart automatically creates decoder_input_ids from + # input_ids if no decoder_input_ids are provided + if decoder_input_ids is None and decoder_inputs_embeds is None: + if input_ids is None: + raise ValueError( + "If no `decoder_input_ids` or `decoder_inputs_embeds` are " + "passed, `input_ids` cannot be `None`. Please pass either " + "`input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`." + ) + + decoder_input_ids = shift_tokens_right( + input_ids, self.config.pad_token_id, self.config.decoder_start_token_id + ) + + if encoder_outputs is None: + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a TFBaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, TFBaseModelOutput): + encoder_outputs = TFBaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + # If the user passed a TFBaseModelOutput for encoder_outputs, we wrap it in a tuple when return_dict=False + elif not return_dict and not isinstance(encoder_outputs, tuple): + encoder_outputs = encoder_outputs.to_tuple() + + decoder_outputs = self.decoder( + decoder_input_ids, + attention_mask=decoder_attention_mask, + position_ids=decoder_position_ids, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=attention_mask, + head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + if not return_dict: + return decoder_outputs + encoder_outputs + + return TFSeq2SeqModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + 
cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + # The shared/tied weights expect to be in the model base namespace + # Adding "/" to the end (not the start!) of a tf.name_scope puts it in the root namespace rather than + # the current one. + with tf.name_scope(self.shared.load_weight_prefix + "/" + self.shared.name + "/"): + self.shared.build(None) + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "decoder", None) is not None: + with tf.name_scope(self.decoder.name): + self.decoder.build(None) + + +@add_start_docstrings( + "The bare BART Model outputting raw hidden-states without any specific head on top.", + BART_START_DOCSTRING, +) +class TFBartModel(TFBartPretrainedModel): + _requires_load_weight_prefix = True + + def __init__(self, config: BartConfig, load_weight_prefix=None, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.model = TFBartMainLayer(config, load_weight_prefix=load_weight_prefix, name="model") + + def get_encoder(self): + return self.model.encoder + + def get_decoder(self): + return self.model.decoder + + @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFSeq2SeqModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + @unpack_inputs + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + decoder_input_ids: np.ndarray | tf.Tensor | None = None, + decoder_attention_mask: np.ndarray | tf.Tensor | None = None, + decoder_position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + decoder_head_mask: np.ndarray | tf.Tensor | None = None, + cross_attn_head_mask: np.ndarray | tf.Tensor | None = None, + encoder_outputs: tuple | TFBaseModelOutput | None = None, + past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + decoder_inputs_embeds: np.ndarray | tf.Tensor | None = None, + use_cache: bool | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool | None = False, + **kwargs, + ) -> TFBaseModelOutput | tuple[tf.Tensor]: + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + decoder_position_ids=decoder_position_ids, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + encoder_outputs=encoder_outputs, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + return outputs + + def serving_output(self, output): + pkv = tf.tuple(output.past_key_values)[1] if self.config.use_cache else None + dec_hs = tf.convert_to_tensor(output.decoder_hidden_states) if self.config.output_hidden_states else None + dec_attns = 
tf.convert_to_tensor(output.decoder_attentions) if self.config.output_attentions else None + cross_attns = tf.convert_to_tensor(output.cross_attentions) if self.config.output_attentions else None + enc_hs = tf.convert_to_tensor(output.encoder_hidden_states) if self.config.output_hidden_states else None + enc_attns = tf.convert_to_tensor(output.encoder_attentions) if self.config.output_attentions else None + + return TFSeq2SeqModelOutput( + last_hidden_state=output.last_hidden_state, + past_key_values=pkv, + decoder_hidden_states=dec_hs, + decoder_attentions=dec_attns, + cross_attentions=cross_attns, + encoder_last_hidden_state=output.encoder_last_hidden_state, + encoder_hidden_states=enc_hs, + encoder_attentions=enc_attns, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "model", None) is not None: + with tf.name_scope(self.model.name): + self.model.build(None) + + +class BiasLayer(keras.layers.Layer): + """ + Bias as a layer. It is used for serialization purposes: `keras.Model.save_weights` stores on a per-layer basis, + so all weights have to be registered in a layer. + """ + + def __init__(self, shape, initializer, trainable, name, **kwargs): + super().__init__(name=name, **kwargs) + # Note: the name of this variable will NOT be scoped when serialized, i.e. it will not be in the format of + # "outer_layer/inner_layer/.../name:0". Instead, it will be "name:0". For further details, see: + # https://github.com/huggingface/transformers/pull/18833#issuecomment-1233090214 + self.bias = self.add_weight(name=name, shape=shape, initializer=initializer, trainable=trainable) + + def call(self, x): + return x + self.bias + + +@add_start_docstrings( + "The BART Model with a language modeling head. Can be used for summarization.", + BART_START_DOCSTRING, +) +class TFBartForConditionalGeneration(TFBartPretrainedModel, TFCausalLanguageModelingLoss): + _keys_to_ignore_on_load_missing = [r"final_logits_bias"] + _requires_load_weight_prefix = True + + def __init__(self, config, load_weight_prefix=None, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.model = TFBartMainLayer(config, load_weight_prefix=load_weight_prefix, name="model") + self.use_cache = config.use_cache + # final_bias_logits is registered as a buffer in pytorch, so not trainable for the sake of consistency. + self.bias_layer = BiasLayer( + name="final_logits_bias", shape=[1, config.vocab_size], initializer="zeros", trainable=False + ) + + def get_decoder(self): + return self.model.decoder + + def get_encoder(self): + return self.model.encoder + + def get_output_embeddings(self): + return self.get_input_embeddings() + + def set_output_embeddings(self, value): + self.set_input_embeddings(value) + + def get_bias(self): + return {"final_logits_bias": self.bias_layer.bias} + + def set_bias(self, value): + # Replaces the existing layers containing bias for correct (de)serialization. 
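# Editorial note on the re-creation below: `keras.Model.save_weights` serializes weights per layer
# (see the BiasLayer docstring above), and the incoming bias may have a different vocabulary size
# than the current one, so a fresh BiasLayer is built with the shape read from `value` and the
# provided values are then copied in with `assign`, rather than mutating the old variable in place.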
+ vocab_size = value["final_logits_bias"].shape[-1] + self.bias_layer = BiasLayer( + name="final_logits_bias", shape=[1, vocab_size], initializer="zeros", trainable=False + ) + self.bias_layer.bias.assign(value["final_logits_bias"]) + + @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=TFSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) + @add_end_docstrings(BART_GENERATION_EXAMPLE) + @unpack_inputs + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + decoder_input_ids: np.ndarray | tf.Tensor | None = None, + decoder_attention_mask: np.ndarray | tf.Tensor | None = None, + decoder_position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + decoder_head_mask: np.ndarray | tf.Tensor | None = None, + cross_attn_head_mask: np.ndarray | tf.Tensor | None = None, + encoder_outputs: TFBaseModelOutput | None = None, + past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + decoder_inputs_embeds: np.ndarray | tf.Tensor | None = None, + use_cache: bool | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + labels: tf.Tensor | None = None, + training: bool | None = False, + ) -> TFSeq2SeqLMOutput | tuple[tf.Tensor]: + r""" + labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. 
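            A minimal training-style sketch (editorial addition; the checkpoint and sentences are example
            values only, not taken from the original docstring):

            ```python
            >>> from transformers import AutoTokenizer, TFBartForConditionalGeneration

            >>> tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large")
            >>> model = TFBartForConditionalGeneration.from_pretrained("facebook/bart-large")

            >>> inputs = tokenizer(["My friends are cool but they eat too many carbs."], return_tensors="tf")
            >>> targets = tokenizer(["My friends eat too many carbs."], return_tensors="tf")

            >>> # pad tokens in `labels` are mapped to -100 internally, so they do not contribute to the loss
            >>> outputs = model(inputs["input_ids"], labels=targets["input_ids"])
            >>> loss, logits = outputs.loss, outputs.logits
            ```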
+ + Returns: + + """ + + if labels is not None: + labels = tf.where( + labels == self.config.pad_token_id, + tf.cast(tf.fill(shape_list(labels), -100), labels.dtype), + labels, + ) + use_cache = False + if decoder_input_ids is None and decoder_inputs_embeds is None: + decoder_input_ids = shift_tokens_right( + labels, self.config.pad_token_id, self.config.decoder_start_token_id + ) + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + encoder_outputs=encoder_outputs, + decoder_attention_mask=decoder_attention_mask, + decoder_position_ids=decoder_position_ids, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + lm_logits = tf.matmul(outputs[0], self.model.shared.weights, transpose_b=True) + lm_logits = self.bias_layer(lm_logits) + masked_lm_loss = None if labels is None else self.hf_compute_loss(labels, lm_logits) + + if not return_dict: + output = (lm_logits,) + outputs[1:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + return TFSeq2SeqLMOutput( + loss=masked_lm_loss, + logits=lm_logits, + past_key_values=outputs.past_key_values, # index 1 of d outputs + decoder_hidden_states=outputs.decoder_hidden_states, # index 2 of d outputs + decoder_attentions=outputs.decoder_attentions, # index 3 of d outputs + cross_attentions=outputs.cross_attentions, # index 4 of d outputs + encoder_last_hidden_state=outputs.encoder_last_hidden_state, # index 0 of encoder outputs + encoder_hidden_states=outputs.encoder_hidden_states, # 1 of e out + encoder_attentions=outputs.encoder_attentions, # 2 of e out + ) + + def serving_output(self, output): + pkv = tf.tuple(output.past_key_values)[1] if self.config.use_cache else None + dec_hs = tf.convert_to_tensor(output.decoder_hidden_states) if self.config.output_hidden_states else None + dec_attns = tf.convert_to_tensor(output.decoder_attentions) if self.config.output_attentions else None + cross_attns = tf.convert_to_tensor(output.cross_attentions) if self.config.output_attentions else None + enc_hs = tf.convert_to_tensor(output.encoder_hidden_states) if self.config.output_hidden_states else None + enc_attns = tf.convert_to_tensor(output.encoder_attentions) if self.config.output_attentions else None + + return TFSeq2SeqLMOutput( + logits=output.logits, + past_key_values=pkv, + decoder_hidden_states=dec_hs, + decoder_attentions=dec_attns, + cross_attentions=cross_attns, + encoder_last_hidden_state=output.encoder_last_hidden_state, + encoder_hidden_states=enc_hs, + encoder_attentions=enc_attns, + ) + + def prepare_inputs_for_generation( + self, + decoder_input_ids, + past_key_values=None, + attention_mask=None, + decoder_attention_mask=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, + use_cache=None, + encoder_outputs=None, + **kwargs, + ): + # cut decoder_input_ids if past_key_values is used + if past_key_values is not None: + decoder_input_ids = decoder_input_ids[:, -1:] + + if decoder_attention_mask is not None: # xla + decoder_position_ids = tf.math.cumsum(decoder_attention_mask, axis=-1, exclusive=True)[:, -1:] + elif past_key_values is not None: # no xla + past_key_values + decoder_position_ids = 
past_key_values[0][0].shape[2] + else: # no xla + no past_key_values + decoder_position_ids = tf.range(decoder_input_ids.shape[1]) + + return { + "input_ids": None, # encoder_outputs is defined. input_ids not needed + "encoder_outputs": encoder_outputs, + "past_key_values": past_key_values, + "decoder_input_ids": decoder_input_ids, + "attention_mask": attention_mask, + "decoder_attention_mask": decoder_attention_mask, + "decoder_position_ids": decoder_position_ids, + "head_mask": head_mask, + "decoder_head_mask": decoder_head_mask, + "cross_attn_head_mask": cross_attn_head_mask, + "use_cache": use_cache, # change this to avoid caching (presumably for debugging) + } + + def prepare_decoder_input_ids_from_labels(self, labels: tf.Tensor): + return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "model", None) is not None: + with tf.name_scope(self.model.name): + self.model.build(None) + if getattr(self, "bias_layer", None) is not None: + with tf.name_scope(self.bias_layer.name): + self.bias_layer.build(None) + + +@add_start_docstrings( + """ + Bart model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE + tasks. + """, + BART_START_DOCSTRING, +) +class TFBartForSequenceClassification(TFBartPretrainedModel, TFSequenceClassificationLoss): + def __init__(self, config: BartConfig, load_weight_prefix=None, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.model = TFBartMainLayer(config, load_weight_prefix=load_weight_prefix, name="model") + self.classification_head = TFBartClassificationHead( + config.d_model, config.num_labels, config.classifier_dropout, name="classification_head" + ) + + @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=TFSeq2SeqSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC) + @unpack_inputs + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + decoder_input_ids: np.ndarray | tf.Tensor | None = None, + decoder_attention_mask: np.ndarray | tf.Tensor | None = None, + decoder_position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + decoder_head_mask: np.ndarray | tf.Tensor | None = None, + cross_attn_head_mask: np.ndarray | tf.Tensor | None = None, + encoder_outputs: TFBaseModelOutput | None = None, + past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + decoder_inputs_embeds: np.ndarray | tf.Tensor | None = None, + use_cache: bool | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + labels: tf.Tensor | None = None, + training: bool | None = False, + ) -> TFSeq2SeqSequenceClassifierOutput | tuple[tf.Tensor]: + r""" + labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
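As an aside, the three `decoder_position_ids` branches in `prepare_inputs_for_generation` above can be traced with toy values; this is only a sketch with made-up shapes:

```python
import tensorflow as tf

# XLA branch: the next position equals the number of previously attended decoder tokens
decoder_attention_mask = tf.constant([[1, 1, 1, 1]])
xla_position_ids = tf.math.cumsum(decoder_attention_mask, axis=-1, exclusive=True)[:, -1:]  # [[3]]

# eager branch with a cache: the past length is used as the next position directly
past_length = 3  # stands in for past_key_values[0][0].shape[2]

# no cache at all: positions are simply 0 .. seq_len - 1
decoder_input_ids = tf.constant([[2, 8, 15, 16]])
fresh_position_ids = tf.range(decoder_input_ids.shape[1])  # [0, 1, 2, 3]
```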
+ + Returns: + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + if input_ids is None and inputs_embeds is not None: + raise NotImplementedError( + f"Passing input embeddings is currently not supported for {self.__class__.__name__}" + ) + + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + decoder_position_ids=decoder_position_ids, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + encoder_outputs=encoder_outputs, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + last_hidden_state = outputs[0] + eos_mask = tf.equal(input_ids, self.config.eos_token_id) + # out the rows with False where present. Then verify all the final + # entries are True + self_masked = tf.reshape(tf.boolean_mask(eos_mask, eos_mask), (tf.shape(input_ids)[0], -1)) + tf.Assert(tf.reduce_all(self_masked[:, -1]), ["All examples must have the same number of tokens."]) + + masked = tf.reshape( + tf.boolean_mask(last_hidden_state, eos_mask), + (tf.shape(input_ids)[0], tf.shape(self_masked)[1], tf.shape(last_hidden_state)[-1]), + ) + + sentence_representation = masked[:, -1, :] + logits = self.classification_head(sentence_representation) + loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return TFSeq2SeqSequenceClassifierOutput( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + def serving_output(self, output): + logits = tf.convert_to_tensor(output.logits) + pkv = tf.tuple(output.past_key_values)[1] if self.config.use_cache else None + dec_hs = tf.convert_to_tensor(output.decoder_hidden_states) if self.config.output_hidden_states else None + dec_attns = tf.convert_to_tensor(output.decoder_attentions) if self.config.output_attentions else None + cross_attns = tf.convert_to_tensor(output.cross_attentions) if self.config.output_attentions else None + enc_hs = tf.convert_to_tensor(output.encoder_hidden_states) if self.config.output_hidden_states else None + enc_attns = tf.convert_to_tensor(output.encoder_attentions) if self.config.output_attentions else None + + return TFSeq2SeqSequenceClassifierOutput( + logits=logits, + past_key_values=pkv, + decoder_hidden_states=dec_hs, + decoder_attentions=dec_attns, + cross_attentions=cross_attns, + encoder_last_hidden_state=output.encoder_last_hidden_state, + encoder_hidden_states=enc_hs, + encoder_attentions=enc_attns, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "model", None) is not None: + with tf.name_scope(self.model.name): + self.model.build(None) + if getattr(self, "classification_head", None) is not None: + with 
tf.name_scope(self.classification_head.name): + self.classification_head.build(None) + + +__all__ = ["TFBartForConditionalGeneration", "TFBartForSequenceClassification", "TFBartModel", "TFBartPretrainedModel"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bart/tokenization_bart.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bart/tokenization_bart.py new file mode 100644 index 0000000000000000000000000000000000000000..f674afe1a4126781ef64f8b5e96436b0025490c2 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bart/tokenization_bart.py @@ -0,0 +1,393 @@ +# coding=utf-8 +# Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os +from functools import lru_cache +from typing import Optional + +import regex as re + +from ...tokenization_utils import AddedToken, PreTrainedTokenizer +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt"} + +# See all BART models at https://huggingface.co/models?filter=bart + + +@lru_cache +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control + characters the bpe code barfs on. + + The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab + if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for + decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup + tables between utf-8 bytes and unicode strings. + """ + bs = ( + list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) + ) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + +def get_pairs(word): + """ + Return set of symbol pairs in a word. + + Word is represented as tuple of symbols (symbols being variable-length strings). + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +class BartTokenizer(PreTrainedTokenizer): + """ + Constructs a BART tokenizer, which is smilar to the ROBERTa tokenizer, using byte-level Byte-Pair-Encoding. 
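A quick illustrative check of the two helpers defined above (not part of the module, just run against the functions as written):

```python
btu = bytes_to_unicode()
assert btu[ord("A")] == "A"   # printable bytes map to themselves
assert btu[ord(" ")] == "Ġ"   # non-printable bytes are shifted into the printable range (32 -> chr(256 + 32))
assert get_pairs(("h", "e", "l", "l", "o")) == {("h", "e"), ("e", "l"), ("l", "l"), ("l", "o")}
```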
+ + This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will + be encoded differently whether it is at the beginning of the sentence (without space) or not: + + ```python + >>> from transformers import BartTokenizer + + >>> tokenizer = BartTokenizer.from_pretrained("facebook/bart-base") + >>> tokenizer("Hello world")["input_ids"] + [0, 31414, 232, 2] + + >>> tokenizer(" Hello world")["input_ids"] + [0, 20920, 232, 2] + ``` + + You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you + call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance. + + + + When used with `is_split_into_words=True`, this tokenizer will add a space before each word (even the first one). + + + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + Path to the vocabulary file. + merges_file (`str`): + Path to the merges file. + errors (`str`, *optional*, defaults to `"replace"`): + Paradigm to follow when decoding bytes to UTF-8. See + [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information. + bos_token (`str`, *optional*, defaults to `""`): + The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. + + + + When building a sequence using special tokens, this is not the token that is used for the beginning of + sequence. The token used is the `cls_token`. + + + + eos_token (`str`, *optional*, defaults to `""`): + The end of sequence token. + + + + When building a sequence using special tokens, this is not the token that is used for the end of sequence. + The token used is the `sep_token`. + + + + sep_token (`str`, *optional*, defaults to `""`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + cls_token (`str`, *optional*, defaults to `""`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (`str`, *optional*, defaults to `""`): + The token used for padding, for example when batching sequences of different lengths. + mask_token (`str`, *optional*, defaults to `""`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + add_prefix_space (`bool`, *optional*, defaults to `False`): + Whether or not to add an initial space to the input. This allows to treat the leading word just as any + other word. (BART tokenizer detect beginning of words by the preceding space). 
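    Continuing the example above, instantiating with `add_prefix_space=True` makes `"Hello world"` encode as if it
    started with a space; the IDs below are inferred from the ` Hello world` example earlier in this docstring rather
    than re-verified:

    ```python
    >>> tokenizer = BartTokenizer.from_pretrained("facebook/bart-base", add_prefix_space=True)
    >>> tokenizer("Hello world")["input_ids"]
    [0, 20920, 232, 2]
    ```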
+ """ + + vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "attention_mask"] + + def __init__( + self, + vocab_file, + merges_file, + errors="replace", + bos_token="", + eos_token="", + sep_token="", + cls_token="", + unk_token="", + pad_token="", + mask_token="", + add_prefix_space=False, + **kwargs, + ): + bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token + eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token + sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token + cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token + unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token + pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token + + # Mask token behave like a normal word, i.e. include the space before it + mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + + with open(vocab_file, encoding="utf-8") as vocab_handle: + self.encoder = json.load(vocab_handle) + self.decoder = {v: k for k, v in self.encoder.items()} + self.errors = errors # how to handle errors in decoding + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} + with open(merges_file, encoding="utf-8") as merges_handle: + bpe_merges = merges_handle.read().split("\n")[1:-1] + bpe_merges = [tuple(merge.split()) for merge in bpe_merges] + self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) + self.cache = {} + self.add_prefix_space = add_prefix_space + + # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions + self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") + + super().__init__( + errors=errors, + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + mask_token=mask_token, + add_prefix_space=add_prefix_space, + **kwargs, + ) + + @property + def vocab_size(self): + return len(self.encoder) + + def get_vocab(self): + return dict(self.encoder, **self.added_tokens_encoder) + + def bpe(self, token): + if token in self.cache: + return self.cache[token] + word = tuple(token) + pairs = get_pairs(word) + + if not pairs: + return token + + while True: + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + except ValueError: + new_word.extend(word[i:]) + break + else: + new_word.extend(word[i:j]) + i = j + + if word[i] == first and i < len(word) - 1 and word[i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = " ".join(word) + self.cache[token] = word + return word + + def _tokenize(self, text): + """Tokenize a string.""" + bpe_tokens = [] + for token in re.findall(self.pat, text): + token = "".join( + self.byte_encoder[b] for b in token.encode("utf-8") + ) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case) + 
bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" ")) + return bpe_tokens + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.encoder.get(token, self.encoder.get(self.unk_token)) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.decoder.get(index) + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + text = "".join(tokens) + text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors) + return text + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + merge_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"] + ) + + with open(vocab_file, "w", encoding="utf-8") as f: + f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n") + + index = 0 + with open(merge_file, "w", encoding="utf-8") as writer: + writer.write("#version: 0.2\n") + for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning( + f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive." + " Please check that the tokenizer is not corrupted!" + ) + index = token_index + writer.write(" ".join(bpe_tokens) + "\n") + index += 1 + + return vocab_file, merge_file + + def build_inputs_with_special_tokens( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BART sequence has the following format: + + - single sequence: ` X ` + - pair of sequences: ` A B ` + + Args: + token_ids_0 (`list[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`list[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + sep + token_ids_1 + sep + + def get_special_tokens_mask( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False + ) -> list[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`list[int]`): + List of IDs. + token_ids_1 (`list[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. 
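The single-sequence and pair formats described above translate into ids as follows; a sketch assuming a `tokenizer` instance whose `cls_token_id` is 0 and `sep_token_id` is 2 (the usual BART values, treated as hypothetical here):

```python
tokenizer.build_inputs_with_special_tokens([100, 101])         # -> [0, 100, 101, 2]
tokenizer.build_inputs_with_special_tokens([100, 101], [200])  # -> [0, 100, 101, 2, 2, 200, 2]
```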
+ """ + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] + + def create_token_type_ids_from_sequences( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. BART does not + make use of token type ids, therefore a list of zeros is returned. + + Args: + token_ids_0 (`list[int]`): + List of IDs. + token_ids_1 (`list[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `list[int]`: List of zeros. + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] + + def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs): + add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space) + if (is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()): + text = " " + text + return (text, kwargs) + + +__all__ = ["BartTokenizer"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bart/tokenization_bart_fast.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bart/tokenization_bart_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..88b002f595299f8d16b605c0ce4fd59330e5c4c4 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bart/tokenization_bart_fast.py @@ -0,0 +1,271 @@ +# coding=utf-8 +# Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +from typing import Optional + +from tokenizers import processors + +from ...tokenization_utils_base import AddedToken, BatchEncoding +from ...tokenization_utils_fast import PreTrainedTokenizerFast +from ...utils import logging +from .tokenization_bart import BartTokenizer + + +logger = logging.get_logger(__name__) + + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"} + +# See all BART models at https://huggingface.co/models?filter=bart + + +class BartTokenizerFast(PreTrainedTokenizerFast): + r""" + Construct a "fast" BART tokenizer (backed by HuggingFace's *tokenizers* library), derived from the GPT-2 tokenizer, + using byte-level Byte-Pair-Encoding. 
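For reference, the whitespace handling implemented in `prepare_for_tokenization` above only prepends a space when asked to and when one is not already present (hypothetical `tokenizer` instance):

```python
tokenizer.prepare_for_tokenization("Hello world", is_split_into_words=True)   # -> (" Hello world", {})
tokenizer.prepare_for_tokenization(" Hello world", is_split_into_words=True)  # -> (" Hello world", {})
```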
+ + This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will + be encoded differently whether it is at the beginning of the sentence (without space) or not: + + ```python + >>> from transformers import BartTokenizerFast + + >>> tokenizer = BartTokenizerFast.from_pretrained("facebook/bart-base") + >>> tokenizer("Hello world")["input_ids"] + [0, 31414, 232, 2] + + >>> tokenizer(" Hello world")["input_ids"] + [0, 20920, 232, 2] + ``` + + You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you + call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance. + + + + When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`. + + + + This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should + refer to this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + Path to the vocabulary file. + merges_file (`str`): + Path to the merges file. + errors (`str`, *optional*, defaults to `"replace"`): + Paradigm to follow when decoding bytes to UTF-8. See + [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information. + bos_token (`str`, *optional*, defaults to `""`): + The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. + + + + When building a sequence using special tokens, this is not the token that is used for the beginning of + sequence. The token used is the `cls_token`. + + + + eos_token (`str`, *optional*, defaults to `""`): + The end of sequence token. + + + + When building a sequence using special tokens, this is not the token that is used for the end of sequence. + The token used is the `sep_token`. + + + + sep_token (`str`, *optional*, defaults to `""`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + cls_token (`str`, *optional*, defaults to `""`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (`str`, *optional*, defaults to `""`): + The token used for padding, for example when batching sequences of different lengths. + mask_token (`str`, *optional*, defaults to `""`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + add_prefix_space (`bool`, *optional*, defaults to `False`): + Whether or not to add an initial space to the input. This allows to treat the leading word just as any + other word. (BART tokenizer detect beginning of words by the preceding space). + trim_offsets (`bool`, *optional*, defaults to `True`): + Whether the post processing step should trim offsets to avoid including whitespaces. 
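    As the note above says, the fast tokenizer only accepts pretokenized input when it was created with
    `add_prefix_space=True`; a small sketch (the checkpoint name is taken from the example earlier in this docstring):

    ```python
    from transformers import BartTokenizerFast

    tok = BartTokenizerFast.from_pretrained("facebook/bart-base", add_prefix_space=True)
    enc = tok(["Hello", "world"], is_split_into_words=True)
    # with add_prefix_space left at its default of False, the same call raises a ValueError
    # (see _batch_encode_plus / _encode_plus below)
    ```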
+ """ + + vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "attention_mask"] + slow_tokenizer_class = BartTokenizer + + def __init__( + self, + vocab_file=None, + merges_file=None, + tokenizer_file=None, + errors="replace", + bos_token="", + eos_token="", + sep_token="", + cls_token="", + unk_token="", + pad_token="", + mask_token="", + add_prefix_space=False, + trim_offsets=True, + **kwargs, + ): + # we have to specify that this tokens is special otherwise adding it will reset the normalized flag to `False` in `add_special_tokens` + mask_token = ( + AddedToken(mask_token, lstrip=True, normalized=True, special=True) + if isinstance(mask_token, str) + else mask_token + ) + super().__init__( + vocab_file, + merges_file, + tokenizer_file=tokenizer_file, + errors=errors, + bos_token=bos_token, + eos_token=eos_token, + sep_token=sep_token, + cls_token=cls_token, + unk_token=unk_token, + pad_token=pad_token, + mask_token=mask_token, + add_prefix_space=add_prefix_space, + trim_offsets=trim_offsets, + **kwargs, + ) + + # the pre_tokenizer is already updated in the GPT2TokenizerFast `__init__` + tokenizer_component = "post_processor" + tokenizer_component_instance = getattr(self.backend_tokenizer, tokenizer_component, None) + if tokenizer_component_instance: + state = json.loads(tokenizer_component_instance.__getstate__()) + + # The lists 'sep' and 'cls' must be cased in tuples for the object `post_processor_class` + if "sep" in state: + state["sep"] = tuple(state["sep"]) + if "cls" in state: + state["cls"] = tuple(state["cls"]) + + changes_to_apply = False + + if state.get("add_prefix_space", add_prefix_space) != add_prefix_space: + state["add_prefix_space"] = add_prefix_space + changes_to_apply = True + + if state.get("trim_offsets", trim_offsets) != trim_offsets: + state["trim_offsets"] = trim_offsets + changes_to_apply = True + + if changes_to_apply: + component_class = getattr(processors, state.pop("type")) + new_value = component_class(**state) + setattr(self.backend_tokenizer, tokenizer_component, new_value) + + @property + def mask_token(self) -> str: + """ + `str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while not + having been set. + + BART tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will greedily + comprise the space before the **. + """ + if self._mask_token is None: + if self.verbose: + logger.error("Using mask_token, but it is not set yet.") + return None + return str(self._mask_token) + + @mask_token.setter + def mask_token(self, value): + """ + Overriding the default behavior of the mask token to have it eat the space before it. + + This is needed to preserve backward compatibility with all the previously used models based on Bart. + """ + # Mask token behave like a normal word, i.e. include the space before it + # So we set lstrip to True + value = AddedToken(value, lstrip=True, rstrip=False) if isinstance(value, str) else value + self._mask_token = value + + def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding: + is_split_into_words = kwargs.get("is_split_into_words", False) + + if is_split_into_words and not self.add_prefix_space: + raise ValueError( + f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True " + "to use it with pretokenized inputs." 
+ ) + + return super()._batch_encode_plus(*args, **kwargs) + + def _encode_plus(self, *args, **kwargs) -> BatchEncoding: + is_split_into_words = kwargs.get("is_split_into_words", False) + + if is_split_into_words and not self.add_prefix_space: + raise ValueError( + f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True " + "to use it with pretokenized inputs." + ) + + return super()._encode_plus(*args, **kwargs) + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + files = self._tokenizer.model.save(save_directory, name=filename_prefix) + return tuple(files) + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id] + if token_ids_1 is None: + return output + + return output + [self.eos_token_id] + token_ids_1 + [self.eos_token_id] + + def create_token_type_ids_from_sequences( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. BART does not + make use of token type ids, therefore a list of zeros is returned. + + Args: + token_ids_0 (`list[int]`): + List of IDs. + token_ids_1 (`list[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `list[int]`: List of zeros. + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] + + +__all__ = ["BartTokenizerFast"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/barthez/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/barthez/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..323fe2fe8af9823d4478957b2f94b078ec39b7f3 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/barthez/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .tokenization_barthez import * + from .tokenization_barthez_fast import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/barthez/tokenization_barthez.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/barthez/tokenization_barthez.py new file mode 100644 index 0000000000000000000000000000000000000000..bc583e0cd5dc455fbd91841de386e6bf54ad02b9 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/barthez/tokenization_barthez.py @@ -0,0 +1,291 @@ +# coding=utf-8 +# Copyright 2020 Ecole Polytechnique and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License +"""Tokenization classes for the BARThez model.""" + +import os +from shutil import copyfile +from typing import Any, Optional + +import sentencepiece as spm + +from ...tokenization_utils import AddedToken, PreTrainedTokenizer +from ...utils import logging +from ...utils.import_utils import requires + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"} + + +SPIECE_UNDERLINE = "▁" + +# TODO this class is useless. This is the most standard sentencpiece model. Let's find which one is closest and nuke this. + + +@requires(backends=("sentencepiece",)) +class BarthezTokenizer(PreTrainedTokenizer): + """ + Adapted from [`CamembertTokenizer`] and [`BartTokenizer`]. Construct a BARThez tokenizer. Based on + [SentencePiece](https://github.com/google/sentencepiece). + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that + contains the vocabulary necessary to instantiate a tokenizer. + bos_token (`str`, *optional*, defaults to `""`): + The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. + + + + When building a sequence using special tokens, this is not the token that is used for the beginning of + sequence. The token used is the `cls_token`. + + + + eos_token (`str`, *optional*, defaults to `""`): + The end of sequence token. + + + + When building a sequence using special tokens, this is not the token that is used for the end of sequence. + The token used is the `sep_token`. + + + + sep_token (`str`, *optional*, defaults to `""`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. 
It is also used as the last + token of a sequence built with special tokens. + cls_token (`str`, *optional*, defaults to `""`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (`str`, *optional*, defaults to `""`): + The token used for padding, for example when batching sequences of different lengths. + mask_token (`str`, *optional*, defaults to `""`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + sp_model_kwargs (`dict`, *optional*): + Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for + SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, + to set: + + - `enable_sampling`: Enable subword regularization. + - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - `nbest_size = {0,1}`: No sampling is performed. + - `nbest_size > 1`: samples from the nbest_size results. + - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. + + Attributes: + sp_model (`SentencePieceProcessor`): + The *SentencePiece* processor that is used for every conversion (string, tokens and IDs). + """ + + vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "attention_mask"] + + def __init__( + self, + vocab_file, + bos_token="", + eos_token="", + sep_token="", + cls_token="", + unk_token="", + pad_token="", + mask_token="", + sp_model_kwargs: Optional[dict[str, Any]] = None, + **kwargs, + ) -> None: + # Mask token behave like a normal word, i.e. include the space before it. Will have normalized=False by default this way + mask_token = AddedToken(mask_token, lstrip=True, special=True) if isinstance(mask_token, str) else mask_token + + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + + self.vocab_file = vocab_file + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(str(vocab_file)) + super().__init__( + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + mask_token=mask_token, + sp_model_kwargs=self.sp_model_kwargs, + **kwargs, + ) + + def build_inputs_with_special_tokens( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BARThez sequence has the following format: + + - single sequence: ` X ` + - pair of sequences: ` A B ` + + Args: + token_ids_0 (`list[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`list[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. 
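    The `sp_model_kwargs` options documented above are forwarded verbatim to `SentencePieceProcessor`; a sketch of
    enabling subword regularization (the file path and sampling values are illustrative only):

    ```python
    tok = BarthezTokenizer(
        "sentencepiece.bpe.model",  # hypothetical local path to the SentencePiece model
        sp_model_kwargs={"enable_sampling": True, "nbest_size": -1, "alpha": 0.1},
    )
    ```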
+ """ + + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + sep + token_ids_1 + sep + + def get_special_tokens_mask( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False + ) -> list[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`list[int]`): + List of IDs. + token_ids_1 (`list[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] + + def create_token_type_ids_from_sequences( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. + + Args: + token_ids_0 (`list[int]`): + List of IDs. + token_ids_1 (`list[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `list[int]`: List of zeros. + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] + + @property + def vocab_size(self): + return len(self.sp_model) + + def get_vocab(self): + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text: str) -> list[str]: + return self.sp_model.encode(text, out_type=str) + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.sp_model.PieceToId(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.sp_model.IdToPiece(index) + + # Copied from transformers.models.albert.tokenization_albert.AlbertTokenizer.convert_tokens_to_string + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + current_sub_tokens = [] + out_string = "" + prev_is_special = False + for token in tokens: + # make sure that special tokens are not decoded using sentencepiece model + if token in self.all_special_tokens: + if not prev_is_special: + out_string += " " + out_string += self.sp_model.decode(current_sub_tokens) + token + prev_is_special = True + current_sub_tokens = [] + else: + current_sub_tokens.append(token) + prev_is_special = False + out_string += self.sp_model.decode(current_sub_tokens) + return out_string.strip() + + def __getstate__(self): + state = self.__dict__.copy() + state["sp_model"] = None + return state + + def __setstate__(self, d): + self.__dict__ = d + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + 
+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(self.vocab_file) + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file): + copyfile(self.vocab_file, out_vocab_file) + elif not os.path.isfile(self.vocab_file): + with open(out_vocab_file, "wb") as fi: + content_spiece_model = self.sp_model.serialized_model_proto() + fi.write(content_spiece_model) + + return (out_vocab_file,) + + +__all__ = ["BarthezTokenizer"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/barthez/tokenization_barthez_fast.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/barthez/tokenization_barthez_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..64050ca8848f57c25d272e0bc31a3f878040e14c --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/barthez/tokenization_barthez_fast.py @@ -0,0 +1,193 @@ +# coding=utf-8 +# Copyright 2020 Ecole Polytechnique and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License +"""Tokenization classes for the BARThez model.""" + +import os +from shutil import copyfile +from typing import Optional + +from ...tokenization_utils import AddedToken +from ...tokenization_utils_fast import PreTrainedTokenizerFast +from ...utils import is_sentencepiece_available, logging + + +if is_sentencepiece_available(): + from .tokenization_barthez import BarthezTokenizer +else: + BarthezTokenizer = None + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"} + + +SPIECE_UNDERLINE = "▁" + + +class BarthezTokenizerFast(PreTrainedTokenizerFast): + """ + Adapted from [`CamembertTokenizer`] and [`BartTokenizer`]. Construct a "fast" BARThez tokenizer. Based on + [SentencePiece](https://github.com/google/sentencepiece). + + This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should + refer to this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that + contains the vocabulary necessary to instantiate a tokenizer. + bos_token (`str`, *optional*, defaults to `""`): + The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. + + + + When building a sequence using special tokens, this is not the token that is used for the beginning of + sequence. The token used is the `cls_token`. 
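For completeness, `save_vocabulary` above either copies the original `.spm` file or re-serialises the in-memory model; a sketch with an illustrative, already-existing directory:

```python
paths = tok.save_vocabulary("./barthez-tokenizer")  # hypothetical BarthezTokenizer instance `tok`
# -> ("./barthez-tokenizer/sentencepiece.bpe.model",)
```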
+ + + + eos_token (`str`, *optional*, defaults to `""`): + The end of sequence token. + + + + When building a sequence using special tokens, this is not the token that is used for the end of sequence. + The token used is the `sep_token`. + + + + sep_token (`str`, *optional*, defaults to `""`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + cls_token (`str`, *optional*, defaults to `""`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (`str`, *optional*, defaults to `""`): + The token used for padding, for example when batching sequences of different lengths. + mask_token (`str`, *optional*, defaults to `""`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + additional_special_tokens (`list[str]`, *optional*, defaults to `["NOTUSED", "NOTUSED"]`): + Additional special tokens used by the tokenizer. + """ + + vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "attention_mask"] + slow_tokenizer_class = BarthezTokenizer + + def __init__( + self, + vocab_file=None, + tokenizer_file=None, + bos_token="", + eos_token="", + sep_token="", + cls_token="", + unk_token="", + pad_token="", + mask_token="", + **kwargs, + ): + # Mask token behave like a normal word, i.e. include the space before it + mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + + super().__init__( + vocab_file, + tokenizer_file=tokenizer_file, + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + mask_token=mask_token, + **kwargs, + ) + + self.vocab_file = vocab_file + + def build_inputs_with_special_tokens( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BARThez sequence has the following format: + + - single sequence: ` X ` + - pair of sequences: ` A B ` + + Args: + token_ids_0 (`list[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`list[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + sep + token_ids_1 + sep + + def create_token_type_ids_from_sequences( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. + + Args: + token_ids_0 (`list[int]`): + List of IDs. 
+ token_ids_1 (`list[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `list[int]`: List of zeros. + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + if not self.can_save_slow_tokenizer: + raise ValueError( + "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow " + "tokenizer." + ) + + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): + copyfile(self.vocab_file, out_vocab_file) + + return (out_vocab_file,) + + +__all__ = ["BarthezTokenizerFast"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bartpho/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bartpho/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..597be95d8175cac9d48edf3eb4d42a2a91b95833 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bartpho/__init__.py @@ -0,0 +1,26 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .tokenization_bartpho import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bartpho/tokenization_bartpho.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bartpho/tokenization_bartpho.py new file mode 100644 index 0000000000000000000000000000000000000000..41a122bf913c54526cdd10c14fbbea5896da2d00 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bartpho/tokenization_bartpho.py @@ -0,0 +1,318 @@ +# coding=utf-8 +# Copyright 2021 VinAI Research and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License +"""Tokenization classes for BARTpho-syllable model.""" + +import os +from shutil import copyfile +from typing import Any, Optional + +import sentencepiece as spm + +from ...tokenization_utils import AddedToken, PreTrainedTokenizer +from ...utils import logging +from ...utils.import_utils import requires + + +logger = logging.get_logger(__name__) + +SPIECE_UNDERLINE = "▁" + +VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "monolingual_vocab_file": "dict.txt"} + + +@requires(backends=("sentencepiece",)) +class BartphoTokenizer(PreTrainedTokenizer): + """ + Adapted from [`XLMRobertaTokenizer`]. Based on [SentencePiece](https://github.com/google/sentencepiece). + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + Path to the vocabulary file. This vocabulary is the pre-trained SentencePiece model available from the + multilingual XLM-RoBERTa, also used in mBART, consisting of 250K types. + monolingual_vocab_file (`str`): + Path to the monolingual vocabulary file. This monolingual vocabulary consists of Vietnamese-specialized + types extracted from the multilingual vocabulary vocab_file of 250K types. + bos_token (`str`, *optional*, defaults to `""`): + The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. + + + + When building a sequence using special tokens, this is not the token that is used for the beginning of + sequence. The token used is the `cls_token`. + + + + eos_token (`str`, *optional*, defaults to `""`): + The end of sequence token. + + + + When building a sequence using special tokens, this is not the token that is used for the end of sequence. + The token used is the `sep_token`. + + + + sep_token (`str`, *optional*, defaults to `""`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + cls_token (`str`, *optional*, defaults to `""`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (`str`, *optional*, defaults to `""`): + The token used for padding, for example when batching sequences of different lengths. + mask_token (`str`, *optional*, defaults to `""`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + sp_model_kwargs (`dict`, *optional*): + Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for + SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, + to set: + + - `enable_sampling`: Enable subword regularization. + - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - `nbest_size = {0,1}`: No sampling is performed. 
+ - `nbest_size > 1`: samples from the nbest_size results. + - `nbest_size < 0`: assuming that nbest_size is infinite and samples from all hypotheses (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. + + Attributes: + sp_model (`SentencePieceProcessor`): + The *SentencePiece* processor that is used for every conversion (string, tokens and IDs). + """ + + vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "attention_mask"] + + def __init__( + self, + vocab_file, + monolingual_vocab_file, + bos_token="<s>", + eos_token="</s>", + sep_token="</s>", + cls_token="<s>", + unk_token="<unk>", + pad_token="<pad>", + mask_token="<mask>", + sp_model_kwargs: Optional[dict[str, Any]] = None, + **kwargs, + ) -> None: + # Mask token behaves like a normal word, i.e. include the space before it + mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + + self.vocab_file = vocab_file + self.monolingual_vocab_file = monolingual_vocab_file + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(str(vocab_file)) + + # Load the reduced vocab + + # Keep order of special tokens for backward compatibility + self.fairseq_tokens_to_ids = {} + cnt = 0 + for token in [bos_token, pad_token, eos_token, unk_token, sep_token, cls_token]: + if str(token) not in self.fairseq_tokens_to_ids: + self.fairseq_tokens_to_ids[str(token)] = cnt + cnt += 1 + with open(monolingual_vocab_file, "r", encoding="utf-8") as f: + for line in f.readlines(): + token = line.strip().split()[0] + self.fairseq_tokens_to_ids[token] = len(self.fairseq_tokens_to_ids) + if str(mask_token) not in self.fairseq_tokens_to_ids: + self.fairseq_tokens_to_ids[str(mask_token)] = len(self.fairseq_tokens_to_ids) + + self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()} + + super().__init__( + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + mask_token=mask_token, + sp_model_kwargs=self.sp_model_kwargs, + **kwargs, + ) + + def __getstate__(self): + state = self.__dict__.copy() + state["sp_model"] = None + state["sp_model_proto"] = self.sp_model.serialized_model_proto() + return state + + def __setstate__(self, d): + self.__dict__ = d + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.LoadFromSerializedProto(self.sp_model_proto) + + def build_inputs_with_special_tokens( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and + adding special tokens. A BARTPho sequence has the following format: + + - single sequence: `<s> X </s>` + - pair of sequences: `<s> A </s></s> B </s>` + + Args: + token_ids_0 (`list[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`list[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
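A minimal illustrative sketch of this layout, assuming hypothetical ids 0 and 2 for the `<s>` and `</s>` tokens (stand-ins for `self.cls_token_id` and `self.sep_token_id`, not values from the source):

    cls_id, sep_id = 0, 2                  # hypothetical <s> and </s> ids
    ids_a, ids_b = [10, 11], [20, 21, 22]  # already-converted token ids for A and B
    single = [cls_id] + ids_a + [sep_id]                             # <s> A </s>
    pair = [cls_id] + ids_a + [sep_id, sep_id] + ids_b + [sep_id]    # <s> A </s></s> B </s>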
+ """ + + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + sep + token_ids_1 + sep + + def get_special_tokens_mask( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False + ) -> list[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`list[int]`): + List of IDs. + token_ids_1 (`list[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] + + def create_token_type_ids_from_sequences( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. BARTPho does not + make use of token type ids, therefore a list of zeros is returned. + + Args: + token_ids_0 (`list[int]`): + List of IDs. + token_ids_1 (`list[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `list[int]`: List of zeros. 
+ + """ + + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] + + @property + def vocab_size(self): + return len(self.fairseq_ids_to_tokens) + + def get_vocab(self): + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text: str) -> list[str]: + return self.sp_model.encode(text, out_type=str) + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + if token in self.fairseq_tokens_to_ids: + return self.fairseq_tokens_to_ids[token] + else: + return self.unk_token_id + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.fairseq_ids_to_tokens[index] + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (strings for sub-words) in a single string.""" + out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() + return out_string + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + out_monolingual_vocab_file = os.path.join( + save_directory, + (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["monolingual_vocab_file"], + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file): + copyfile(self.vocab_file, out_vocab_file) + elif not os.path.isfile(self.vocab_file): + with open(out_vocab_file, "wb") as fi: + content_spiece_model = self.sp_model.serialized_model_proto() + fi.write(content_spiece_model) + + if os.path.abspath(self.monolingual_vocab_file) != os.path.abspath( + out_monolingual_vocab_file + ) and os.path.isfile(self.monolingual_vocab_file): + copyfile(self.monolingual_vocab_file, out_monolingual_vocab_file) + elif not os.path.isfile(self.monolingual_vocab_file): + with open(out_monolingual_vocab_file, "w", encoding="utf-8") as fp: + for token in self.fairseq_tokens_to_ids: + if token not in self.all_special_tokens: + fp.write(f"{str(token)} \n") + + return out_vocab_file, out_monolingual_vocab_file + + +__all__ = ["BartphoTokenizer"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bert_japanese/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bert_japanese/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f5296087db1d007eab946f795d0c9c8fa4bdaafe --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bert_japanese/__init__.py @@ -0,0 +1,26 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .tokenization_bert_japanese import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bert_japanese/tokenization_bert_japanese.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bert_japanese/tokenization_bert_japanese.py new file mode 100644 index 0000000000000000000000000000000000000000..cacacd87574a28527c5dad2c2c5391c3babc9b8b --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bert_japanese/tokenization_bert_japanese.py @@ -0,0 +1,952 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes.""" + +import collections +import copy +import os +import unicodedata +from typing import Any, Optional + +from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace +from ...utils import is_sentencepiece_available, is_sudachi_projection_available, logging + + +if is_sentencepiece_available(): + import sentencepiece as spm +else: + spm = None + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "spm_file": "spiece.model"} + +SPIECE_UNDERLINE = "▁" + + +# Copied from transformers.models.bert.tokenization_bert.load_vocab +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + with open(vocab_file, "r", encoding="utf-8") as reader: + tokens = reader.readlines() + for index, token in enumerate(tokens): + token = token.rstrip("\n") + vocab[token] = index + return vocab + + +# Copied from transformers.models.bert.tokenization_bert.whitespace_tokenize +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a piece of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +class BertJapaneseTokenizer(PreTrainedTokenizer): + r""" + Construct a BERT tokenizer for Japanese text. + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer + to: this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + Path to a one-wordpiece-per-line vocabulary file. + spm_file (`str`, *optional*): + Path to [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .spm or .model + extension) that contains the vocabulary. + do_lower_case (`bool`, *optional*, defaults to `True`): + Whether to lower case the input. Only has an effect when do_basic_tokenize=True. 
+ do_word_tokenize (`bool`, *optional*, defaults to `True`): + Whether to do word tokenization. + do_subword_tokenize (`bool`, *optional*, defaults to `True`): + Whether to do subword tokenization. + word_tokenizer_type (`str`, *optional*, defaults to `"basic"`): + Type of word tokenizer. Choose from ["basic", "mecab", "sudachi", "jumanpp"]. + subword_tokenizer_type (`str`, *optional*, defaults to `"wordpiece"`): + Type of subword tokenizer. Choose from ["wordpiece", "character", "sentencepiece",]. + mecab_kwargs (`dict`, *optional*): + Dictionary passed to the `MecabTokenizer` constructor. + sudachi_kwargs (`dict`, *optional*): + Dictionary passed to the `SudachiTokenizer` constructor. + jumanpp_kwargs (`dict`, *optional*): + Dictionary passed to the `JumanppTokenizer` constructor. + """ + + vocab_files_names = VOCAB_FILES_NAMES + + def __init__( + self, + vocab_file, + spm_file=None, + do_lower_case=False, + do_word_tokenize=True, + do_subword_tokenize=True, + word_tokenizer_type="basic", + subword_tokenizer_type="wordpiece", + never_split=None, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + mecab_kwargs=None, + sudachi_kwargs=None, + jumanpp_kwargs=None, + **kwargs, + ): + if subword_tokenizer_type == "sentencepiece": + if not os.path.isfile(spm_file): + raise ValueError( + f"Can't find a vocabulary file at path '{spm_file}'. To load the vocabulary from a Google" + " pretrained model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`" + ) + self.spm_file = spm_file + else: + if not os.path.isfile(vocab_file): + raise ValueError( + f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google" + " pretrained model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`" + ) + self.vocab = load_vocab(vocab_file) + self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()]) + + self.do_word_tokenize = do_word_tokenize + self.word_tokenizer_type = word_tokenizer_type + self.lower_case = do_lower_case + self.never_split = never_split + self.mecab_kwargs = copy.deepcopy(mecab_kwargs) + self.sudachi_kwargs = copy.deepcopy(sudachi_kwargs) + self.jumanpp_kwargs = copy.deepcopy(jumanpp_kwargs) + if do_word_tokenize: + if word_tokenizer_type == "basic": + self.word_tokenizer = BasicTokenizer( + do_lower_case=do_lower_case, never_split=never_split, tokenize_chinese_chars=False + ) + elif word_tokenizer_type == "mecab": + self.word_tokenizer = MecabTokenizer( + do_lower_case=do_lower_case, never_split=never_split, **(mecab_kwargs or {}) + ) + elif word_tokenizer_type == "sudachi": + self.word_tokenizer = SudachiTokenizer( + do_lower_case=do_lower_case, never_split=never_split, **(sudachi_kwargs or {}) + ) + elif word_tokenizer_type == "jumanpp": + self.word_tokenizer = JumanppTokenizer( + do_lower_case=do_lower_case, never_split=never_split, **(jumanpp_kwargs or {}) + ) + else: + raise ValueError(f"Invalid word_tokenizer_type '{word_tokenizer_type}' is specified.") + + self.do_subword_tokenize = do_subword_tokenize + self.subword_tokenizer_type = subword_tokenizer_type + if do_subword_tokenize: + if subword_tokenizer_type == "wordpiece": + self.subword_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token)) + elif subword_tokenizer_type == "character": + self.subword_tokenizer = CharacterTokenizer(vocab=self.vocab, unk_token=str(unk_token)) + elif subword_tokenizer_type == "sentencepiece": + self.subword_tokenizer = 
SentencepieceTokenizer(vocab=self.spm_file, unk_token=str(unk_token)) + else: + raise ValueError(f"Invalid subword_tokenizer_type '{subword_tokenizer_type}' is specified.") + super().__init__( + spm_file=spm_file, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + do_lower_case=do_lower_case, + do_word_tokenize=do_word_tokenize, + do_subword_tokenize=do_subword_tokenize, + word_tokenizer_type=word_tokenizer_type, + subword_tokenizer_type=subword_tokenizer_type, + never_split=never_split, + mecab_kwargs=mecab_kwargs, + sudachi_kwargs=sudachi_kwargs, + jumanpp_kwargs=jumanpp_kwargs, + **kwargs, + ) + + @property + def do_lower_case(self): + return self.lower_case + + def __getstate__(self): + state = dict(self.__dict__) + if self.word_tokenizer_type in ["mecab", "sudachi", "jumanpp"]: + del state["word_tokenizer"] + return state + + def __setstate__(self, state): + self.__dict__ = state + if self.word_tokenizer_type == "mecab": + self.word_tokenizer = MecabTokenizer( + do_lower_case=self.do_lower_case, never_split=self.never_split, **(self.mecab_kwargs or {}) + ) + elif self.word_tokenizer_type == "sudachi": + self.word_tokenizer = SudachiTokenizer( + do_lower_case=self.do_lower_case, never_split=self.never_split, **(self.sudachi_kwargs or {}) + ) + elif self.word_tokenizer_type == "jumanpp": + self.word_tokenizer = JumanppTokenizer( + do_lower_case=self.do_lower_case, never_split=self.never_split, **(self.jumanpp_kwargs or {}) + ) + + def _tokenize(self, text): + if self.do_word_tokenize: + tokens = self.word_tokenizer.tokenize(text, never_split=self.all_special_tokens) + else: + tokens = [text] + + if self.do_subword_tokenize: + split_tokens = [sub_token for token in tokens for sub_token in self.subword_tokenizer.tokenize(token)] + else: + split_tokens = tokens + + return split_tokens + + @property + def vocab_size(self): + if self.subword_tokenizer_type == "sentencepiece": + return len(self.subword_tokenizer.sp_model) + return len(self.vocab) + + def get_vocab(self): + if self.subword_tokenizer_type == "sentencepiece": + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + return dict(self.vocab, **self.added_tokens_encoder) + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + if self.subword_tokenizer_type == "sentencepiece": + return self.subword_tokenizer.sp_model.PieceToId(token) + return self.vocab.get(token, self.vocab.get(self.unk_token)) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + if self.subword_tokenizer_type == "sentencepiece": + return self.subword_tokenizer.sp_model.IdToPiece(index) + return self.ids_to_tokens.get(index, self.unk_token) + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + if self.subword_tokenizer_type == "sentencepiece": + return self.subword_tokenizer.sp_model.decode(tokens) + out_string = " ".join(tokens).replace(" ##", "").strip() + return out_string + + # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.build_inputs_with_special_tokens + def build_inputs_with_special_tokens( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. 
A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + token_ids_1 + sep + + # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.get_special_tokens_mask + def get_special_tokens_mask( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False + ) -> list[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + if os.path.isdir(save_directory): + if self.subword_tokenizer_type == "sentencepiece": + vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["spm_file"] + ) + else: + vocab_file = os.path.join( + save_directory, + (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"], + ) + else: + vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory + + if self.subword_tokenizer_type == "sentencepiece": + with open(vocab_file, "wb") as writer: + content_spiece_model = self.subword_tokenizer.sp_model.serialized_model_proto() + writer.write(content_spiece_model) + else: + with open(vocab_file, "w", encoding="utf-8") as writer: + index = 0 + for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning( + f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive." + " Please check that the vocabulary is not corrupted!" + ) + index = token_index + writer.write(token + "\n") + index += 1 + return (vocab_file,) + + +class MecabTokenizer: + """Runs basic tokenization with MeCab morphological parser.""" + + def __init__( + self, + do_lower_case=False, + never_split=None, + normalize_text=True, + mecab_dic: Optional[str] = "unidic_lite", + mecab_option: Optional[str] = None, + ): + """ + Constructs a MecabTokenizer. + + Args: + **do_lower_case**: (*optional*) boolean (default True) + Whether to lowercase the input. + **never_split**: (*optional*) list of str + Kept for backward compatibility purposes. 
Now implemented directly at the base class level (see + [`PreTrainedTokenizer.tokenize`]) List of tokens not to split. + **normalize_text**: (*optional*) boolean (default True) + Whether to apply unicode normalization to text before tokenization. + **mecab_dic**: (*optional*) string (default "ipadic") + Name of dictionary to be used for MeCab initialization. If you are using a system-installed dictionary, + set this option to `None` and modify *mecab_option*. + **mecab_option**: (*optional*) string + String passed to MeCab constructor. + """ + self.do_lower_case = do_lower_case + self.never_split = never_split if never_split is not None else [] + self.normalize_text = normalize_text + + try: + import fugashi + except ModuleNotFoundError as error: + raise error.__class__( + "You need to install fugashi to use MecabTokenizer. " + "See https://pypi.org/project/fugashi/ for installation." + ) + + mecab_option = mecab_option or "" + + if mecab_dic is not None: + if mecab_dic == "ipadic": + try: + import ipadic + except ModuleNotFoundError as error: + raise error.__class__( + "The ipadic dictionary is not installed. " + "See https://github.com/polm/ipadic-py for installation." + ) + + dic_dir = ipadic.DICDIR + + elif mecab_dic == "unidic_lite": + try: + import unidic_lite + except ModuleNotFoundError as error: + raise error.__class__( + "The unidic_lite dictionary is not installed. " + "See https://github.com/polm/unidic-lite for installation." + ) + + dic_dir = unidic_lite.DICDIR + + elif mecab_dic == "unidic": + try: + import unidic + except ModuleNotFoundError as error: + raise error.__class__( + "The unidic dictionary is not installed. " + "See https://github.com/polm/unidic-py for installation." + ) + + dic_dir = unidic.DICDIR + if not os.path.isdir(dic_dir): + raise RuntimeError( + "The unidic dictionary itself is not found. " + "See https://github.com/polm/unidic-py for installation." + ) + + else: + raise ValueError("Invalid mecab_dic is specified.") + + mecabrc = os.path.join(dic_dir, "mecabrc") + mecab_option = f'-d "{dic_dir}" -r "{mecabrc}" ' + mecab_option + + self.mecab = fugashi.GenericTagger(mecab_option) + + def tokenize(self, text, never_split=None, **kwargs): + """Tokenizes a piece of text.""" + if self.normalize_text: + text = unicodedata.normalize("NFKC", text) + + never_split = self.never_split + (never_split if never_split is not None else []) + tokens = [] + + for word in self.mecab(text): + token = word.surface + + if self.do_lower_case and token not in never_split: + token = token.lower() + + tokens.append(token) + + return tokens + + +class SudachiTokenizer: + """Runs basic tokenization with Sudachi morphological parser.""" + + def __init__( + self, + do_lower_case=False, + never_split=None, + normalize_text=True, + trim_whitespace=False, + sudachi_split_mode="A", + sudachi_config_path=None, + sudachi_resource_dir=None, + sudachi_dict_type="core", + sudachi_projection=None, + ): + """ + Constructs a SudachiTokenizer. + + Args: + **do_lower_case**: (*optional*) boolean (default True) + Whether to lowercase the input. + **never_split**: (*optional*) list of str + Kept for backward compatibility purposes. Now implemented directly at the base class level (see + [`PreTrainedTokenizer.tokenize`]) List of tokens not to split. + **normalize_text**: (*optional*) boolean (default True) + Whether to apply unicode normalization to text before tokenization. + **trim_whitespace**: (*optional*) boolean (default False) + Whether to trim all whitespace, tab, newline from tokens. 
+ **sudachi_split_mode**: (*optional*) string + Split mode of sudachi, choose from `["A", "B", "C"]`. + **sudachi_config_path**: (*optional*) string + **sudachi_resource_dir**: (*optional*) string + **sudachi_dict_type**: (*optional*) string + dict type of sudachi, choose from `["small", "core", "full"]`. + **sudachi_projection**: (*optional*) string + Word projection mode of sudachi, choose from `["surface", "normalized", "reading", "dictionary", "dictionary_and_surface", "normalized_and_surface", "normalized_nouns"]`. + """ + + self.do_lower_case = do_lower_case + self.never_split = never_split if never_split is not None else [] + self.normalize_text = normalize_text + self.trim_whitespace = trim_whitespace + + try: + from sudachipy import dictionary, tokenizer + except ImportError: + raise ImportError( + "You need to install sudachipy to use SudachiTokenizer. " + "See https://github.com/WorksApplications/SudachiPy for installation." + ) + + if sudachi_split_mode == "A": + self.split_mode = tokenizer.Tokenizer.SplitMode.A + elif sudachi_split_mode == "B": + self.split_mode = tokenizer.Tokenizer.SplitMode.B + elif sudachi_split_mode == "C": + self.split_mode = tokenizer.Tokenizer.SplitMode.C + else: + raise ValueError("Invalid sudachi_split_mode is specified.") + + self.projection = sudachi_projection + + sudachi_dictionary = dictionary.Dictionary( + config_path=sudachi_config_path, resource_dir=sudachi_resource_dir, dict=sudachi_dict_type + ) + if is_sudachi_projection_available(): + self.sudachi = sudachi_dictionary.create(self.split_mode, projection=self.projection) + elif self.projection is not None: + raise ImportError("You need to install sudachipy>=0.6.8 to specify `projection` field in sudachi_kwargs.") + else: + self.sudachi = sudachi_dictionary.create(self.split_mode) + + def tokenize(self, text, never_split=None, **kwargs): + """Tokenizes a piece of text.""" + if self.normalize_text: + text = unicodedata.normalize("NFKC", text) + + never_split = self.never_split + (never_split if never_split is not None else []) + tokens = [] + + for word in self.sudachi.tokenize(text): + token = word.surface() + + if self.do_lower_case and token not in never_split: + token = token.lower() + + if self.trim_whitespace: + if token.strip() == "": + continue + else: + token = token.strip() + + tokens.append(token) + + return tokens + + +class JumanppTokenizer: + """Runs basic tokenization with jumanpp morphological parser.""" + + def __init__( + self, + do_lower_case=False, + never_split=None, + normalize_text=True, + trim_whitespace=False, + ): + """ + Constructs a JumanppTokenizer. + + Args: + **do_lower_case**: (*optional*) boolean (default True) + Whether to lowercase the input. + **never_split**: (*optional*) list of str + Kept for backward compatibility purposes. Now implemented directly at the base class level (see + [`PreTrainedTokenizer.tokenize`]) List of tokens not to split. + **normalize_text**: (*optional*) boolean (default True) + Whether to apply unicode normalization to text before tokenization. + **trim_whitespace**: (*optional*) boolean (default False) + Whether to trim all whitespace, tab, newline from tokens. + """ + + self.do_lower_case = do_lower_case + self.never_split = never_split if never_split is not None else [] + self.normalize_text = normalize_text + self.trim_whitespace = trim_whitespace + + try: + import rhoknp + except ImportError: + raise ImportError( + "You need to install rhoknp to use JumanppTokenizer. " + "See https://github.com/ku-nlp/rhoknp for installation." 
+ ) + + self.juman = rhoknp.Jumanpp() + + def tokenize(self, text, never_split=None, **kwargs): + """Tokenizes a piece of text.""" + if self.normalize_text: + text = unicodedata.normalize("NFKC", text) + + text = text.strip() + + never_split = self.never_split + (never_split if never_split is not None else []) + tokens = [] + + for mrph in self.juman.apply_to_sentence(text).morphemes: + token = mrph.text + + if self.do_lower_case and token not in never_split: + token = token.lower() + + if self.trim_whitespace: + if token.strip() == "": + continue + else: + token = token.strip() + + tokens.append(token) + + return tokens + + +class CharacterTokenizer: + """Runs Character tokenization.""" + + def __init__(self, vocab, unk_token, normalize_text=True): + """ + Constructs a CharacterTokenizer. + + Args: + **vocab**: + Vocabulary object. + **unk_token**: str + A special symbol for out-of-vocabulary token. + **normalize_text**: (`optional`) boolean (default True) + Whether to apply unicode normalization to text before tokenization. + """ + self.vocab = vocab + self.unk_token = unk_token + self.normalize_text = normalize_text + + def tokenize(self, text): + """ + Tokenizes a piece of text into characters. + + For example, `input = "apple""` will return as output `["a", "p", "p", "l", "e"]`. + + Args: + text: A single token or whitespace separated tokens. + This should have already been passed through *BasicTokenizer*. + + Returns: + A list of characters. + """ + if self.normalize_text: + text = unicodedata.normalize("NFKC", text) + + output_tokens = [] + for char in text: + if char not in self.vocab: + output_tokens.append(self.unk_token) + continue + + output_tokens.append(char) + + return output_tokens + + +# Copied from transformers.models.bert.tokenization_bert.BasicTokenizer +class BasicTokenizer: + """ + Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.). + + Args: + do_lower_case (`bool`, *optional*, defaults to `True`): + Whether or not to lowercase the input when tokenizing. + never_split (`Iterable`, *optional*): + Collection of tokens which will never be split during tokenization. Only has an effect when + `do_basic_tokenize=True` + tokenize_chinese_chars (`bool`, *optional*, defaults to `True`): + Whether or not to tokenize Chinese characters. + + This should likely be deactivated for Japanese (see this + [issue](https://github.com/huggingface/transformers/issues/328)). + strip_accents (`bool`, *optional*): + Whether or not to strip all accents. If this option is not specified, then it will be determined by the + value for `lowercase` (as in the original BERT). + do_split_on_punc (`bool`, *optional*, defaults to `True`): + In some instances we want to skip the basic punctuation splitting so that later tokenization can capture + the full context of the words, such as contractions. + """ + + def __init__( + self, + do_lower_case=True, + never_split=None, + tokenize_chinese_chars=True, + strip_accents=None, + do_split_on_punc=True, + ): + if never_split is None: + never_split = [] + self.do_lower_case = do_lower_case + self.never_split = set(never_split) + self.tokenize_chinese_chars = tokenize_chinese_chars + self.strip_accents = strip_accents + self.do_split_on_punc = do_split_on_punc + + def tokenize(self, text, never_split=None): + """ + Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer. + + Args: + never_split (`List[str]`, *optional*) + Kept for backward compatibility purposes. 
Now implemented directly at the base class level (see + [`PreTrainedTokenizer.tokenize`]) List of token not to split. + """ + # union() returns a new set by concatenating the two sets. + never_split = self.never_split.union(set(never_split)) if never_split else self.never_split + text = self._clean_text(text) + + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). + if self.tokenize_chinese_chars: + text = self._tokenize_chinese_chars(text) + # prevents treating the same character with different unicode codepoints as different characters + unicode_normalized_text = unicodedata.normalize("NFC", text) + orig_tokens = whitespace_tokenize(unicode_normalized_text) + split_tokens = [] + for token in orig_tokens: + if token not in never_split: + if self.do_lower_case: + token = token.lower() + if self.strip_accents is not False: + token = self._run_strip_accents(token) + elif self.strip_accents: + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token, never_split)) + + output_tokens = whitespace_tokenize(" ".join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text, never_split=None): + """Splits punctuation on a piece of text.""" + if not self.do_split_on_punc or (never_split is not None and text in never_split): + return [text] + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. 
+ if ( + (cp >= 0x4E00 and cp <= 0x9FFF) + or (cp >= 0x3400 and cp <= 0x4DBF) + or (cp >= 0x20000 and cp <= 0x2A6DF) + or (cp >= 0x2A700 and cp <= 0x2B73F) + or (cp >= 0x2B740 and cp <= 0x2B81F) + or (cp >= 0x2B820 and cp <= 0x2CEAF) + or (cp >= 0xF900 and cp <= 0xFAFF) + or (cp >= 0x2F800 and cp <= 0x2FA1F) + ): + return True + + return False + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xFFFD or _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + + +# Copied from transformers.models.bert.tokenization_bert.WordpieceTokenizer +class WordpieceTokenizer: + """Runs WordPiece tokenization.""" + + def __init__(self, vocab, unk_token, max_input_chars_per_word=100): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, text): + """ + Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform + tokenization using the given vocabulary. + + For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`. + + Args: + text: A single token or whitespace separated tokens. This should have + already been passed through *BasicTokenizer*. + + Returns: + A list of wordpiece tokens. + """ + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = "".join(chars[start:end]) + if start > 0: + substr = "##" + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens + + +class SentencepieceTokenizer: + """ + Runs sentencepiece tokenization. Based on transformers.models.albert.tokenization_albert.AlbertTokenizer. + """ + + def __init__( + self, + vocab, + unk_token, + do_lower_case=False, + remove_space=True, + keep_accents=True, + sp_model_kwargs: Optional[dict[str, Any]] = None, + ): + self.vocab = vocab + self.unk_token = unk_token + self.do_lower_case = do_lower_case + self.remove_space = remove_space + self.keep_accents = keep_accents + + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(self.vocab) + + def preprocess_text(self, inputs): + if self.remove_space: + outputs = " ".join(inputs.strip().split()) + else: + outputs = inputs + outputs = outputs.replace("``", '"').replace("''", '"') + + if not self.keep_accents: + outputs = unicodedata.normalize("NFKD", outputs) + outputs = "".join([c for c in outputs if not unicodedata.combining(c)]) + if self.do_lower_case: + outputs = outputs.lower() + + return outputs + + def tokenize(self, text): + """ + Tokenizes text by sentencepiece. Based on [SentencePiece](https://github.com/google/sentencepiece). + Tokenization needs the given vocabulary. + + Args: + text: A string needs to be tokenized. + + Returns: + A list of sentencepiece tokens. 
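A simplified illustration, with made-up pieces and no SentencePiece model, of the post-processing rule applied below: a piece whose last character is a comma preceded by a digit is split so that the trailing comma becomes its own piece (the real code re-encodes the digit part with the SentencePiece model; here it is simply kept as one piece):

    pieces = ["▁1", "9,", "000"]  # hypothetical raw pieces for "19,000"
    new_pieces = []
    for piece in pieces:
        if len(piece) > 1 and piece[-1] == "," and piece[-2].isdigit():
            new_pieces.extend([piece[:-1], piece[-1]])
        else:
            new_pieces.append(piece)
    # new_pieces == ["▁1", "9", ",", "000"]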
+ """ + text = self.preprocess_text(text) + pieces = self.sp_model.encode(text, out_type=str) + new_pieces = [] + for piece in pieces: + if len(piece) > 1 and piece[-1] == "," and piece[-2].isdigit(): + cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, "")) + if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE: + if len(cur_pieces[0]) == 1: + cur_pieces = cur_pieces[1:] + else: + cur_pieces[0] = cur_pieces[0][1:] + cur_pieces.append(piece[-1]) + new_pieces.extend(cur_pieces) + else: + new_pieces.append(piece) + + return new_pieces + + +__all__ = ["BertJapaneseTokenizer", "CharacterTokenizer", "MecabTokenizer"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/big_bird/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/big_bird/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..87419e69e5c7f0572076832237e10f236186ec33 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/big_bird/__init__.py @@ -0,0 +1,30 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_big_bird import * + from .modeling_big_bird import * + from .modeling_flax_big_bird import * + from .tokenization_big_bird import * + from .tokenization_big_bird_fast import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/big_bird/configuration_big_bird.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/big_bird/configuration_big_bird.py new file mode 100644 index 0000000000000000000000000000000000000000..8e29439bc4bd54f06689b7a3e93008baf93427ad --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/big_bird/configuration_big_bird.py @@ -0,0 +1,176 @@ +# coding=utf-8 +# Copyright 2021 Google Research and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""BigBird model configuration""" + +from collections import OrderedDict +from collections.abc import Mapping + +from ...configuration_utils import PretrainedConfig +from ...onnx import OnnxConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class BigBirdConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`BigBirdModel`]. It is used to instantiate an + BigBird model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the BigBird + [google/bigbird-roberta-base](https://huggingface.co/google/bigbird-roberta-base) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 50358): + Vocabulary size of the BigBird model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`BigBirdModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimension of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu_new"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 4096): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 1024 or 2048 or 4096). + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling [`BigBirdModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + is_decoder (`bool`, *optional*, defaults to `False`): + Whether the model is used as a decoder or not. If `False`, the model is used as an encoder. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + attention_type (`str`, *optional*, defaults to `"block_sparse"`) + Whether to use block sparse attention (with n complexity) as introduced in paper or original attention + layer (with n^2 complexity). Possible values are `"original_full"` and `"block_sparse"`. + use_bias (`bool`, *optional*, defaults to `True`) + Whether to use bias in query, key, value. 
+ rescale_embeddings (`bool`, *optional*, defaults to `False`) + Whether to rescale embeddings with (hidden_size ** 0.5). + block_size (`int`, *optional*, defaults to 64) + Size of each block. Useful only when `attention_type == "block_sparse"`. + num_random_blocks (`int`, *optional*, defaults to 3) + Each query is going to attend these many number of random blocks. Useful only when `attention_type == + "block_sparse"`. + classifier_dropout (`float`, *optional*): + The dropout ratio for the classification head. + + Example: + + ```python + >>> from transformers import BigBirdConfig, BigBirdModel + + >>> # Initializing a BigBird google/bigbird-roberta-base style configuration + >>> configuration = BigBirdConfig() + + >>> # Initializing a model (with random weights) from the google/bigbird-roberta-base style configuration + >>> model = BigBirdModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "big_bird" + + def __init__( + self, + vocab_size=50358, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu_new", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=4096, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + use_cache=True, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + sep_token_id=66, + attention_type="block_sparse", + use_bias=True, + rescale_embeddings=False, + block_size=64, + num_random_blocks=3, + classifier_dropout=None, + **kwargs, + ): + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + sep_token_id=sep_token_id, + **kwargs, + ) + + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.type_vocab_size = type_vocab_size + self.layer_norm_eps = layer_norm_eps + self.use_cache = use_cache + + self.rescale_embeddings = rescale_embeddings + self.attention_type = attention_type + self.use_bias = use_bias + self.block_size = block_size + self.num_random_blocks = num_random_blocks + self.classifier_dropout = classifier_dropout + + +class BigBirdOnnxConfig(OnnxConfig): + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + if self.task == "multiple-choice": + dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"} + else: + dynamic_axis = {0: "batch", 1: "sequence"} + return OrderedDict( + [ + ("input_ids", dynamic_axis), + ("attention_mask", dynamic_axis), + ] + ) + + +__all__ = ["BigBirdConfig", "BigBirdOnnxConfig"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/big_bird/modeling_big_bird.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/big_bird/modeling_big_bird.py new file mode 100644 index 0000000000000000000000000000000000000000..eb89d9872be8816c83158faa4ac81c2238c634d7 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/big_bird/modeling_big_bird.py @@ -0,0 +1,2957 @@ +# coding=utf-8 +# Copyright 2021 Google Research and The HuggingFace Inc. team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch BigBird model.""" + +import math +import os +from dataclasses import dataclass +from typing import Optional, Union + +import numpy as np +import torch +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache +from ...generation import GenerationMixin +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + MaskedLMOutput, + MultipleChoiceModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import apply_chunking_to_forward +from ...utils import ModelOutput, auto_docstring, logging +from ...utils.deprecation import deprecate_kwarg +from .configuration_big_bird import BigBirdConfig + + +logger = logging.get_logger(__name__) + + +_TRIVIA_QA_MAPPING = { + "big_bird_attention": "attention/self", + "output_layer_norm": "output/LayerNorm", + "attention_output": "attention/output/dense", + "output": "output/dense", + "self_attention_layer_norm": "attention/output/LayerNorm", + "intermediate": "intermediate/dense", + "word_embeddings": "bert/embeddings/word_embeddings", + "position_embedding": "bert/embeddings/position_embeddings", + "type_embeddings": "bert/embeddings/token_type_embeddings", + "embeddings": "bert/embeddings", + "layer_normalization": "output/LayerNorm", + "layer_norm": "LayerNorm", + "trivia_qa_head": "qa_classifier", + "dense": "intermediate/dense", + "dense_1": "qa_outputs", +} + + +def load_tf_weights_in_big_bird(model, tf_checkpoint_path, is_trivia_qa=False): + """Load tf checkpoints in a pytorch model.""" + + def load_tf_weights_bert(init_vars, tf_path): + names = [] + tf_weights = {} + + for name, shape in init_vars: + array = tf.train.load_variable(tf_path, name) + name = name.replace("bert/encoder/LayerNorm", "bert/embeddings/LayerNorm") + logger.info(f"Loading TF weight {name} with shape {shape}") + names.append(name) + tf_weights[name] = array + + return names, tf_weights + + def load_tf_weights_trivia_qa(init_vars): + names = [] + tf_weights = {} + + for i, var in enumerate(init_vars): + name_items = var.name.split("/") + + if "transformer_scaffold" in name_items[0]: + layer_name_items = name_items[0].split("_") + if len(layer_name_items) < 3: + layer_name_items += [0] + + name_items[0] = f"bert/encoder/layer_{layer_name_items[2]}" + + name = "/".join([_TRIVIA_QA_MAPPING.get(x, x) for x in name_items])[:-2] # remove last :0 in variable + + if "self/attention/output" in name: + name = name.replace("self/attention/output", "output") + + if i >= len(init_vars) - 2: + name = name.replace("intermediate", "output") + + logger.info(f"Loading TF weight {name} with shape {var.shape}") + array = var.value().numpy() + names.append(name) + tf_weights[name] = 
array + + return names, tf_weights + + try: + import re + + import numpy as np + import tensorflow as tf + except ImportError: + logger.error( + "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions." + ) + raise + tf_path = os.path.abspath(tf_checkpoint_path) + logger.info(f"Converting TensorFlow checkpoint from {tf_path}") + + # Load weights from TF model + init_vars = tf.saved_model.load(tf_path).variables if is_trivia_qa else tf.train.list_variables(tf_path) + + if len(init_vars) <= 0: + raise ValueError("Loaded trained variables cannot be empty.") + + pt_names = list(model.state_dict().keys()) + + if is_trivia_qa: + names, tf_weights = load_tf_weights_trivia_qa(init_vars) + else: + names, tf_weights = load_tf_weights_bert(init_vars, tf_path) + + for txt_name in names: + array = tf_weights[txt_name] + name = txt_name.split("/") + # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v + # which are not required for using pretrained model + if any( + n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"] + for n in name + ): + logger.info(f"Skipping {'/'.join(name)}") + continue + pointer = model + pt_name = [] + for m_name in name: + if re.fullmatch(r"[A-Za-z]+_\d+", m_name): + scope_names = re.split(r"_(\d+)", m_name) + else: + scope_names = [m_name] + if scope_names[0] == "kernel" or scope_names[0] == "gamma": + pointer = getattr(pointer, "weight") + pt_name.append("weight") + elif scope_names[0] == "output_bias" or scope_names[0] == "beta": + pointer = getattr(pointer, "bias") + pt_name.append("bias") + elif scope_names[0] == "output_weights": + pointer = getattr(pointer, "weight") + pt_name.append("weight") + elif scope_names[0] == "squad": + pointer = getattr(pointer, "classifier") + pt_name.append("classifier") + elif scope_names[0] == "transform": + pointer = getattr(pointer, "transform") + pt_name.append("transform") + if ("bias" in name) or ("kernel" in name): + pointer = getattr(pointer, "dense") + pt_name.append("dense") + elif ("beta" in name) or ("gamma" in name): + pointer = getattr(pointer, "LayerNorm") + pt_name.append("LayerNorm") + else: + try: + pointer = getattr(pointer, scope_names[0]) + pt_name.append(f"{scope_names[0]}") + except AttributeError: + logger.info(f"Skipping {m_name}") + continue + if len(scope_names) >= 2: + num = int(scope_names[1]) + pointer = pointer[num] + pt_name.append(f"{num}") + if m_name[-11:] == "_embeddings" or m_name == "embeddings": + pointer = getattr(pointer, "weight") + pt_name.append("weight") + elif m_name == "kernel": + array = np.transpose(array) + try: + if len(array.shape) > len(pointer.shape) and math.prod(array.shape) == math.prod(pointer.shape): + # print(txt_name, array.shape) + if ( + txt_name.endswith("attention/self/key/kernel") + or txt_name.endswith("attention/self/query/kernel") + or txt_name.endswith("attention/self/value/kernel") + ): + array = array.transpose(1, 0, 2).reshape(pointer.shape) + elif txt_name.endswith("attention/output/dense/kernel"): + array = array.transpose(0, 2, 1).reshape(pointer.shape) + else: + array = array.reshape(pointer.shape) + + if pointer.shape != array.shape: + raise ValueError( + f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched of {txt_name}." 
+ ) + except ValueError as e: + e.args += (pointer.shape, array.shape) + raise + pt_weight_name = ".".join(pt_name) + logger.info(f"Initialize PyTorch weight {pt_weight_name} from {txt_name}.") + pointer.data = torch.from_numpy(array) + tf_weights.pop(txt_name, None) + pt_names.remove(pt_weight_name) + + logger.info(f"Weights not copied to PyTorch model: {', '.join(tf_weights.keys())}.") + logger.info(f"Weights not initialized in PyTorch model: {', '.join(pt_names)}.") + return model + + +class BigBirdEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.__init__ + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) + self.register_buffer( + "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False + ) + # End copy + + self.rescale_embeddings = config.rescale_embeddings + self.hidden_size = config.hidden_size + + def forward( + self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0 + ): + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] + + # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs + # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves + # issue #5664 + if token_type_ids is None: + if hasattr(self, "token_type_ids"): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + + if self.rescale_embeddings: + inputs_embeds = inputs_embeds * (self.hidden_size**0.5) + + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + + embeddings = self.dropout(embeddings) + embeddings = self.LayerNorm(embeddings) + return embeddings + + +class BigBirdSelfAttention(nn.Module): + def __init__(self, config, layer_idx=None): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 
and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.use_bias) + self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.use_bias) + self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.use_bias) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.is_decoder = config.is_decoder + self.layer_idx = layer_idx + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + output_attentions=False, + cache_position=None, + ): + batch_size, seq_length, _ = hidden_states.shape + query_layer = ( + self.query(hidden_states) + .view(batch_size, -1, self.num_attention_heads, self.attention_head_size) + .transpose(1, 2) + ) + + is_cross_attention = encoder_hidden_states is not None + current_states = encoder_hidden_states if is_cross_attention else hidden_states + attention_mask = encoder_attention_mask if is_cross_attention else attention_mask + if is_cross_attention and past_key_values is not None and past_key_values.get_seq_length(self.layer_idx) > 0: + # reuse k,v, cross_attentions + key_layer = past_key_values.layers[self.layer_idx].keys + value_layer = past_key_values.layers[self.layer_idx].values + else: + key_layer = ( + self.key(current_states) + .view(batch_size, -1, self.num_attention_heads, self.attention_head_size) + .transpose(1, 2) + ) + value_layer = ( + self.value(current_states) + .view(batch_size, -1, self.num_attention_heads, self.attention_head_size) + .transpose(1, 2) + ) + + if past_key_values is not None: + # save all key/value_layer to cache to be re-used for fast auto-regressive generation + key_layer, value_layer = past_key_values.update( + key_layer, + value_layer, + self.layer_idx, + ) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BigBirdModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
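+ # Descriptive note: at this point attention_probs has shape
+ # [batch_size, num_attention_heads, query_seq_len, key_seq_len] (key_seq_len includes any
+ # cached key/value positions) and each row sums to 1 after the softmax above.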
+ attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + + return context_layer, attention_probs + + +class BigBirdBlockSparseAttention(nn.Module): + def __init__(self, config, seed=None): + super().__init__() + + self.max_seqlen = config.max_position_embeddings + self.seed = seed + + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + f"The hidden size {config.hidden_size} is not a multiple of the number of attention " + f"heads {config.num_attention_heads}." + ) + + self.num_attention_heads = config.num_attention_heads + self.num_random_blocks = config.num_random_blocks + self.block_size = config.block_size + + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.use_bias) + self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.use_bias) + self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.use_bias) + + def forward( + self, + hidden_states, + band_mask=None, + from_mask=None, + to_mask=None, + from_blocked_mask=None, + to_blocked_mask=None, + output_attentions=None, + ): + # Currently this `class` can't be used in decoder. + + batch_size, seqlen, _ = hidden_states.size() + to_seq_length = from_seq_length = seqlen + from_block_size = to_block_size = self.block_size + + if from_seq_length % from_block_size != 0: + raise ValueError("Query sided sequence length must be multiple of block size") + + if to_seq_length % to_block_size != 0: + raise ValueError("Key/Value sided sequence length must be multiple of block size") + + query_layer = ( + self.query(hidden_states) + .view(batch_size, -1, self.num_attention_heads, self.attention_head_size) + .transpose(1, 2) + ) + key_layer = ( + self.key(hidden_states) + .view(batch_size, -1, self.num_attention_heads, self.attention_head_size) + .transpose(1, 2) + ) + value_layer = ( + self.value(hidden_states) + .view(batch_size, -1, self.num_attention_heads, self.attention_head_size) + .transpose(1, 2) + ) + + context_layer, attention_probs = self.bigbird_block_sparse_attention( + query_layer, + key_layer, + value_layer, + band_mask, + from_mask, + to_mask, + from_blocked_mask, + to_blocked_mask, + self.num_attention_heads, + self.num_random_blocks, + self.attention_head_size, + from_block_size, + to_block_size, + batch_size, + from_seq_length, + to_seq_length, + seed=self.seed, + plan_from_length=None, + plan_num_rand_blocks=None, + output_attentions=output_attentions, + ) + + context_layer = context_layer.contiguous().view(batch_size, from_seq_length, -1) + return context_layer, attention_probs + + @staticmethod + def torch_bmm_nd(inp_1, inp_2, ndim=None): + """Fast nd matrix multiplication""" + # faster replacement of torch.einsum ("bhqk,bhkd->bhqd") + return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view( + inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 1]) + ) + + @staticmethod + def torch_bmm_nd_transpose(inp_1, inp_2, ndim=None): + """Fast nd matrix multiplication 
with transpose""" + # faster replacement of torch.einsum (bhqd,bhkd->bhqk) + return torch.bmm( + inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2) + ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2])) + + def bigbird_block_sparse_attention( + self, + query_layer, + key_layer, + value_layer, + band_mask, + from_mask, + to_mask, + from_blocked_mask, + to_blocked_mask, + n_heads, + n_rand_blocks, + attention_head_size, + from_block_size, + to_block_size, + batch_size, + from_seq_len, + to_seq_len, + seed, + plan_from_length, + plan_num_rand_blocks, + output_attentions, + ): + # BigBird block-sparse attention as suggested in paper + + # ITC: + # global tokens: 2 x block_size + # window tokens: 3 x block_size + # random tokens: num_rand_tokens x block_size + + # ETC: + # global tokens: extra_globals_tokens + 2 x block_size + # window tokens: 3 x block_size + # random tokens: num_rand_tokens x block_size + + # Note: + # 1) Currently, ETC is not supported. + # 2) Window size is fixed to 3 blocks & it can be changed only by + # changing `block_size`. + # 3) Number of global blocks are fixed (2 blocks here) & global tokens can be + # controlled only by `block_size`. + + # attention is calculated separately for q[0], q[1], q[2:-2], q[-2], q[-1] in order to use special trick of shifting tokens (for calculating sliding attention) + # hence following code can be divided into 5 parts. + + if from_seq_len // from_block_size != to_seq_len // to_block_size: + raise ValueError("Error the number of blocks needs to be same!") + + rsqrt_d = 1 / math.sqrt(attention_head_size) + bsz = batch_size + attn_mask_penalty = -10000.0 + + # generate random attention and corresponding masks + np.random.seed(seed) + if from_seq_len in [1024, 3072, 4096]: # old plans used in paper + rand_attn = [ + self._bigbird_block_rand_mask( + self.max_seqlen, self.max_seqlen, from_block_size, to_block_size, n_rand_blocks, last_idx=1024 + )[: (from_seq_len // from_block_size - 2)] + for _ in range(n_heads) + ] + else: + if plan_from_length is None: + plan_from_length, plan_num_rand_blocks = self._get_rand_attn_plan( + from_seq_len, from_block_size, n_rand_blocks + ) + + rand_attn = self._bigbird_block_rand_mask_with_head( + from_seq_length=from_seq_len, + to_seq_length=to_seq_len, + from_block_size=from_block_size, + to_block_size=to_block_size, + num_heads=n_heads, + plan_from_length=plan_from_length, + plan_num_rand_blocks=plan_num_rand_blocks, + ) + + rand_attn = np.stack(rand_attn, axis=0) + rand_attn = torch.tensor(rand_attn, device=query_layer.device, dtype=torch.long) + rand_attn.unsqueeze_(0) + rand_attn = torch.cat([rand_attn for _ in range(batch_size)], dim=0) + + rand_mask = self._create_rand_mask_from_inputs( + from_blocked_mask, to_blocked_mask, rand_attn, n_heads, n_rand_blocks, bsz, from_seq_len, from_block_size + ) + + blocked_query_matrix = query_layer.view(bsz, n_heads, from_seq_len // from_block_size, from_block_size, -1) + blocked_key_matrix = key_layer.view(bsz, n_heads, to_seq_len // to_block_size, to_block_size, -1) + blocked_value_matrix = value_layer.view(bsz, n_heads, to_seq_len // to_block_size, to_block_size, -1) + + # preparing block for randn attn + gathered_key = self.torch_gather_b2(blocked_key_matrix, rand_attn) + gathered_key = gathered_key.view( + bsz, n_heads, to_seq_len // to_block_size - 2, n_rand_blocks * to_block_size, -1 + ) # [bsz, n_heads, to_seq_len//to_block_size-2, n_rand_blocks, to_block_size, -1] + gathered_value = 
self.torch_gather_b2(blocked_value_matrix, rand_attn) + gathered_value = gathered_value.view( + bsz, n_heads, to_seq_len // to_block_size - 2, n_rand_blocks * to_block_size, -1 + ) # [bsz, n_heads, to_seq_len//to_block_size-2, n_rand_blocks, to_block_size, -1] + + # 1st PART + # 1st block (global block) attention scores + # q[0] x (k[0], k[1], k[2], k[3], k[4] .... ) + + # [bsz, n_heads, from_block_size, -1] x [bsz, n_heads, to_seq_len, -1] ==> [bsz, n_heads, from_block_size, to_seq_len] + first_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, 0], key_layer, ndim=4) + + first_product = first_product * rsqrt_d + first_product += (1.0 - to_mask) * attn_mask_penalty + first_attn_weights = nn.functional.softmax( + first_product, dim=-1 + ) # [bsz, n_heads, from_block_size, to_seq_len] + + # [bsz, n_heads, from_block_size, to_seq_len] x [bsz, n_heads, to_seq_len, -1] ==> [bsz, n_heads, from_block_size, -1] + first_context_layer = self.torch_bmm_nd(first_attn_weights, value_layer, ndim=4) + first_context_layer.unsqueeze_(2) + + # 2nd PART + # 2nd block attention scores + # q[1] x (sliding_keys, random_keys, global_keys) + # sliding key blocks -> 2nd, 3rd blocks + # global key blocks -> 1st block + + second_key_mat = torch.cat( + [ + blocked_key_matrix[:, :, 0], + blocked_key_matrix[:, :, 1], + blocked_key_matrix[:, :, 2], + blocked_key_matrix[:, :, -1], + gathered_key[:, :, 0], + ], + dim=2, + ) # [bsz, n_heads, (4+n_rand_blocks)*to_block_size, -1] + second_value_mat = torch.cat( + [ + blocked_value_matrix[:, :, 0], + blocked_value_matrix[:, :, 1], + blocked_value_matrix[:, :, 2], + blocked_value_matrix[:, :, -1], + gathered_value[:, :, 0], + ], + dim=2, + ) # [bsz, n_heads, (4+n_rand_blocks)*to_block_size, -1] + + # [bsz, n_heads, from_block_size, -1] x [bsz, n_heads, (4+n_rand_blocks)*to_block_size, -1] ==> [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size] + second_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, 1], second_key_mat, ndim=4) + second_seq_pad = torch.cat( + [ + to_mask[:, :, :, : 3 * to_block_size], + to_mask[:, :, :, -to_block_size:], + to_mask.new_ones([bsz, 1, 1, n_rand_blocks * to_block_size]), + ], + dim=3, + ) + second_rand_pad = torch.cat( + [ + rand_mask.new_ones([bsz, n_heads, from_block_size, 4 * to_block_size]), + rand_mask[:, :, 0], + ], + dim=3, + ) + second_product = second_product * rsqrt_d + second_product += (1.0 - torch.minimum(second_seq_pad, second_rand_pad)) * attn_mask_penalty + second_attn_weights = nn.functional.softmax( + second_product, dim=-1 + ) # [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size] + + # [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size] x [bsz, n_heads, (4+n_rand_blocks)*to_block_size, -1] ==> [bsz, n_heads, from_block_size, -1] + second_context_layer = self.torch_bmm_nd(second_attn_weights, second_value_mat, ndim=4) + + second_context_layer.unsqueeze_(2) + + # 3rd PART + # Middle blocks attention scores + # q[-2:2] x (sliding_keys, random_keys, global_keys) + # sliding attn is calculated using special trick of shifting tokens as discussed in paper + # random keys are generated by taking random indices as per `rand_attn` + # global keys -> 1st & last block + + exp_blocked_key_matrix = torch.cat( + [blocked_key_matrix[:, :, 1:-3], blocked_key_matrix[:, :, 2:-2], blocked_key_matrix[:, :, 3:-1]], dim=3 + ) # [bsz, n_heads, from_seq_len//from_block_size-4, 3*to_block_size, -1] + exp_blocked_value_matrix = torch.cat( + [blocked_value_matrix[:, :, 1:-3], 
blocked_value_matrix[:, :, 2:-2], blocked_value_matrix[:, :, 3:-1]], + dim=3, + ) # [bsz, n_heads, from_seq_len//from_block_size-4, 3*to_block_size, -1] + middle_query_matrix = blocked_query_matrix[:, :, 2:-2] + + # sliding attention scores for q[-2:2] + # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] x [b, n_heads, from_seq_len//from_block_size-4, 3*to_block_size, -1] + inner_band_product = self.torch_bmm_nd_transpose(middle_query_matrix, exp_blocked_key_matrix, ndim=5) + # ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, 3*to_block_size] + inner_band_product = inner_band_product * rsqrt_d + + # randn attention scores for q[-2:2] + # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] x [bsz, n_heads, from_seq_len//from_block_size-4, n_rand_blocks*to_block_size, -1] + rand_band_product = self.torch_bmm_nd_transpose(middle_query_matrix, gathered_key[:, :, 1:-1], ndim=5) + # ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, n_rand_blocks*to_block_size] + rand_band_product = rand_band_product * rsqrt_d + + # Including 1st block (since it's global) + first_band_product = torch.einsum( + "bhlqd,bhkd->bhlqk", middle_query_matrix, blocked_key_matrix[:, :, 0] + ) # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] x [bsz, n_heads, to_block_size, -1] ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, to_block_size] + first_band_product = first_band_product * rsqrt_d + + # Including last block (since it's global) + last_band_product = torch.einsum( + "bhlqd,bhkd->bhlqk", middle_query_matrix, blocked_key_matrix[:, :, -1] + ) # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] x [bsz, n_heads, to_block_size, -1] ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, to_block_size] + last_band_product = last_band_product * rsqrt_d + + # masking padded tokens + inner_band_product += (1.0 - band_mask) * attn_mask_penalty + first_band_product += (1.0 - to_mask[:, :, :, :to_block_size].unsqueeze(3)) * attn_mask_penalty + last_band_product += (1.0 - to_mask[:, :, :, -to_block_size:].unsqueeze(3)) * attn_mask_penalty + rand_band_product += (1.0 - rand_mask[:, :, 1:-1]) * attn_mask_penalty + + # completing attention scores matrix for all q[-2:2] + band_product = torch.cat( + [first_band_product, inner_band_product, rand_band_product, last_band_product], dim=-1 + ) # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, (5+n_rand_blocks)*to_block_size] + + # safely doing softmax since attention matrix is completed + attn_weights = nn.functional.softmax( + band_product, dim=-1 + ) # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, (5+n_rand_blocks)*to_block_size] + + # contribution of sliding keys + # [bsz, n_heads, m//from_block_size-4, from_block_size, 3*to_block_size] x [bsz, n_heads, from_seq_len//from_block_size-4, 3*to_block_size, -1] + context_layer = self.torch_bmm_nd( + attn_weights[:, :, :, :, to_block_size : 4 * to_block_size], exp_blocked_value_matrix, ndim=5 + ) + # ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] + + # adding contribution of random keys + # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, n_rand_blocks*to_block_size] x [bsz, n_heads, from_seq_len//from_block_size-4, n_rand_blocks*to_block_size, -1] + context_layer += self.torch_bmm_nd( + attn_weights[:, :, :, :, 4 * to_block_size : -to_block_size], gathered_value[:, :, 1:-1], ndim=5 + ) + # ==> [bsz, n_heads, 
from_seq_len//from_block_size-4, from_block_size, -1] + + # adding contribution of global keys + context_layer += torch.einsum( + "bhlqk,bhkd->bhlqd", attn_weights[:, :, :, :, :to_block_size], blocked_value_matrix[:, :, 0] + ) # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, to_block_size] x [bsz, n_heads, to_block_size, -1] ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] + context_layer += torch.einsum( + "bhlqk,bhkd->bhlqd", attn_weights[:, :, :, :, -to_block_size:], blocked_value_matrix[:, :, -1] + ) # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, to_block_size] x [bsz, n_heads, to_block_size, -1] ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] + + # 4th PART + # last 2nd token attention scores + # q[-2] x (sliding_keys, random_keys, global_keys) + # sliding key blocks -> last 3 blocks + # global key block -> 1st block + # random key block -> based on indices stored in `randn_attn` + + second_last_key_mat = torch.cat( + [ + blocked_key_matrix[:, :, 0], + blocked_key_matrix[:, :, -3], + blocked_key_matrix[:, :, -2], + blocked_key_matrix[:, :, -1], + gathered_key[:, :, -1], + ], + dim=2, + ) # [bsz, n_heads, (4+n_random_blocks)*to_block_size, -1] + second_last_value_mat = torch.cat( + [ + blocked_value_matrix[:, :, 0], + blocked_value_matrix[:, :, -3], + blocked_value_matrix[:, :, -2], + blocked_value_matrix[:, :, -1], + gathered_value[:, :, -1], + ], + dim=2, + ) # [bsz, n_heads, (4+r)*to_block_size, -1] + + # [bsz, n_heads, from_block_size, -1] x [bsz, n_heads, (4+n_rand_blocks)*to_block_size, -1] ==> [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size] + second_last_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, -2], second_last_key_mat, ndim=4) + second_last_seq_pad = torch.cat( + [ + to_mask[:, :, :, :to_block_size], + to_mask[:, :, :, -3 * to_block_size :], + to_mask.new_ones([bsz, 1, 1, n_rand_blocks * to_block_size]), + ], + dim=3, + ) + second_last_rand_pad = torch.cat( + [ + rand_mask.new_ones([bsz, n_heads, from_block_size, 4 * to_block_size]), + rand_mask[:, :, -1], + ], + dim=3, + ) + second_last_product = second_last_product * rsqrt_d + second_last_product += (1.0 - torch.minimum(second_last_seq_pad, second_last_rand_pad)) * attn_mask_penalty + second_last_attn_weights = nn.functional.softmax( + second_last_product, dim=-1 + ) # [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size] + + # [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size] x [bsz, n_heads, (4+n_rand_blocks)*to_block_size, -1] ==> [bsz, n_heads, from_block_size, -1] + second_last_context_layer = self.torch_bmm_nd(second_last_attn_weights, second_last_value_mat, ndim=4) + second_last_context_layer.unsqueeze_(2) + + # 5th PART + # last block (global) attention scores + # q[-1] x (k[0], k[1], k[2], k[3], .... 
) + + # [bsz, n_heads, from_block_size, -1] x [bsz, n_heads, to_seq_len, -1] ==> [bsz, n_heads, from_block_size, to_seq_len] + last_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, -1], key_layer, ndim=4) + last_product = last_product * rsqrt_d + last_product += (1.0 - to_mask) * attn_mask_penalty + last_attn_weights = nn.functional.softmax(last_product, dim=-1) # [bsz, n_heads, from_block_size, n] + + # [bsz, n_heads, from_block_size, to_seq_len] x [bsz, n_heads, to_seq_len, -1] ==> [bsz, n_heads, from_block_size, -1] + last_context_layer = self.torch_bmm_nd(last_attn_weights, value_layer, ndim=4) + last_context_layer.unsqueeze_(2) + + # combining representations of all tokens + context_layer = torch.cat( + [first_context_layer, second_context_layer, context_layer, second_last_context_layer, last_context_layer], + dim=2, + ) + context_layer = context_layer.view((bsz, n_heads, from_seq_len, -1)) * from_mask + context_layer = torch.transpose(context_layer, 1, 2) + + # this is just for visualizing; forward pass doesn't depend on following code + if output_attentions: + # TODO(PVP): need to verify if below code is correct + attention_probs = torch.zeros( + bsz, n_heads, from_seq_len, to_seq_len, dtype=torch.float, device=context_layer.device + ) + + # 1st query block + # corresponding to `first_context_layer` + attention_probs[:, :, :from_block_size, :] = first_attn_weights # all keys global + + # 2nd query block + # corresponding to `second_context_layer` + attention_probs[:, :, from_block_size : 2 * from_block_size, : 3 * to_block_size] = second_attn_weights[ + :, :, :, : 3 * to_block_size + ] # 1st three key blocks (global + sliding) + attention_probs[:, :, from_block_size : 2 * from_block_size, -to_block_size:] = second_attn_weights[ + :, :, :, 3 * to_block_size : 4 * to_block_size + ] # last key block (global) + # random keys + for p1, i1, w1 in zip(range(bsz), rand_attn, second_attn_weights): + # p1, i1, w1 corresponds to batch_dim i.e. following operation is done for each sequence in batch + for p2, i2, w2 in zip(range(n_heads), i1, w1): + # p2, i2, w2 corresponds to head_dim i.e. 
following operation is done for each heads + attn_probs_view = attention_probs.view( + bsz, + n_heads, + from_seq_len // from_block_size, + from_block_size, + to_seq_len // to_block_size, + to_block_size, + ) + right_slice = w2[:, 4 * to_block_size :] + attn_probs_view[p1, p2, 1, :, i2[0]] = right_slice.view( + from_block_size, n_rand_blocks, to_block_size + ) + + # Middle query blocks + # corresponding to `context_layer` + # sliding keys + for q_idx in range(from_seq_len // from_block_size - 4): + attn_probs_view = attention_probs.view( + bsz, + n_heads, + from_seq_len // from_block_size, + from_block_size, + to_seq_len // to_block_size, + to_block_size, + )[:, :, 2:-2, :, 1:-1, :] + right_slice = attn_weights[:, :, q_idx, :, to_block_size : 4 * to_block_size] + attn_probs_view[:, :, q_idx, :, q_idx : q_idx + 3, :] = right_slice.view( + bsz, n_heads, from_block_size, 3, to_block_size + ) # inner_band_product + # global keys (corresponding to 1st key block) + attention_probs[:, :, 2 * from_block_size : -2 * from_block_size, :to_block_size] = attn_weights[ + :, :, :, :, :to_block_size + ].view(bsz, n_heads, -1, to_block_size) # first_band_product + # global keys (corresponding to last key block) + attention_probs[:, :, 2 * from_block_size : -2 * from_block_size, -to_block_size:] = attn_weights[ + :, :, :, :, -to_block_size: + ].view(bsz, n_heads, -1, to_block_size) # last_band_product + # random keys + for p1, i1, w1 in zip(range(bsz), rand_attn, attn_weights): + # p1, i1, w1 corresponds to batch_dim i.e. following operation is done for each sequence in batch + for p2, i2, w2 in zip(range(n_heads), i1, w1): + # p2, i2, w2 corresponds to head_dim i.e. following operation is done for each heads + for q_idx in range(1, len(i2) - 1): + attn_probs_view = attention_probs.view( + bsz, + n_heads, + from_seq_len // from_block_size, + from_block_size, + to_seq_len // to_block_size, + to_block_size, + ) + right_slice = w2[q_idx - 1, :, 4 * to_block_size : -to_block_size] + attn_probs_view[p1, p2, q_idx + 1, :, i2[q_idx]] = right_slice.view( + from_block_size, n_rand_blocks, to_block_size + ) + + # Second-last query block + # corresponding to `second_last_context_layer` + attention_probs[:, :, -2 * from_block_size : -from_block_size, :to_block_size] = second_last_attn_weights[ + :, :, :, :to_block_size + ] # 1st key block (global) + attention_probs[:, :, -2 * from_block_size : -from_block_size, -3 * to_block_size :] = ( + second_last_attn_weights[:, :, :, to_block_size : 4 * to_block_size] + ) # last three blocks (global + sliding) + # random keys + for p1, i1, w1 in zip(range(bsz), rand_attn, second_last_attn_weights): + # p1, i1, w1 corresponds to batch_dim i.e. following operation is done for each sequence in batch + for p2, i2, w2 in zip(range(n_heads), i1, w1): + # p2, i2, w2 corresponds to head_dim i.e. 
following operation is done for each heads + attn_probs_view = attention_probs.view( + bsz, + n_heads, + from_seq_len // from_block_size, + from_block_size, + to_seq_len // to_block_size, + to_block_size, + ) + right_slice = w2[:, 4 * to_block_size :] + attn_probs_view[p1, p2, -2, :, i2[-1]] = right_slice.view( + from_block_size, n_rand_blocks, to_block_size + ) + + # last query block + # corresponding to `last_context_layer` + attention_probs[:, :, -from_block_size:, :] = last_attn_weights # all keys global + + else: + attention_probs = None + + return context_layer, attention_probs + + @staticmethod + def torch_gather_b2(params, indices): + # this operation is equivalent to tf.gather when batch_dims=2 + + if params.shape[:2] != indices.shape[:2]: + raise ValueError( + "Make sure that the first two dimensions of params and indices are identical, but" + f" they are params: {params.shape[:2]} vs. indices: {indices.shape[:2]}" + ) + num_indices_to_gather = indices.shape[-2] * indices.shape[-1] + num_indices_to_pick_from = params.shape[2] + + shift = torch.arange(indices.shape[0] * indices.shape[1] * num_indices_to_gather, device=indices.device) + indices_shift = torch.div(shift, num_indices_to_gather, rounding_mode="floor") * num_indices_to_pick_from + + flattened_indices = indices.view(-1) + indices_shift + flattened_params = params.reshape(-1, params.shape[-2], params.shape[-1]) + + out_flattened = flattened_params.index_select(0, flattened_indices) + + out = out_flattened.reshape(params.shape[:2] + (num_indices_to_gather,) + params.shape[3:]) + return out + + @staticmethod + def _create_rand_mask_from_inputs( + from_blocked_mask, + to_blocked_mask, + rand_attn, + num_attention_heads, + num_rand_blocks, + batch_size, + from_seq_length, + from_block_size, + ): + """ + Create 3D attention mask from a 2D tensor mask. + + Args: + from_blocked_mask: 2D Tensor of shape [batch_size, + from_seq_length//from_block_size, from_block_size]. + to_blocked_mask: int32 Tensor of shape [batch_size, + to_seq_length//to_block_size, to_block_size]. + rand_attn: [batch_size, num_attention_heads, + from_seq_length//from_block_size-2, num_rand_blocks] + num_attention_heads: int. Number of attention heads. + num_rand_blocks: int. Number of random chunks per row. + batch_size: int. Batch size for computation. + from_seq_length: int. length of from sequence. + from_block_size: int. size of block in from sequence. + + Returns: + float Tensor of shape [batch_size, num_attention_heads, from_seq_length//from_block_size-2, + from_block_size, num_rand_blocks*to_block_size]. + """ + num_windows = from_seq_length // from_block_size - 2 + rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)]) + rand_mask = rand_mask.view(batch_size, num_attention_heads, num_windows, num_rand_blocks * from_block_size) + rand_mask = torch.einsum("blq,bhlk->bhlqk", from_blocked_mask[:, 1:-1], rand_mask) + return rand_mask + + @staticmethod + def _get_rand_attn_plan(from_seq_length, from_block_size, num_rand_blocks): + """ + Gives the plan of where to put random attention. + + Args: + from_seq_length: int. length of from sequence. + from_block_size: int. size of block in from sequence. + num_rand_blocks: int. Number of random chunks per row. 
+ + Returns: + plan_from_length: ending location of from block plan_num_rand_blocks: number of random ending location for + each block + """ + + plan_from_length = [] + plan_num_rand_blocks = [] + if (2 * num_rand_blocks + 5) < (from_seq_length // from_block_size): + plan_from_length.append(int((2 * num_rand_blocks + 5) * from_block_size)) + plan_num_rand_blocks.append(num_rand_blocks) + plan_from_length.append(from_seq_length) + plan_num_rand_blocks.append(0) + elif (num_rand_blocks + 5) < (from_seq_length // from_block_size): + plan_from_length.append(int((num_rand_blocks + 5) * from_block_size)) + plan_num_rand_blocks.append(num_rand_blocks // 2) + plan_from_length.append(from_seq_length) + plan_num_rand_blocks.append(num_rand_blocks - (num_rand_blocks // 2)) + else: + plan_from_length.append(from_seq_length) + plan_num_rand_blocks.append(num_rand_blocks) + + return plan_from_length, plan_num_rand_blocks + + def _bigbird_block_rand_mask( + self, from_seq_length, to_seq_length, from_block_size, to_block_size, num_rand_blocks, last_idx=-1 + ): + """ + Create adjacency list of random attention. + + Args: + from_seq_length: int. length of from sequence. + to_seq_length: int. length of to sequence. + from_block_size: int. size of block in from sequence. + to_block_size: int. size of block in to sequence. + num_rand_blocks: int. Number of random chunks per row. + last_idx: if -1 then num_rand_blocks blocks chosen anywhere in to sequence, + if positive then num_rand_blocks blocks chosen only up to last_idx. + + Returns: + adjacency list of size from_seq_length//from_block_size-2 by num_rand_blocks + """ + # using this method when from_seq_length in [1024, 3072, 4096] + + if from_seq_length // from_block_size != to_seq_length // to_block_size: + raise ValueError("Error the number of blocks needs to be same!") + + rand_attn = np.zeros((from_seq_length // from_block_size - 2, num_rand_blocks), dtype=np.int32) + # During inference (eval) no randomness + if not self.training: + return rand_attn + middle_seq = np.arange(1, to_seq_length // to_block_size - 1, dtype=np.int32) + last = to_seq_length // to_block_size - 1 + if last_idx > (2 * to_block_size): + last = (last_idx // to_block_size) - 1 + + r = num_rand_blocks # shorthand + for i in range(1, from_seq_length // from_block_size - 1): + start = i - 2 + end = i + if i == 1: + rand_attn[i - 1, :] = np.random.permutation(middle_seq[2:last])[:r] + elif i == 2: + rand_attn[i - 1, :] = np.random.permutation(middle_seq[3:last])[:r] + elif i == from_seq_length // from_block_size - 3: + rand_attn[i - 1, :] = np.random.permutation(middle_seq[:last])[:r] + # Missing -3: should have been sliced till last-3 + elif i == from_seq_length // from_block_size - 2: + rand_attn[i - 1, :] = np.random.permutation(middle_seq[:last])[:r] + # Missing -4: should have been sliced till last-4 + else: + if start > last: + start = last + rand_attn[i - 1, :] = np.random.permutation(middle_seq[:start])[:r] + elif (end + 1) == last: + rand_attn[i - 1, :] = np.random.permutation(middle_seq[:start])[:r] + else: + rand_attn[i - 1, :] = np.random.permutation( + np.concatenate((middle_seq[:start], middle_seq[end + 1 : last])) + )[:r] + return rand_attn + + def _bigbird_block_rand_mask_with_head( + self, + from_seq_length, + to_seq_length, + from_block_size, + to_block_size, + num_heads, + plan_from_length, + plan_num_rand_blocks, + window_block_left=1, + window_block_right=1, + global_block_top=1, + global_block_bottom=1, + global_block_left=1, + global_block_right=1, + ): + """ + 
Create adjacency list of random attention. + + Args: + from_seq_length: int. length of from sequence. + to_seq_length: int. length of to sequence. + from_block_size: int. size of block in from sequence. + to_block_size: int. size of block in to sequence. + num_heads: int. total number of heads. + plan_from_length: list. plan from length where num_random_blocks are chosen from. + plan_num_rand_blocks: list. number of rand blocks within the plan. + window_block_left: int. number of blocks of window to left of a block. + window_block_right: int. number of blocks of window to right of a block. + global_block_top: int. number of blocks at the top. + global_block_bottom: int. number of blocks at the bottom. + global_block_left: int. Number of blocks globally used to the left. + global_block_right: int. Number of blocks globally used to the right. + + Returns: + adjacency list of size num_head where each element is of size from_seq_length//from_block_size-2 by + num_rand_blocks + """ + # using this method when from_seq_length not in [1024, 3072, 4096] + + if from_seq_length // from_block_size != to_seq_length // to_block_size: + raise ValueError("Error the number of blocks needs to be same!") + + if from_seq_length not in plan_from_length: + raise ValueError("Error from sequence length not in plan!") + + # Total number of blocks in the mmask + num_blocks = from_seq_length // from_block_size + # Number of blocks per plan + plan_block_length = np.array(plan_from_length) // from_block_size + # till when to follow plan + max_plan_idx = plan_from_length.index(from_seq_length) + + # Random Attention adjacency list + rand_attn = [ + np.zeros((num_blocks, np.sum(plan_num_rand_blocks[: max_plan_idx + 1])), dtype=np.int32) + for i in range(num_heads) + ] + # During inference (eval) no randomness + if not self.training: + for nh in range(num_heads): + rand_attn[nh] = rand_attn[nh][global_block_top : num_blocks - global_block_bottom, :] + return rand_attn + + # We will go iteratively over the plan blocks and pick random number of + # Attention blocks from the legally allowed blocks + for plan_idx in range(max_plan_idx + 1): + rnd_r_cnt = 0 + if plan_idx > 0: + # set the row for all from_blocks starting from 0 to + # plan_block_length[plan_idx-1] + # column indx start from plan_block_length[plan_idx-1] and ends at + # plan_block_length[plan_idx] + if plan_num_rand_blocks[plan_idx] > 0: + rnd_r_cnt = int(np.sum(plan_num_rand_blocks[:plan_idx])) + curr_r_cnt = int(np.sum(plan_num_rand_blocks[: plan_idx + 1])) + for blk_rw_idx in range(global_block_top, plan_block_length[plan_idx - 1]): + for h in range(num_heads): + rand_attn[h][blk_rw_idx, rnd_r_cnt:curr_r_cnt] = self._get_single_block_row_attention( + block_id=blk_rw_idx, + to_start_block_id=plan_block_length[plan_idx - 1], + to_end_block_id=plan_block_length[plan_idx], + num_rand_blocks=plan_num_rand_blocks[plan_idx], + window_block_left=window_block_left, + window_block_right=window_block_right, + global_block_left=global_block_left, + global_block_right=global_block_right, + ) + + for pl_id in range(plan_idx): + if plan_num_rand_blocks[pl_id] == 0: + continue + for blk_rw_idx in range(plan_block_length[plan_idx - 1], plan_block_length[plan_idx]): + rnd_r_cnt = 0 + to_start_block_id = 0 + if pl_id > 0: + rnd_r_cnt = int(np.sum(plan_num_rand_blocks[:pl_id])) + to_start_block_id = plan_block_length[pl_id - 1] + curr_r_cnt = int(np.sum(plan_num_rand_blocks[: pl_id + 1])) + for h in range(num_heads): + rand_attn[h][blk_rw_idx, rnd_r_cnt:curr_r_cnt] = 
self._get_single_block_row_attention( + block_id=blk_rw_idx, + to_start_block_id=to_start_block_id, + to_end_block_id=plan_block_length[pl_id], + num_rand_blocks=plan_num_rand_blocks[pl_id], + window_block_left=window_block_left, + window_block_right=window_block_right, + global_block_left=global_block_left, + global_block_right=global_block_right, + ) + + if plan_num_rand_blocks[plan_idx] == 0: + continue + curr_r_cnt = int(np.sum(plan_num_rand_blocks[: plan_idx + 1])) + from_start_block_id = global_block_top + to_start_block_id = 0 + if plan_idx > 0: + rnd_r_cnt = int(np.sum(plan_num_rand_blocks[:plan_idx])) + from_start_block_id = plan_block_length[plan_idx - 1] + to_start_block_id = plan_block_length[plan_idx - 1] + + for blk_rw_idx in range(from_start_block_id, plan_block_length[plan_idx]): + for h in range(num_heads): + rand_attn[h][blk_rw_idx, rnd_r_cnt:curr_r_cnt] = self._get_single_block_row_attention( + block_id=blk_rw_idx, + to_start_block_id=to_start_block_id, + to_end_block_id=plan_block_length[plan_idx], + num_rand_blocks=plan_num_rand_blocks[plan_idx], + window_block_left=window_block_left, + window_block_right=window_block_right, + global_block_left=global_block_left, + global_block_right=global_block_right, + ) + + for nh in range(num_heads): + rand_attn[nh] = rand_attn[nh][global_block_top : num_blocks - global_block_bottom, :] + + return rand_attn + + @staticmethod + def _get_single_block_row_attention( + block_id, + to_start_block_id, + to_end_block_id, + num_rand_blocks, + window_block_left=1, + window_block_right=1, + global_block_left=1, + global_block_right=1, + ): + """ + For a single row block get random row attention. + + Args: + block_id: int. block id of row. + to_start_block_id: int. random attention column start id. + to_end_block_id: int. random attention column end id. + num_rand_blocks: int. number of random blocks to be selected. + window_block_left: int. number of blocks of window to left of a block. + window_block_right: int. number of blocks of window to right of a block. + global_block_left: int. Number of blocks globally used to the left. + global_block_right: int. Number of blocks globally used to the right. + + Returns: + row containing the random attention vector of size num_rand_blocks. 
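+ Example (illustrative only): with the default settings
+ (window_block_left=window_block_right=global_block_left=global_block_right=1) and
+ block_id=8, to_start_block_id=0, to_end_block_id=16, the illegal blocks are
+ {0, 7, 8, 9, 15}, so the result is `num_rand_blocks` block ids drawn without
+ replacement from the remaining permuted blocks.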
+ """ + # list of to_blocks from which to choose random attention + to_block_list = np.arange(to_start_block_id, to_end_block_id, dtype=np.int32) + # permute the blocks + perm_block = np.random.permutation(to_block_list) + + # illegal blocks for the current block id, using window + illegal_blocks = list(range(block_id - window_block_left, block_id + window_block_right + 1)) + + # Add blocks at the start and at the end + illegal_blocks.extend(list(range(global_block_left))) + illegal_blocks.extend(list(range(to_end_block_id - global_block_right, to_end_block_id))) + + # The second from_block cannot choose random attention on second last to_block + if block_id == 1: + illegal_blocks.append(to_end_block_id - 2) + + # The second last from_block cannot choose random attention on second to_block + if block_id == to_end_block_id - 2: + illegal_blocks.append(1) + + selected_random_blocks = [] + + for i in range(to_end_block_id - to_start_block_id): + if perm_block[i] not in illegal_blocks: + selected_random_blocks.append(perm_block[i]) + if len(selected_random_blocks) == num_rand_blocks: + break + return np.array(selected_random_blocks, dtype=np.int32) + + +# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->BigBird +class BigBirdSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BigBirdAttention(nn.Module): + def __init__(self, config, seed=None): + super().__init__() + self.attention_type = config.attention_type + self.config = config + self.seed = seed + + if self.config.attention_type == "original_full": + self.self = BigBirdSelfAttention(config, layer_idx=seed) + elif self.config.attention_type == "block_sparse": + self.self = BigBirdBlockSparseAttention(config, seed) + else: + raise ValueError( + f"attention_type can either be original_full or block_sparse, but is {self.config.attention_type}" + ) + + self.output = BigBirdSelfOutput(config) + + def set_attention_type(self, value: str, layer_idx=None): + if value not in ["original_full", "block_sparse"]: + raise ValueError( + f"attention_type can only be set to either 'original_full' or 'block_sparse', but is {value}" + ) + # attention type is already correctly set + if value == self.attention_type: + return + + self.attention_type = value + if value == "original_full": + # copy all weights to new full attention class + attn_weights = BigBirdSelfAttention(self.config, layer_idx=layer_idx) + else: + # copy all weights to new sparse attention class + attn_weights = BigBirdBlockSparseAttention(self.config, self.seed) + + attn_weights.query = self.self.query + attn_weights.value = self.self.value + attn_weights.key = self.self.key + self.self = attn_weights + if not self.training: + self.self.eval() + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + output_attentions=False, + # block_sparse config + band_mask=None, + 
from_mask=None, + to_mask=None, + from_blocked_mask=None, + to_blocked_mask=None, + cache_position=None, + ): + # fp16 compatibility + if band_mask is not None: + band_mask = band_mask.to(hidden_states.dtype) + if from_mask is not None: + from_mask = from_mask.to(hidden_states.dtype) + if to_mask is not None: + to_mask = to_mask.to(hidden_states.dtype) + if self.attention_type == "original_full": + self_outputs = self.self( + hidden_states, + attention_mask=attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + output_attentions=output_attentions, + cache_position=cache_position, + ) + else: + if encoder_hidden_states is not None: + raise ValueError("BigBird cannot be used as a decoder when config.attention_type != 'original_full'") + self_outputs = self.self( + hidden_states, band_mask, from_mask, to_mask, from_blocked_mask, to_blocked_mask, output_attentions + ) + + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->BigBird +class BigBirdIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->BigBird +class BigBirdOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BigBirdLayer(GradientCheckpointingLayer): + def __init__(self, config, seed=None): + super().__init__() + self.config = config + self.attention_type = config.attention_type + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = BigBirdAttention(config, seed=seed) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise TypeError(f"{self} should be used as a decoder model if cross attention is added") + self.crossattention = BigBirdAttention(config, seed=seed) + self.intermediate = BigBirdIntermediate(config) + self.output = BigBirdOutput(config) + + def set_attention_type(self, value: str, layer_idx=None): + if value not in ["original_full", "block_sparse"]: + raise ValueError( + f"attention_type can only be set to either 'original_full' or 'block_sparse', but is {value}" + ) + # attention type is already correctly set + if value == self.attention_type: + return + self.attention_type = value + self.attention.set_attention_type(value, layer_idx=layer_idx) + + if 
self.add_cross_attention: + self.crossattention.set_attention_type(value, layer_idx=layer_idx) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + band_mask=None, + from_mask=None, + to_mask=None, + blocked_encoder_mask=None, + past_key_values=None, + output_attentions=False, + cache_position=None, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attention_outputs = self.attention( + hidden_states, + attention_mask=attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + output_attentions=output_attentions, + band_mask=band_mask, + from_mask=from_mask, + to_mask=to_mask, + from_blocked_mask=blocked_encoder_mask, + to_blocked_mask=blocked_encoder_mask, + cache_position=cache_position, + ) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with " + " cross-attention layers by setting `config.add_cross_attention=True`" + ) + + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask=encoder_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + past_key_values=past_key_values, + output_attentions=output_attentions, + cache_position=cache_position, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:] # add cross attentions if we output attention weights + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + + return (layer_output,) + outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class BigBirdEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.attention_type = config.attention_type + + self.layer = nn.ModuleList( + [BigBirdLayer(config, seed=layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.gradient_checkpointing = False + + def set_attention_type(self, value: str): + if value not in ["original_full", "block_sparse"]: + raise ValueError( + f"attention_type can only be set to either 'original_full' or 'block_sparse', but is {value}" + ) + # attention type is already correctly set + if value == self.attention_type: + return + self.attention_type = value + for i, layer in enumerate(self.layer): + layer.set_attention_type(value, layer_idx=i) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + band_mask=None, + from_mask=None, + to_mask=None, + blocked_encoder_mask=None, + return_dict=True, + cache_position=None, + ) -> Union[BaseModelOutputWithPastAndCrossAttentions, tuple]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if 
output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + if use_cache and past_key_values is None: + past_key_values = DynamicCache(config=self.config) + if use_cache and isinstance(past_key_values, tuple): + logger.warning_once( + "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. " + "You should pass an instance of `DynamicCache` instead, e.g. " + "`past_key_values=DynamicCache.from_legacy_cache(past_key_values)`." + ) + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + band_mask, + from_mask, + to_mask, + blocked_encoder_mask, + past_key_values, + output_attentions, + cache_position, + ) + + hidden_states = layer_outputs[0] + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + past_key_values, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=past_key_values, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +# Copied from transformers.models.bert.modeling_bert.BertPredictionHeadTransform with Bert->BigBird +class BigBirdPredictionHeadTransform(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertLMPredictionHead with Bert->BigBird +class BigBirdLMPredictionHead(nn.Module): + def __init__(self, config): + super().__init__() + self.transform = BigBirdPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. 
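+ # Note: the decoder weight itself is expected to be tied to the input word embeddings by the
+ # usual PreTrainedModel weight-tying step; keeping the bias as a separate Parameter (re-linked
+ # to decoder.bias below and in _tie_weights) lets it stay in sync under resize_token_embeddings.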
+ self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def _tie_weights(self): + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertOnlyMLMHead with Bert->BigBird +class BigBirdOnlyMLMHead(nn.Module): + def __init__(self, config): + super().__init__() + self.predictions = BigBirdLMPredictionHead(config) + + def forward(self, sequence_output: torch.Tensor) -> torch.Tensor: + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +# Copied from transformers.models.bert.modeling_bert.BertOnlyNSPHead with Bert->BigBird +class BigBirdOnlyNSPHead(nn.Module): + def __init__(self, config): + super().__init__() + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, pooled_output): + seq_relationship_score = self.seq_relationship(pooled_output) + return seq_relationship_score + + +# Copied from transformers.models.bert.modeling_bert.BertPreTrainingHeads with Bert->BigBird +class BigBirdPreTrainingHeads(nn.Module): + def __init__(self, config): + super().__init__() + self.predictions = BigBirdLMPredictionHead(config) + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, sequence_output, pooled_output): + prediction_scores = self.predictions(sequence_output) + seq_relationship_score = self.seq_relationship(pooled_output) + return prediction_scores, seq_relationship_score + + +@auto_docstring +class BigBirdPreTrainedModel(PreTrainedModel): + config: BigBirdConfig + load_tf_weights = load_tf_weights_in_big_bird + base_model_prefix = "bert" + supports_gradient_checkpointing = True + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, BigBirdLMPredictionHead): + module.bias.data.zero_() + + +@dataclass +@auto_docstring( + custom_intro=""" + Output type of [`BigBirdForPreTraining`]. + """ +) +class BigBirdForPreTrainingOutput(ModelOutput): + r""" + loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): + Total loss as the sum of the masked language modeling loss and the next sequence prediction + (classification) loss. + prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`): + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation + before SoftMax). 
+ """ + + loss: Optional[torch.FloatTensor] = None + prediction_logits: Optional[torch.FloatTensor] = None + seq_relationship_logits: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for outputs of question answering models. + """ +) +class BigBirdForQuestionAnsweringModelOutput(ModelOutput): + r""" + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. + pooler_output (`torch.FloatTensor` of shape `(batch_size, 1)`): + pooler output from BigBigModel + """ + + loss: Optional[torch.FloatTensor] = None + start_logits: Optional[torch.FloatTensor] = None + end_logits: Optional[torch.FloatTensor] = None + pooler_output: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + + +@auto_docstring +class BigBirdModel(BigBirdPreTrainedModel): + """ + + The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of + cross-attention is added between the self-attention layers, following the architecture described in [Attention is + all you need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, + Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. + + To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set + to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and + `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass. + """ + + def __init__(self, config, add_pooling_layer=True): + r""" + add_pooling_layer (bool, *optional*, defaults to `True`): + Whether to add a pooling layer + """ + super().__init__(config) + self.attention_type = self.config.attention_type + self.config = config + + self.block_size = self.config.block_size + + self.embeddings = BigBirdEmbeddings(config) + self.encoder = BigBirdEncoder(config) + + if add_pooling_layer: + self.pooler = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + else: + self.pooler = None + self.activation = None + + if self.attention_type != "original_full" and config.add_cross_attention: + logger.warning( + "When using `BigBirdForCausalLM` as decoder, then `attention_type` must be `original_full`. 
Setting" + " `attention_type=original_full`" + ) + self.set_attention_type("original_full") + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def set_attention_type(self, value: str): + if value not in ["original_full", "block_sparse"]: + raise ValueError( + f"attention_type can only be set to either 'original_full' or 'block_sparse', but is {value}" + ) + # attention type is already correctly set + if value == self.attention_type: + return + self.attention_type = value + self.encoder.set_attention_type(value) + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Cache] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.Tensor] = None, + **kwargs, # NOOP kwargs, for now + ) -> Union[BaseModelOutputWithPoolingAndCrossAttentions, tuple[torch.FloatTensor]]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + past_key_values_length = 0 + if past_key_values is not None: + past_key_values_length = ( + past_key_values[0][0].shape[-2] + if not isinstance(past_key_values, Cache) + else past_key_values.get_seq_length() + ) + + if attention_mask is None: + attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device) + if token_type_ids is None: + if hasattr(self.embeddings, "token_type_ids"): + buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + # in order to use block_sparse attention, sequence_length has to be at least + # bigger than all global attentions: 2 * block_size + # + sliding tokens: 3 * block_size + # + random tokens: 2 * num_random_blocks * block_size + 
max_tokens_to_attend = (5 + 2 * self.config.num_random_blocks) * self.config.block_size + if self.attention_type == "block_sparse" and seq_length <= max_tokens_to_attend: + # change attention_type from block_sparse to original_full + sequence_length = input_ids.size(1) if input_ids is not None else inputs_embeds.size(1) + logger.warning( + "Attention type 'block_sparse' is not possible if sequence_length: " + f"{sequence_length} <= num global tokens: 2 * config.block_size " + "+ min. num sliding tokens: 3 * config.block_size " + "+ config.num_random_blocks * config.block_size " + "+ additional buffer: config.num_random_blocks * config.block_size " + f"= {max_tokens_to_attend} with config.block_size " + f"= {self.config.block_size}, config.num_random_blocks " + f"= {self.config.num_random_blocks}. " + "Changing attention type to 'original_full'..." + ) + self.set_attention_type("original_full") + + if self.attention_type == "block_sparse": + ( + padding_len, + input_ids, + attention_mask, + token_type_ids, + position_ids, + inputs_embeds, + ) = self._pad_to_block_size( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + pad_token_id=self.config.pad_token_id, + ) + else: + padding_len = 0 + + if self.attention_type == "block_sparse": + blocked_encoder_mask, band_mask, from_mask, to_mask = self.create_masks_for_block_sparse_attn( + attention_mask, self.block_size + ) + extended_attention_mask = None + + elif self.attention_type == "original_full": + blocked_encoder_mask = None + band_mask = None + from_mask = None + to_mask = None + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
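+ # `get_extended_attention_mask` expands the mask to `[batch_size, 1, 1, to_seq_length]` (or + # `[batch_size, 1, from_seq_length, to_seq_length]` for a 3D mask) and converts masked (0) positions into + # large negative additive values so that the attention softmax effectively ignores them.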
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) + else: + raise ValueError( + f"attention_type can either be original_full or block_sparse, but is {self.attention_type}" + ) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + band_mask=band_mask, + from_mask=from_mask, + to_mask=to_mask, + blocked_encoder_mask=blocked_encoder_mask, + return_dict=return_dict, + cache_position=cache_position, + ) + sequence_output = encoder_outputs[0] + + pooler_output = self.activation(self.pooler(sequence_output[:, 0, :])) if (self.pooler is not None) else None + + # undo padding + if padding_len > 0: + # unpad `sequence_output` because the calling function is expecting a length == input_ids.size(1) + sequence_output = sequence_output[:, :-padding_len] + + if not return_dict: + return (sequence_output, pooler_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooler_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + @staticmethod + def create_masks_for_block_sparse_attn(attention_mask: torch.Tensor, block_size: int): + batch_size, seq_length = attention_mask.size() + if seq_length % block_size != 0: + raise ValueError( + f"Sequence length must be multiple of block size, but sequence length is {seq_length}, while block" + f" size is {block_size}." + ) + + def create_band_mask_from_inputs(from_blocked_mask, to_blocked_mask): + """ + Create 3D attention mask from a 2D tensor mask. + + Args: + from_blocked_mask: 2D Tensor of shape [batch_size, + from_seq_length//from_block_size, from_block_size]. + to_blocked_mask: int32 Tensor of shape [batch_size, + to_seq_length//to_block_size, to_block_size]. 
+ + Returns: + float Tensor of shape [batch_size, 1, from_seq_length//from_block_size-4, from_block_size, + 3*to_block_size]. + """ + exp_blocked_to_pad = torch.cat( + [to_blocked_mask[:, 1:-3], to_blocked_mask[:, 2:-2], to_blocked_mask[:, 3:-1]], dim=2 + ) + band_mask = torch.einsum("blq,blk->blqk", from_blocked_mask[:, 2:-2], exp_blocked_to_pad) + band_mask.unsqueeze_(1) + return band_mask + + blocked_encoder_mask = attention_mask.view(batch_size, seq_length // block_size, block_size) + band_mask = create_band_mask_from_inputs(blocked_encoder_mask, blocked_encoder_mask) + + from_mask = attention_mask.view(batch_size, 1, seq_length, 1) + to_mask = attention_mask.view(batch_size, 1, 1, seq_length) + + return blocked_encoder_mask, band_mask, from_mask, to_mask + + def _pad_to_block_size( + self, + input_ids: torch.Tensor, + attention_mask: torch.Tensor, + token_type_ids: torch.Tensor, + position_ids: torch.Tensor, + inputs_embeds: torch.Tensor, + pad_token_id: int, + ): + """A helper function to pad tokens and mask to work with implementation of BigBird block-sparse attention.""" + # padding + block_size = self.config.block_size + + input_shape = input_ids.shape if input_ids is not None else inputs_embeds.shape + batch_size, seq_len = input_shape[:2] + + padding_len = (block_size - seq_len % block_size) % block_size + if padding_len > 0: + logger.warning_once( + f"Input ids are automatically padded from {seq_len} to {seq_len + padding_len} to be a multiple of " + f"`config.block_size`: {block_size}" + ) + if input_ids is not None: + input_ids = nn.functional.pad(input_ids, (0, padding_len), value=pad_token_id) + if position_ids is not None: + # pad with position_id = pad_token_id as in modeling_bigbird.BigBirdEmbeddings + position_ids = nn.functional.pad(position_ids, (0, padding_len), value=pad_token_id) + if inputs_embeds is not None: + input_ids_padding = inputs_embeds.new_full( + (batch_size, padding_len), + self.config.pad_token_id, + dtype=torch.long, + ) + inputs_embeds_padding = self.embeddings(input_ids_padding) + inputs_embeds = torch.cat([inputs_embeds, inputs_embeds_padding], dim=-2) + + attention_mask = nn.functional.pad( + attention_mask, (0, padding_len), value=False + ) # no attention on the padding tokens + token_type_ids = nn.functional.pad(token_type_ids, (0, padding_len), value=0) # pad with token_type_id = 0 + + return padding_len, input_ids, attention_mask, token_type_ids, position_ids, inputs_embeds + + +class BigBirdForPreTraining(BigBirdPreTrainedModel): + _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"] + + def __init__(self, config): + super().__init__(config) + + self.bert = BigBirdModel(config, add_pooling_layer=True) + self.cls = BigBirdPreTrainingHeads(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = new_embeddings.bias + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.FloatTensor] = None, + next_sentence_label: Optional[torch.LongTensor] = None, + output_attentions: 
Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[BigBirdForPreTrainingOutput, tuple[torch.FloatTensor]]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + next_sentence_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the next sequence prediction (classification) loss. If specified, nsp loss will be + added to masked_lm loss. Input should be a sequence pair (see `input_ids` docstring) Indices should be in + `[0, 1]`: + + - 0 indicates sequence B is a continuation of sequence A, + - 1 indicates sequence B is a random sequence. + + Example: + + ```python + >>> from transformers import AutoTokenizer, BigBirdForPreTraining + >>> import torch + + >>> tokenizer = AutoTokenizer.from_pretrained("google/bigbird-roberta-base") + >>> model = BigBirdForPreTraining.from_pretrained("google/bigbird-roberta-base") + + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> prediction_logits = outputs.prediction_logits + >>> seq_relationship_logits = outputs.seq_relationship_logits + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output, pooled_output = outputs[:2] + prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) + + total_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + total_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + + if next_sentence_label is not None and total_loss is not None: + next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) + total_loss = total_loss + next_sentence_loss + + if not return_dict: + output = (prediction_scores, seq_relationship_score) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return BigBirdForPreTrainingOutput( + loss=total_loss, + prediction_logits=prediction_scores, + seq_relationship_logits=seq_relationship_score, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@auto_docstring +class BigBirdForMaskedLM(BigBirdPreTrainedModel): + _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"] + + def __init__(self, config): + super().__init__(config) + + if config.is_decoder: + logger.warning( + "If you want to use `BigBirdForMaskedLM` make sure `config.is_decoder=False` for " + "bi-directional self-attention." 
+ ) + + self.bert = BigBirdModel(config) + self.cls = BigBirdOnlyMLMHead(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = new_embeddings.bias + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[MaskedLMOutput, tuple[torch.FloatTensor]]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Example: + + ```python + >>> import torch + >>> from transformers import AutoTokenizer, BigBirdForMaskedLM + >>> from datasets import load_dataset + + >>> tokenizer = AutoTokenizer.from_pretrained("google/bigbird-roberta-base") + >>> model = BigBirdForMaskedLM.from_pretrained("google/bigbird-roberta-base") + >>> squad_ds = load_dataset("rajpurkar/squad_v2", split="train") # doctest: +IGNORE_RESULT + + >>> # select random long article + >>> LONG_ARTICLE_TARGET = squad_ds[81514]["context"] + >>> # select random sentence + >>> LONG_ARTICLE_TARGET[332:398] + 'the highest values are very close to the theoretical maximum value' + + >>> # add mask_token + >>> LONG_ARTICLE_TO_MASK = LONG_ARTICLE_TARGET.replace("maximum", "[MASK]") + >>> inputs = tokenizer(LONG_ARTICLE_TO_MASK, return_tensors="pt") + >>> # long article input + >>> list(inputs["input_ids"].shape) + [1, 919] + + >>> with torch.no_grad(): + ... 
logits = model(**inputs).logits + >>> # retrieve index of [MASK] + >>> mask_token_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0] + >>> predicted_token_id = logits[0, mask_token_index].argmax(axis=-1) + >>> tokenizer.decode(predicted_token_id) + 'maximum' + ``` + + ```python + >>> labels = tokenizer(LONG_ARTICLE_TARGET, return_tensors="pt")["input_ids"] + >>> labels = torch.where(inputs.input_ids == tokenizer.mask_token_id, labels, -100) + >>> outputs = model(**inputs, labels=labels) + >>> round(outputs.loss.item(), 2) + 1.99 + ``` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs): + input_shape = input_ids.shape + effective_batch_size = input_shape[0] + + # add a dummy token + if self.config.pad_token_id is None: + raise ValueError("The PAD token should be defined for generation") + attention_mask = torch.cat([attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1) + dummy_token = torch.full( + (effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device + ) + input_ids = torch.cat([input_ids, dummy_token], dim=1) + + return {"input_ids": input_ids, "attention_mask": attention_mask} + + +@auto_docstring( + custom_intro=""" + BigBird Model with a `language modeling` head on top for CLM fine-tuning. 
+ """ +) +class BigBirdForCausalLM(BigBirdPreTrainedModel, GenerationMixin): + _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"] + + def __init__(self, config): + super().__init__(config) + + if not config.is_decoder: + logger.warning("If you want to use `BigBirdForCausalLM` as a standalone, add `is_decoder=True.`") + + self.bert = BigBirdModel(config) + self.cls = BigBirdOnlyMLMHead(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = new_embeddings.bias + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Cache] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.Tensor] = None, + **kwargs, + ) -> Union[CausalLMOutputWithCrossAttentions, tuple[torch.FloatTensor]]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in + `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are + ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]`. 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + **kwargs, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + lm_loss = None + if labels is not None: + lm_loss = self.loss_function( + prediction_scores, + labels, + vocab_size=self.config.vocab_size, + **kwargs, + ) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((lm_loss,) + output) if lm_loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=lm_loss, + logits=prediction_scores, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + +class BigBirdClassificationHead(nn.Module): + """Head for sentence-level classification tasks.""" + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.out_proj = nn.Linear(config.hidden_size, config.num_labels) + + self.config = config + + def forward(self, features, **kwargs): + x = features[:, 0, :] # take token (equiv. to [CLS]) + x = self.dropout(x) + x = self.dense(x) + x = ACT2FN[self.config.hidden_act](x) + x = self.dropout(x) + x = self.out_proj(x) + return x + + +@auto_docstring( + custom_intro=""" + BigBird Model transformer with a sequence classification/regression head on top (a linear layer on top of the + pooled output) e.g. for GLUE tasks. + """ +) +class BigBirdForSequenceClassification(BigBirdPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + self.bert = BigBirdModel(config) + self.classifier = BigBirdClassificationHead(config) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[SequenceClassifierOutput, tuple[torch.FloatTensor]]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ + Example: + + ```python + >>> import torch + >>> from transformers import AutoTokenizer, BigBirdForSequenceClassification + >>> from datasets import load_dataset + + >>> tokenizer = AutoTokenizer.from_pretrained("l-yohai/bigbird-roberta-base-mnli") + >>> model = BigBirdForSequenceClassification.from_pretrained("l-yohai/bigbird-roberta-base-mnli") + >>> squad_ds = load_dataset("rajpurkar/squad_v2", split="train") # doctest: +IGNORE_RESULT + + >>> LONG_ARTICLE = squad_ds[81514]["context"] + >>> inputs = tokenizer(LONG_ARTICLE, return_tensors="pt") + >>> # long input article + >>> list(inputs["input_ids"].shape) + [1, 919] + + >>> with torch.no_grad(): + ... logits = model(**inputs).logits + >>> predicted_class_id = logits.argmax().item() + >>> model.config.id2label[predicted_class_id] + 'LABEL_0' + ``` + + ```python + >>> num_labels = len(model.config.id2label) + >>> model = BigBirdForSequenceClassification.from_pretrained( + ... "l-yohai/bigbird-roberta-base-mnli", num_labels=num_labels + ... ) + >>> labels = torch.tensor(1) + >>> loss = model(**inputs, labels=labels).loss + >>> round(loss.item(), 2) + 1.13 + ``` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@auto_docstring +class BigBirdForMultipleChoice(BigBirdPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.bert = BigBirdModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, 1) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: 
Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[MultipleChoiceModelOutput, tuple[torch.FloatTensor]]: + r""" + input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert *input_ids* indices into associated vectors than the + model's internal embedding lookup matrix. + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., + num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See + `input_ids` above) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] + + input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None + attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None + inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) + if inputs_embeds is not None + else None + ) + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.view(-1, num_choices) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + + if not return_dict: + output = (reshaped_logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@auto_docstring +class BigBirdForTokenClassification(BigBirdPreTrainedModel): + def __init__(self, config): + super().__init__(config) + 
self.num_labels = config.num_labels + + self.bert = BigBirdModel(config) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[TokenClassifierOutput, tuple[torch.FloatTensor]]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class BigBirdForQuestionAnsweringHead(nn.Module): + """Head for question answering tasks.""" + + def __init__(self, config): + super().__init__() + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.intermediate = BigBirdIntermediate(config) + self.output = BigBirdOutput(config) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + def forward(self, encoder_output): + hidden_states = self.dropout(encoder_output) + hidden_states = self.intermediate(hidden_states) + hidden_states = self.output(hidden_states, encoder_output) + hidden_states = self.qa_outputs(hidden_states) + return hidden_states + + +@auto_docstring +class BigBirdForQuestionAnswering(BigBirdPreTrainedModel): + def __init__(self, config, add_pooling_layer=False): + r""" + add_pooling_layer (bool, *optional*, defaults to `True`): + Whether to add a pooling layer + """ + super().__init__(config) + + config.num_labels = 2 + self.num_labels = config.num_labels + self.sep_token_id = config.sep_token_id + + self.bert = BigBirdModel(config, add_pooling_layer=add_pooling_layer) + self.qa_classifier = BigBirdForQuestionAnsweringHead(config) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + question_lengths: Optional[torch.LongTensor] = None, + token_type_ids: 
Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + start_positions: Optional[torch.LongTensor] = None, + end_positions: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[BigBirdForQuestionAnsweringModelOutput, tuple[torch.FloatTensor]]: + r""" + question_lengths (`torch.LongTensor` of shape `(batch_size, 1)`, *optional*): + The lengths of the questions in the batch. + + Example: + + ```python + >>> import torch + >>> from transformers import AutoTokenizer, BigBirdForQuestionAnswering + >>> from datasets import load_dataset + + >>> tokenizer = AutoTokenizer.from_pretrained("google/bigbird-roberta-base") + >>> model = BigBirdForQuestionAnswering.from_pretrained("google/bigbird-roberta-base") + >>> squad_ds = load_dataset("rajpurkar/squad_v2", split="train") # doctest: +IGNORE_RESULT + + >>> # select random article and question + >>> LONG_ARTICLE = squad_ds[81514]["context"] + >>> QUESTION = squad_ds[81514]["question"] + >>> QUESTION + 'During daytime how high can the temperatures reach?' + + >>> inputs = tokenizer(QUESTION, LONG_ARTICLE, return_tensors="pt") + >>> # long article and question input + >>> list(inputs["input_ids"].shape) + [1, 929] + + >>> with torch.no_grad(): + ... outputs = model(**inputs) + + >>> answer_start_index = outputs.start_logits.argmax() + >>> answer_end_index = outputs.end_logits.argmax() + >>> predict_answer_token_ids = inputs.input_ids[0, answer_start_index : answer_end_index + 1] + >>> predict_answer_token = tokenizer.decode(predict_answer_token_ids) + ``` + + ```python + >>> target_start_index, target_end_index = torch.tensor([130]), torch.tensor([132]) + >>> outputs = model(**inputs, start_positions=target_start_index, end_positions=target_end_index) + >>> loss = outputs.loss + ``` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + seqlen = input_ids.size(1) if input_ids is not None else inputs_embeds.size(1) + + if question_lengths is None and input_ids is not None: + # assuming input_ids format: context + question_lengths = torch.argmax(input_ids.eq(self.sep_token_id).int(), dim=-1) + 1 + question_lengths.unsqueeze_(1) + + logits_mask = None + if question_lengths is not None: + # setting lengths logits to `-inf` + logits_mask = self.prepare_question_mask(question_lengths, seqlen) + if token_type_ids is None: + token_type_ids = torch.ones(logits_mask.size(), dtype=int, device=logits_mask.device) - logits_mask + logits_mask[:, 0] = False + logits_mask.unsqueeze_(2) + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + logits = self.qa_classifier(sequence_output) + + if logits_mask is not None: + # removing question tokens from the competition + logits = logits - logits_mask * 1e6 + + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + 
if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return BigBirdForQuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + pooler_output=outputs.pooler_output, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + @staticmethod + def prepare_question_mask(q_lengths: torch.Tensor, maxlen: int): + # q_lengths -> (bz, 1) + mask = torch.arange(0, maxlen).to(q_lengths.device) + mask.unsqueeze_(0) # -> (1, maxlen) + mask = torch.where(mask < q_lengths, 1, 0) + return mask + + +__all__ = [ + "BigBirdForCausalLM", + "BigBirdForMaskedLM", + "BigBirdForMultipleChoice", + "BigBirdForPreTraining", + "BigBirdForQuestionAnswering", + "BigBirdForSequenceClassification", + "BigBirdForTokenClassification", + "BigBirdLayer", + "BigBirdModel", + "BigBirdPreTrainedModel", + "load_tf_weights_in_big_bird", +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/big_bird/modeling_flax_big_bird.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/big_bird/modeling_flax_big_bird.py new file mode 100644 index 0000000000000000000000000000000000000000..11dcb30f3d47e3ecf6d90d7a7305682d687e5828 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/big_bird/modeling_flax_big_bird.py @@ -0,0 +1,2648 @@ +# coding=utf-8 +# Copyright 2021 The Google Flax Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Callable, Optional + +import flax +import flax.linen as nn +import jax +import jax.numpy as jnp +from flax.core.frozen_dict import FrozenDict, freeze, unfreeze +from flax.linen import combine_masks, make_causal_mask +from flax.linen import partitioning as nn_partitioning +from flax.linen.attention import dot_product_attention_weights +from flax.traverse_util import flatten_dict, unflatten_dict +from jax import lax + +from ...modeling_flax_outputs import ( + FlaxBaseModelOutputWithPastAndCrossAttentions, + FlaxBaseModelOutputWithPooling, + FlaxBaseModelOutputWithPoolingAndCrossAttentions, + FlaxCausalLMOutputWithCrossAttentions, + FlaxMaskedLMOutput, + FlaxMultipleChoiceModelOutput, + FlaxSequenceClassifierOutput, + FlaxTokenClassifierOutput, +) +from ...modeling_flax_utils import ( + ACT2FN, + FlaxPreTrainedModel, + append_call_sample_docstring, + append_replace_return_docstrings, + overwrite_call_docstring, +) +from ...utils import ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward, logging +from .configuration_big_bird import BigBirdConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "google/bigbird-roberta-base" +_CONFIG_FOR_DOC = "BigBirdConfig" + +remat = nn_partitioning.remat + + +@flax.struct.dataclass +class FlaxBigBirdForPreTrainingOutput(ModelOutput): + """ + Output type of [`BigBirdForPreTraining`]. + + Args: + prediction_logits (`jnp.ndarray` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + seq_relationship_logits (`jnp.ndarray` of shape `(batch_size, 2)`): + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation + before SoftMax). + hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + prediction_logits: jnp.ndarray = None + seq_relationship_logits: jnp.ndarray = None + hidden_states: Optional[tuple[jnp.ndarray]] = None + attentions: Optional[tuple[jnp.ndarray]] = None + + +@flax.struct.dataclass +class FlaxBigBirdForQuestionAnsweringModelOutput(ModelOutput): + """ + Base class for outputs of question answering models. + + Args: + start_logits (`jnp.ndarray` of shape `(batch_size, sequence_length)`): + Span-start scores (before SoftMax). + end_logits (`jnp.ndarray` of shape `(batch_size, sequence_length)`): + Span-end scores (before SoftMax). + pooled_output (`jnp.ndarray` of shape `(batch_size, hidden_size)`): + pooled_output returned by FlaxBigBirdModel. 
+ hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + start_logits: jnp.ndarray = None + end_logits: jnp.ndarray = None + pooled_output: jnp.ndarray = None + hidden_states: Optional[tuple[jnp.ndarray]] = None + attentions: Optional[tuple[jnp.ndarray]] = None + + +BIG_BIRD_START_DOCSTRING = r""" + + This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading, saving and converting weights from PyTorch models) + + This model is also a + [flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html) subclass. Use it as + a regular Flax linen Module and refer to the Flax documentation for all matter related to general usage and + behavior. + + Finally, this model supports inherent JAX features such as: + + - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit) + - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation) + - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap) + - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap) + + Parameters: + config ([`BigBirdConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights. + dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`): + The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and + `jax.numpy.bfloat16` (on TPUs). + + This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If + specified all the computation will be performed with the given `dtype`. + + **Note that this only specifies the dtype of the computation and does not influence the dtype of model + parameters.** + + If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and + [`~FlaxPreTrainedModel.to_bf16`]. +""" + +BIG_BIRD_INPUTS_DOCSTRING = r""" + Args: + input_ids (`numpy.ndarray` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`numpy.ndarray` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. 
+ + [What are attention masks?](../glossary#attention-mask) + token_type_ids (`numpy.ndarray` of shape `({0})`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`numpy.ndarray` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + head_mask (`numpy.ndarray` of shape `({0})`, *optional*): + Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + +""" + + +class FlaxBigBirdEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + config: BigBirdConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + # Copied from transformers.models.bert.modeling_flax_bert.FlaxBertEmbeddings.setup + def setup(self): + self.word_embeddings = nn.Embed( + self.config.vocab_size, + self.config.hidden_size, + embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range), + dtype=self.dtype, + ) + self.position_embeddings = nn.Embed( + self.config.max_position_embeddings, + self.config.hidden_size, + embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range), + dtype=self.dtype, + ) + self.token_type_embeddings = nn.Embed( + self.config.type_vocab_size, + self.config.hidden_size, + embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range), + dtype=self.dtype, + ) + self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype) + self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob) + + def __call__(self, input_ids, token_type_ids, position_ids, attention_mask, deterministic: bool = True): + # Embed + inputs_embeds = self.word_embeddings(input_ids.astype("i4")) + position_embeds = self.position_embeddings(position_ids.astype("i4")) + token_type_embeddings = self.token_type_embeddings(token_type_ids.astype("i4")) + + if self.config.rescale_embeddings: + inputs_embeds *= self.config.hidden_size**0.5 + + # Sum all embeddings + hidden_states = inputs_embeds + token_type_embeddings + position_embeds + + # Dropout, then Layer Norm + hidden_states = self.dropout(hidden_states, deterministic=deterministic) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertSelfAttention with Bert->BigBird +class FlaxBigBirdSelfAttention(nn.Module): + config: BigBirdConfig + causal: bool = False + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.head_dim = self.config.hidden_size // self.config.num_attention_heads + if self.config.hidden_size % self.config.num_attention_heads != 0: + raise ValueError( + f"`config.hidden_size`: {self.config.hidden_size} has to be a multiple of `config.num_attention_heads` " + f": {self.config.num_attention_heads}" + ) + + self.query = nn.Dense( + self.config.hidden_size, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + ) + self.key = nn.Dense( + self.config.hidden_size, + dtype=self.dtype, + 
kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + ) + self.value = nn.Dense( + self.config.hidden_size, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + ) + + if self.causal: + self.causal_mask = make_causal_mask( + jnp.ones((1, self.config.max_position_embeddings), dtype="bool"), dtype="bool" + ) + + def _split_heads(self, hidden_states): + return hidden_states.reshape(hidden_states.shape[:2] + (self.config.num_attention_heads, self.head_dim)) + + def _merge_heads(self, hidden_states): + return hidden_states.reshape(hidden_states.shape[:2] + (self.config.hidden_size,)) + + @nn.compact + # Copied from transformers.models.bart.modeling_flax_bart.FlaxBartAttention._concatenate_to_cache + def _concatenate_to_cache(self, key, value, query, attention_mask): + """ + This function takes projected key, value states from a single input token and concatenates the states to cached + states from previous steps. This function is slightly adapted from the official Flax repository: + https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252 + """ + # detect if we're initializing by absence of existing cache data. + is_initialized = self.has_variable("cache", "cached_key") + cached_key = self.variable("cache", "cached_key", jnp.zeros, key.shape, key.dtype) + cached_value = self.variable("cache", "cached_value", jnp.zeros, value.shape, value.dtype) + cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32)) + + if is_initialized: + *batch_dims, max_length, num_heads, depth_per_head = cached_key.value.shape + # update key, value caches with our new 1d spatial slices + cur_index = cache_index.value + indices = (0,) * len(batch_dims) + (cur_index, 0, 0) + key = lax.dynamic_update_slice(cached_key.value, key, indices) + value = lax.dynamic_update_slice(cached_value.value, value, indices) + cached_key.value = key + cached_value.value = value + num_updated_cache_vectors = query.shape[1] + cache_index.value = cache_index.value + num_updated_cache_vectors + # causal mask for cached decoder self-attention: our single query position should only attend to those key positions that have already been generated and cached, not the remaining zero elements. 
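+ # For example, if 10 key/value vectors are already cached and one new token is fed in, cache slots 0..10 + # become attendable while the remaining (still empty) slots stay masked out.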
+ pad_mask = jnp.broadcast_to( + jnp.arange(max_length) < cur_index + num_updated_cache_vectors, + tuple(batch_dims) + (1, num_updated_cache_vectors, max_length), + ) + attention_mask = combine_masks(pad_mask, attention_mask) + return key, value, attention_mask + + def __call__( + self, + hidden_states, + attention_mask, + layer_head_mask, + key_value_states: Optional[jnp.ndarray] = None, + init_cache: bool = False, + deterministic=True, + output_attentions: bool = False, + ): + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + batch_size = hidden_states.shape[0] + + # get query proj + query_states = self.query(hidden_states) + # get key, value proj + if is_cross_attention: + # cross_attentions + key_states = self.key(key_value_states) + value_states = self.value(key_value_states) + else: + # self_attention + key_states = self.key(hidden_states) + value_states = self.value(hidden_states) + + query_states = self._split_heads(query_states) + key_states = self._split_heads(key_states) + value_states = self._split_heads(value_states) + + # handle cache prepare causal attention mask + if self.causal: + query_length, key_length = query_states.shape[1], key_states.shape[1] + if self.has_variable("cache", "cached_key"): + mask_shift = self.variables["cache"]["cache_index"] + max_decoder_length = self.variables["cache"]["cached_key"].shape[1] + causal_mask = lax.dynamic_slice( + self.causal_mask, (0, 0, mask_shift, 0), (1, 1, query_length, max_decoder_length) + ) + else: + causal_mask = self.causal_mask[:, :, :query_length, :key_length] + causal_mask = jnp.broadcast_to(causal_mask, (batch_size,) + causal_mask.shape[1:]) + + # combine masks if needed + if attention_mask is not None and self.causal: + attention_mask = jnp.broadcast_to(jnp.expand_dims(attention_mask, axis=(-3, -2)), causal_mask.shape) + attention_mask = combine_masks(attention_mask, causal_mask) + elif self.causal: + attention_mask = causal_mask + elif attention_mask is not None: + attention_mask = jnp.expand_dims(attention_mask, axis=(-3, -2)) + + # During fast autoregressive decoding, we feed one position at a time, + # and cache the keys and values step by step. + if self.causal and (self.has_variable("cache", "cached_key") or init_cache): + key_states, value_states, attention_mask = self._concatenate_to_cache( + key_states, value_states, query_states, attention_mask + ) + + # Convert the boolean attention mask to an attention bias. 
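+ # NOTE (illustrative comment, not part of the original source): positions with mask > 0 get a
+ # bias of 0.0 (kept), while masked positions get jnp.finfo(self.dtype).min (a very large
+ # negative value), so their contribution after the softmax inside
+ # dot_product_attention_weights is effectively zero.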
+ if attention_mask is not None: + # attention mask in the form of attention bias + attention_bias = lax.select( + attention_mask > 0, + jnp.full(attention_mask.shape, 0.0).astype(self.dtype), + jnp.full(attention_mask.shape, jnp.finfo(self.dtype).min).astype(self.dtype), + ) + else: + attention_bias = None + + dropout_rng = None + if not deterministic and self.config.attention_probs_dropout_prob > 0.0: + dropout_rng = self.make_rng("dropout") + + attn_weights = dot_product_attention_weights( + query_states, + key_states, + bias=attention_bias, + dropout_rng=dropout_rng, + dropout_rate=self.config.attention_probs_dropout_prob, + broadcast_dropout=True, + deterministic=deterministic, + dtype=self.dtype, + precision=None, + ) + + # Mask heads if we want to + if layer_head_mask is not None: + attn_weights = jnp.einsum("...hqk,h->...hqk", attn_weights, layer_head_mask) + + attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value_states) + attn_output = attn_output.reshape(attn_output.shape[:2] + (-1,)) + + outputs = (attn_output, attn_weights) if output_attentions else (attn_output,) + return outputs + + +class FlaxBigBirdBlockSparseAttention(nn.Module): + config: BigBirdConfig + block_sparse_seed: Optional[int] = None + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.query = nn.Dense( + self.config.hidden_size, + dtype=self.dtype, + use_bias=self.config.use_bias, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + ) + self.key = nn.Dense( + self.config.hidden_size, + dtype=self.dtype, + use_bias=self.config.use_bias, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + ) + self.value = nn.Dense( + self.config.hidden_size, + dtype=self.dtype, + use_bias=self.config.use_bias, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + ) + + @staticmethod + def transpose_for_scores(x, n_heads, head_size): + new_x_shape = x.shape[:-1] + (n_heads, head_size) + x = x.reshape(*new_x_shape) + return jnp.transpose(x, axes=(0, 2, 1, 3)) + + def __call__( + self, + hidden_states, + attention_mask, + deterministic=True, + output_attentions=False, + ): + n_heads = self.config.num_attention_heads + head_size = self.config.hidden_size // n_heads + + blocked_encoder_mask, band_mask, from_mask, to_mask = self.create_masks_for_block_sparse_attn( + attention_mask, self.config.block_size + ) + + query_layer = self.transpose_for_scores(self.query(hidden_states), n_heads, head_size) + key_layer = self.transpose_for_scores(self.key(hidden_states), n_heads, head_size) + value_layer = self.transpose_for_scores(self.value(hidden_states), n_heads, head_size) + + indices_prng_key = None + if not deterministic: + indices_prng_key = self.make_rng("indices") + + attn_output, attn_weights = self.bigbird_block_sparse_attention( + query_layer, + key_layer, + value_layer, + band_mask, + from_mask, + to_mask, + blocked_encoder_mask, + blocked_encoder_mask, + n_heads, + head_size, + indices_prng_key=indices_prng_key, + deterministic=deterministic, + plan_from_length=None, + plan_num_rand_blocks=None, + output_attentions=output_attentions, + ) + + outputs = (attn_output, attn_weights) if output_attentions else (attn_output,) + return outputs + + @staticmethod + def create_masks_for_block_sparse_attn(attention_mask, block_size: int): + batch_size, seq_length = attention_mask.shape + if seq_length % block_size != 0: + raise ValueError( + f"Sequence length must be multiple of block size, but sequence length is {seq_length}, while block" + f" size is 
{block_size}." + ) + + def create_band_mask_from_inputs(from_blocked_mask, to_blocked_mask): + """ + Create 3D attention mask from a 2D tensor mask. + + Args: + from_blocked_mask: 2D Tensor of shape [batch_size, + from_seq_length//from_block_size, from_block_size]. + to_blocked_mask: int32 Tensor of shape [batch_size, + to_seq_length//to_block_size, to_block_size]. + + Returns: + float Tensor of shape [batch_size, 1, from_seq_length//from_block_size-4, from_block_size, + 3*to_block_size]. + """ + exp_blocked_to_pad = jnp.concatenate( + [to_blocked_mask[:, 1:-3], to_blocked_mask[:, 2:-2], to_blocked_mask[:, 3:-1]], axis=2 + ) + band_mask = jnp.einsum("blq,blk->blqk", from_blocked_mask[:, 2:-2], exp_blocked_to_pad) + band_mask = jnp.expand_dims(band_mask, 1) + return band_mask + + blocked_encoder_mask = attention_mask.reshape(batch_size, seq_length // block_size, block_size) + band_mask = create_band_mask_from_inputs(blocked_encoder_mask, blocked_encoder_mask) + + from_mask = attention_mask.reshape(batch_size, 1, seq_length, 1) + to_mask = attention_mask.reshape(batch_size, 1, 1, seq_length) + + return blocked_encoder_mask, band_mask, from_mask, to_mask + + def bigbird_block_sparse_attention( + self, + query_layer, + key_layer, + value_layer, + band_mask, + from_mask, + to_mask, + from_blocked_mask, + to_blocked_mask, + n_heads, + head_size, + indices_prng_key: Optional[jax.random.PRNGKey] = None, + deterministic: Optional[bool] = True, + plan_from_length=None, + plan_num_rand_blocks=None, + output_attentions=None, + ): + # BigBird block-sparse attention as suggested in paper + + # ITC: + # global tokens: 2 x block_size + # window tokens: 3 x block_size + # random tokens: num_rand_tokens x block_size + + # ETC: + # global tokens: extra_globals_tokens + 2 x block_size + # window tokens: 3 x block_size + # random tokens: num_rand_tokens x block_size + + # Note: + # 1) Currently, ETC is not supported. + # 2) Window size is fixed to 3 blocks & it can be changed only by + # changing `block_size`. + # 3) Number of global blocks are fixed (2 blocks here) & global tokens can be + # controlled only by `block_size`. + + # attention is calculated separately for q[0], q[1], q[2:-2], q[-2], q[-1] in order to use special trick of + # shifting tokens (for calculating sliding attention). hence following code can be divided into 5 parts. 
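+ # NOTE (illustrative comment, not part of the original source): example shapes, assuming
+ # bsz=1, n_heads=12, block_size=64, seq_len=1024 (16 blocks) and num_random_blocks=3:
+ #   part 1: q[0]    attends to all 1024 keys            -> scores [1, 12, 64, 1024]
+ #   part 2: q[1]    attends to (4 + 3) * 64 = 448 keys  -> scores [1, 12, 64, 448]
+ #   part 3: q[2:-2] (12 blocks) attends to (5 + 3) * 64 = 512 keys per query block
+ #   part 4: q[-2]   attends to (4 + 3) * 64 = 448 keys  -> scores [1, 12, 64, 448]
+ #   part 5: q[-1]   attends to all 1024 keys            -> scores [1, 12, 64, 1024]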
+ + bsz, _, from_seq_len, _ = query_layer.shape + to_seq_len = key_layer.shape[2] + from_block_size = to_block_size = self.config.block_size + + if from_seq_len % from_block_size != 0: + raise ValueError("Query sided sequence length must be multiple of block size") + + if to_seq_len % to_block_size != 0: + raise ValueError("Key/Value sided sequence length must be multiple of block size") + + if from_seq_len // from_block_size != to_seq_len // to_block_size: + raise ValueError("Error the number of blocks needs to be same!") + + n_rand_blocks = self.config.num_random_blocks + rsqrt_d = 1 / jnp.sqrt(head_size) + attn_mask_penalty = -10000.0 + + if from_seq_len in [1024, 3072, 4096]: # old plans used in paper + max_seqlen = self.config.max_position_embeddings + rand_attn = [ + self._bigbird_block_rand_mask( + max_seqlen, + max_seqlen, + from_block_size, + to_block_size, + n_rand_blocks, + indices_prng_key=indices_prng_key, + deterministic=deterministic, + last_idx=1024, + )[: (from_seq_len // from_block_size - 2)] + for _ in range(n_heads) + ] + else: + if plan_from_length is None: + plan_from_length, plan_num_rand_blocks = self._get_rand_attn_plan( + from_seq_len, from_block_size, n_rand_blocks + ) + rand_attn = self._bigbird_block_rand_mask_with_head( + from_seq_length=from_seq_len, + to_seq_length=to_seq_len, + from_block_size=from_block_size, + to_block_size=to_block_size, + num_heads=n_heads, + plan_from_length=plan_from_length, + plan_num_rand_blocks=plan_num_rand_blocks, + indices_prng_key=indices_prng_key, + ) + + rand_attn = jnp.stack(rand_attn, axis=0) + rand_attn = jnp.broadcast_to(rand_attn, (bsz,) + rand_attn.shape) + + rand_mask = self._create_rand_mask_from_inputs( + from_blocked_mask, to_blocked_mask, rand_attn, n_heads, n_rand_blocks, bsz, from_seq_len, from_block_size + ) + + blocked_query_matrix = query_layer.reshape(bsz, n_heads, from_seq_len // from_block_size, from_block_size, -1) + blocked_key_matrix = key_layer.reshape(bsz, n_heads, to_seq_len // to_block_size, to_block_size, -1) + blocked_value_matrix = value_layer.reshape(bsz, n_heads, to_seq_len // to_block_size, to_block_size, -1) + + shape = (bsz, n_heads, to_seq_len // to_block_size - 2, n_rand_blocks * to_block_size, -1) + gathered_key = self.jax_gather(blocked_key_matrix, rand_attn, batch_dims=2).reshape(*shape) + gathered_value = self.jax_gather(blocked_value_matrix, rand_attn, batch_dims=2).reshape(*shape) + + # 1st PART + # 1st block (global block) attention scores + # q[0] x (k[0], k[1], k[2], k[3], k[4] .... 
) + + # [bsz, n_heads, from_block_size, -1] x [bsz, n_heads, to_seq_len, -1] ==> [bsz, n_heads, from_block_size, to_seq_len] + first_product = jnp.einsum("bhqd,bhkd->bhqk", blocked_query_matrix[:, :, 0], key_layer) + + first_product = first_product * rsqrt_d + first_product += (1.0 - to_mask) * attn_mask_penalty + first_attn_weights = jax.nn.softmax(first_product, axis=-1) # [bsz, n_heads, from_block_size, to_seq_len] + + # [bsz, n_heads, from_block_size, to_seq_len] x [bsz, n_heads, to_seq_len, -1] ==> [bsz, n_heads, from_block_size, -1] + first_context_layer = jnp.einsum("bhqk,bhkd->bhqd", first_attn_weights, value_layer) + first_context_layer = jnp.expand_dims(first_context_layer, 2) + + # 2nd PART + # 2nd block attention scores + # q[1] x (sliding_keys, random_keys, global_keys) + # sliding key blocks -> 2nd, 3rd blocks + # global key blocks -> 1st block + + second_key_mat = jnp.concatenate( + [ + blocked_key_matrix[:, :, 0], + blocked_key_matrix[:, :, 1], + blocked_key_matrix[:, :, 2], + blocked_key_matrix[:, :, -1], + gathered_key[:, :, 0], + ], + axis=2, + ) # [bsz, n_heads, (4+n_rand_blocks)*to_block_size, -1] + second_value_mat = jnp.concatenate( + [ + blocked_value_matrix[:, :, 0], + blocked_value_matrix[:, :, 1], + blocked_value_matrix[:, :, 2], + blocked_value_matrix[:, :, -1], + gathered_value[:, :, 0], + ], + axis=2, + ) # [bsz, n_heads, (4+n_rand_blocks)*to_block_size, -1] + + # [bsz, n_heads, from_block_size, -1] x [bsz, n_heads, (4+n_rand_blocks)*to_block_size, -1] + # ==> [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size] + second_product = jnp.einsum("bhqd,bhkd->bhqk", blocked_query_matrix[:, :, 1], second_key_mat) + second_seq_pad = jnp.concatenate( + [ + to_mask[:, :, :, : 3 * to_block_size], + to_mask[:, :, :, -to_block_size:], + jnp.ones([bsz, 1, 1, n_rand_blocks * to_block_size], dtype=to_mask.dtype), + ], + axis=3, + ) + second_rand_pad = jnp.concatenate( + [ + jnp.ones([bsz, n_heads, from_block_size, 4 * to_block_size], dtype=rand_mask.dtype), + rand_mask[:, :, 0], + ], + axis=3, + ) + second_product = second_product * rsqrt_d + second_product += (1.0 - jnp.minimum(second_seq_pad, second_rand_pad)) * attn_mask_penalty + second_attn_weights = jax.nn.softmax( + second_product, axis=-1 + ) # [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size] + + # [bsz, n_heads, from_block_size, (4+r)*to_block_size] x [bsz, n_heads, (4+r)*to_block_size, -1] + # ==> [bsz, n_heads, from_block_size, -1] + second_context_layer = jnp.einsum("bhqk,bhkd->bhqd", second_attn_weights, second_value_mat) + second_context_layer = jnp.expand_dims(second_context_layer, 2) + + # 3rd PART + # Middle blocks attention scores + # q[-2:2] x (sliding_keys, random_keys, global_keys) + # sliding attn is calculated using special trick of shifting tokens as discussed in paper + # random keys are generated by taking random indices as per `rand_attn` + # global keys -> 1st & last block + + exp_blocked_key_matrix = jnp.concatenate( + [blocked_key_matrix[:, :, 1:-3], blocked_key_matrix[:, :, 2:-2], blocked_key_matrix[:, :, 3:-1]], axis=3 + ) # [bsz, n_heads, from_seq_len//from_block_size-4, 3*to_block_size, -1] + exp_blocked_value_matrix = jnp.concatenate( + [blocked_value_matrix[:, :, 1:-3], blocked_value_matrix[:, :, 2:-2], blocked_value_matrix[:, :, 3:-1]], + axis=3, + ) # [bsz, n_heads, from_seq_len//from_block_size-4, 3*to_block_size, -1] + middle_query_matrix = blocked_query_matrix[:, :, 2:-2] + + # sliding attention scores for q[-2:2] + # [bsz, n_heads, 
from_seq_len//from_block_size-4, from_block_size, -1] x [b, n_heads, from_seq_len//from_block_size-4, 3*to_block_size, -1] + inner_band_product = jnp.einsum("bhlqd,bhlkd->bhlqk", middle_query_matrix, exp_blocked_key_matrix) + # ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, 3*to_block_size] + inner_band_product = inner_band_product * rsqrt_d + + # randn attention scores for q[-2:2] + # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] + # x [bsz, n_heads, from_seq_len//from_block_size-4, n_rand_blocks*to_block_size, -1] + rand_band_product = jnp.einsum("bhlqd,bhlkd->bhlqk", middle_query_matrix, gathered_key[:, :, 1:-1]) + # ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, n_rand_blocks*to_block_size] + rand_band_product = rand_band_product * rsqrt_d + + # Including 1st block (since it's global) + # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] x [bsz, n_heads, to_block_size, -1] + # ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, to_block_size] + first_band_product = jnp.einsum("bhlqd,bhkd->bhlqk", middle_query_matrix, blocked_key_matrix[:, :, 0]) + first_band_product = first_band_product * rsqrt_d + + # Including last block (since it's global) + # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] x [bsz, n_heads, to_block_size, -1] + # ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, to_block_size] + last_band_product = jnp.einsum("bhlqd,bhkd->bhlqk", middle_query_matrix, blocked_key_matrix[:, :, -1]) + last_band_product = last_band_product * rsqrt_d + + # masking padded tokens + inner_band_product += (1.0 - band_mask) * attn_mask_penalty + first_band_product += (1.0 - jnp.expand_dims(to_mask[:, :, :, :to_block_size], 3)) * attn_mask_penalty + last_band_product += (1.0 - jnp.expand_dims(to_mask[:, :, :, -to_block_size:], 3)) * attn_mask_penalty + rand_band_product += (1.0 - rand_mask[:, :, 1:-1]) * attn_mask_penalty + + # completing attention scores matrix for all q[-2:2] + band_product = jnp.concatenate( + [first_band_product, inner_band_product, rand_band_product, last_band_product], axis=-1 + ) # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, (5+n_rand_blocks)*to_block_size] + + # safely doing softmax since attention matrix is completed + attn_weights = jax.nn.softmax( + band_product, axis=-1 + ) # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, (5+n_rand_blocks)*to_block_size] + + # contribution of sliding keys + # [bsz, n_heads, m//from_block_size-4, from_block_size, 3*to_block_size] + # x [bsz, n_heads, from_seq_len//from_block_size-4, 3*to_block_size, -1] + context_layer = jnp.einsum( + "bhlqk,bhlkd->bhlqd", attn_weights[:, :, :, :, to_block_size : 4 * to_block_size], exp_blocked_value_matrix + ) + # ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] + + # adding contribution of random keys + # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, n_rand_blocks*to_block_size] + # x [bsz, n_heads, from_seq_len//from_block_size-4, n_rand_blocks*to_block_size, -1] + context_layer += jnp.einsum( + "bhlqk,bhlkd->bhlqd", + attn_weights[:, :, :, :, 4 * to_block_size : -to_block_size], + gathered_value[:, :, 1:-1], + ) + # ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] + + # adding contribution of global keys + # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, to_block_size] x [bsz, n_heads, to_block_size, -1] + # ==> [bsz, n_heads, 
from_seq_len//from_block_size-4, from_block_size, -1] + context_layer += jnp.einsum( + "bhlqk,bhkd->bhlqd", attn_weights[:, :, :, :, :to_block_size], blocked_value_matrix[:, :, 0] + ) + # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, to_block_size] x [bsz, n_heads, to_block_size, -1] + # ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] + context_layer += jnp.einsum( + "bhlqk,bhkd->bhlqd", attn_weights[:, :, :, :, -to_block_size:], blocked_value_matrix[:, :, -1] + ) + + # 4th PART + # last 2nd token attention scores + # q[-2] x (sliding_keys, random_keys, global_keys) + # sliding key blocks -> last 3 blocks + # global key block -> 1st block + # random key block -> based on indices stored in `randn_attn` + + second_last_key_mat = jnp.concatenate( + [ + blocked_key_matrix[:, :, 0], + blocked_key_matrix[:, :, -3], + blocked_key_matrix[:, :, -2], + blocked_key_matrix[:, :, -1], + gathered_key[:, :, -1], + ], + axis=2, + ) # [bsz, n_heads, (4+n_random_blocks)*to_block_size, -1] + second_last_value_mat = jnp.concatenate( + [ + blocked_value_matrix[:, :, 0], + blocked_value_matrix[:, :, -3], + blocked_value_matrix[:, :, -2], + blocked_value_matrix[:, :, -1], + gathered_value[:, :, -1], + ], + axis=2, + ) # [bsz, n_heads, (4+r)*to_block_size, -1] + + # [bsz, n_heads, from_block_size, -1] x [bsz, n_heads, (4+n_rand_blocks)*to_block_size, -1] + # ==> [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size] + second_last_product = jnp.einsum("bhqd,bhkd->bhqk", blocked_query_matrix[:, :, -2], second_last_key_mat) + second_last_seq_pad = jnp.concatenate( + [ + to_mask[:, :, :, :to_block_size], + to_mask[:, :, :, -3 * to_block_size :], + jnp.ones([bsz, 1, 1, n_rand_blocks * to_block_size], dtype=to_mask.dtype), + ], + axis=3, + ) + second_last_rand_pad = jnp.concatenate( + [ + jnp.ones([bsz, n_heads, from_block_size, 4 * to_block_size], dtype=rand_mask.dtype), + rand_mask[:, :, -1], + ], + axis=3, + ) + second_last_product = second_last_product * rsqrt_d + second_last_product += (1.0 - jnp.minimum(second_last_seq_pad, second_last_rand_pad)) * attn_mask_penalty + second_last_attn_weights = jax.nn.softmax( + second_last_product, axis=-1 + ) # [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size] + + # [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size] x [bsz, n_heads, (4+n_rand_blocks)*to_block_size, -1] + # ==> [bsz, n_heads, from_block_size, -1] + second_last_context_layer = jnp.einsum("bhqk,bhkd->bhqd", second_last_attn_weights, second_last_value_mat) + second_last_context_layer = jnp.expand_dims(second_last_context_layer, 2) + + # 5th PART + # last block (global) attention scores + # q[-1] x (k[0], k[1], k[2], k[3], .... 
) + + # [bsz, n_heads, from_block_size, -1] x [bsz, n_heads, to_seq_len, -1] ==> [bsz, n_heads, from_block_size, to_seq_len] + last_product = jnp.einsum("bhqd,bhkd->bhqk", blocked_query_matrix[:, :, -1], key_layer) + last_product = last_product * rsqrt_d + last_product += (1.0 - to_mask) * attn_mask_penalty + last_attn_weights = jax.nn.softmax(last_product, axis=-1) # [bsz, n_heads, from_block_size, n] + + # [bsz, n_heads, from_block_size, to_seq_len] x [bsz, n_heads, to_seq_len, -1] ==> [bsz, n_heads, from_block_size, -1] + last_context_layer = jnp.einsum("bhqk,bhkd->bhqd", last_attn_weights, value_layer) + last_context_layer = jnp.expand_dims(last_context_layer, 2) + + # combining representations of all tokens + context_layer = jnp.concatenate( + [first_context_layer, second_context_layer, context_layer, second_last_context_layer, last_context_layer], + axis=2, + ) + context_layer = context_layer.reshape(bsz, n_heads, from_seq_len, -1) * from_mask + context_layer = jnp.transpose(context_layer, axes=(0, 2, 1, 3)).reshape(bsz, from_seq_len, -1) + + attention_probs = None + + return context_layer, attention_probs + + @staticmethod + def jax_gather(params, indices, batch_dims=2): + """ + Gather the indices from params correctly (equivalent to tf.gather but with modifications) + + Args: + params: (bsz, n_heads, num_blocks, block_size, head_dim) + indices: (bhlqk", from_blocked_mask[:, 1:-1], rand_mask) + return rand_mask + + @staticmethod + def _get_rand_attn_plan(from_seq_length, from_block_size, num_rand_blocks): + """ + Gives the plan of where to put random attention. + + Args: + from_seq_length: int. length of from sequence. + from_block_size: int. size of block in from sequence. + num_rand_blocks: int. Number of random chunks per row. + + Returns: + plan_from_length: ending location of from block plan_num_rand_blocks: number of random ending location for + each block + """ + + plan_from_length = [] + plan_num_rand_blocks = [] + if (2 * num_rand_blocks + 5) < (from_seq_length // from_block_size): + plan_from_length.append(int((2 * num_rand_blocks + 5) * from_block_size)) + plan_num_rand_blocks.append(num_rand_blocks) + plan_from_length.append(from_seq_length) + plan_num_rand_blocks.append(0) + elif (num_rand_blocks + 5) < (from_seq_length // from_block_size): + plan_from_length.append(int((num_rand_blocks + 5) * from_block_size)) + plan_num_rand_blocks.append(num_rand_blocks // 2) + plan_from_length.append(from_seq_length) + plan_num_rand_blocks.append(num_rand_blocks - (num_rand_blocks // 2)) + else: + plan_from_length.append(from_seq_length) + plan_num_rand_blocks.append(num_rand_blocks) + + return plan_from_length, plan_num_rand_blocks + + @staticmethod + def _bigbird_block_rand_mask( + from_seq_length, + to_seq_length, + from_block_size, + to_block_size, + num_rand_blocks, + indices_prng_key: Optional[jax.random.PRNGKey] = None, + deterministic: Optional[bool] = True, + last_idx: Optional[int] = -1, + ): + """ + Create adjacency list of random attention. + + Args: + from_seq_length: int. length of from sequence. + to_seq_length: int. length of to sequence. + from_block_size: int. size of block in from sequence. + to_block_size: int. size of block in to sequence. + num_rand_blocks: int. Number of random chunks per row. + indices_prng_key: jax.random.PRNGKey. PRNG key that is used to perform random jax operations. + deterministic: bool. When False random attention will be used. 
+ last_idx: if -1 then num_rand_blocks blocks chosen anywhere in to sequence, + if positive then num_rand_blocks blocks chosen only up to last_idx. + + Returns: + adjacency list of size from_seq_length//from_block_size-2 by num_rand_blocks + """ + # using this method when from_seq_length in [1024, 3072, 4096] + + if from_seq_length // from_block_size != to_seq_length // to_block_size: + raise ValueError("Error the number of blocks needs to be same!") + rand_attn = jnp.zeros((from_seq_length // from_block_size - 2, num_rand_blocks), dtype=jnp.int32) + # deterministic nor randomness + if deterministic: + return rand_attn + + middle_seq = jnp.arange(1, to_seq_length // to_block_size - 1, dtype=jnp.int32) + last = to_seq_length // to_block_size - 1 + if last_idx > (2 * to_block_size): + last = (last_idx // to_block_size) - 1 + + r = num_rand_blocks # shorthand + for i in range(1, from_seq_length // from_block_size - 1): + start = i - 2 + end = i + if i == 1: + seq_values = jax.random.permutation(indices_prng_key, middle_seq[2:last])[:r] + rand_attn = rand_attn.at[i - 1].set(seq_values) + elif i == 2: + seq_values = jax.random.permutation(indices_prng_key, middle_seq[3:last])[:r] + rand_attn = rand_attn.at[i - 1].set(seq_values) + elif i == from_seq_length // from_block_size - 3: + seq_values = jax.random.permutation(indices_prng_key, middle_seq[:last])[:r] + rand_attn = rand_attn.at[i - 1].set(seq_values) + # Missing -3: should have been sliced till last-3 + elif i == from_seq_length // from_block_size - 2: + seq_values = jax.random.permutation(indices_prng_key, middle_seq[:last])[:r] + rand_attn = rand_attn.at[i - 1].set(seq_values) + # Missing -4: should have been sliced till last-4 + else: + if start > last: + start = last + seq_values = jax.random.permutation(indices_prng_key, middle_seq[:start])[:r] + rand_attn = rand_attn.at[i - 1].set(seq_values) + elif (end + 1) == last: + seq_values = jax.random.permutation(indices_prng_key, middle_seq[:start])[:r] + rand_attn = rand_attn.at[i - 1].set(seq_values) + else: + concat_values = jnp.concatenate((middle_seq[:start], middle_seq[end + 1 : last])) + seq_values = jax.random.permutation(indices_prng_key, concat_values)[:r] + rand_attn = rand_attn.at[i - 1].set(seq_values) + return rand_attn + + def _bigbird_block_rand_mask_with_head( + self, + from_seq_length, + to_seq_length, + from_block_size, + to_block_size, + num_heads, + plan_from_length, + plan_num_rand_blocks, + indices_prng_key: Optional[jax.random.PRNGKey] = None, + deterministic: Optional[bool] = True, + window_block_left=1, + window_block_right=1, + global_block_top=1, + global_block_bottom=1, + global_block_left=1, + global_block_right=1, + ): + """ + Create adjacency list of random attention. + + Args: + from_seq_length: int. length of from sequence. + to_seq_length: int. length of to sequence. + from_block_size: int. size of block in from sequence. + to_block_size: int. size of block in to sequence. + num_heads: int. total number of heads. + plan_from_length: list. plan from length where num_random_blocks are chosen from. + plan_num_rand_blocks: list. number of rand blocks within the plan. + indices_prng_key: jax.random.PRNGKey. PRNG key that is used to perform random jax operations. + deterministic: bool. When False random attention will be used. + window_block_left: int. number of blocks of window to left of a block. + window_block_right: int. number of blocks of window to right of a block. + global_block_top: int. number of blocks at the top. + global_block_bottom: int. 
number of blocks at the bottom. + global_block_left: int. Number of blocks globally used to the left. + global_block_right: int. Number of blocks globally used to the right. + + Returns: + adjacency list of size num_head where each element is of size from_seq_length//from_block_size-2 by + num_rand_blocks + """ + # using this method when from_seq_length not in [1024, 3072, 4096] + + if from_seq_length // from_block_size != to_seq_length // to_block_size: + raise ValueError("Error the number of blocks needs to be same!") + + if from_seq_length not in plan_from_length: + raise ValueError("Error from sequence length not in plan!") + + # Total number of blocks in the mmask + num_blocks = from_seq_length // from_block_size + # Number of blocks per plan + plan_block_length = jnp.array(plan_from_length) // from_block_size + # till when to follow plan + max_plan_idx = plan_from_length.index(from_seq_length) + + # Random Attention adjacency list + rand_attn = [ + jnp.zeros((num_blocks, sum(plan_num_rand_blocks[: max_plan_idx + 1])), dtype=jnp.int32) + for i in range(num_heads) + ] + + # deterministic + if deterministic: + for nh in range(num_heads): + rand_attn[nh] = rand_attn[nh][global_block_top : num_blocks - global_block_bottom, :] + return rand_attn + + # We will go iteratively over the plan blocks and pick random number of + # Attention blocks from the legally allowed blocks + for plan_idx in range(max_plan_idx + 1): + rnd_r_cnt = 0 + if plan_idx > 0: + # set the row for all from_blocks starting from 0 to + # plan_block_length[plan_idx-1] + # column indx start from plan_block_length[plan_idx-1] and ends at + # plan_block_length[plan_idx] + if plan_num_rand_blocks[plan_idx] > 0: + rnd_r_cnt = int(sum(plan_num_rand_blocks[:plan_idx])) + curr_r_cnt = int(sum(plan_num_rand_blocks[: plan_idx + 1])) + for blk_rw_idx in range(global_block_top, plan_block_length[plan_idx - 1]): + for h in range(num_heads): + single_block_row_attention = self._get_single_block_row_attention( + block_id=blk_rw_idx, + to_start_block_id=plan_block_length[plan_idx - 1], + to_end_block_id=plan_block_length[plan_idx], + num_rand_blocks=plan_num_rand_blocks[plan_idx], + window_block_left=window_block_left, + window_block_right=window_block_right, + global_block_left=global_block_left, + global_block_right=global_block_right, + indices_prng_key=indices_prng_key, + ) + rand_attn[h] = ( + rand_attn[h].at[blk_rw_idx, rnd_r_cnt:curr_r_cnt].set(single_block_row_attention) + ) + + for pl_id in range(plan_idx): + if plan_num_rand_blocks[pl_id] == 0: + continue + for blk_rw_idx in range(plan_block_length[plan_idx - 1], plan_block_length[plan_idx]): + rnd_r_cnt = 0 + to_start_block_id = 0 + if pl_id > 0: + rnd_r_cnt = int(sum(plan_num_rand_blocks[:pl_id])) + to_start_block_id = plan_block_length[pl_id - 1] + curr_r_cnt = int(sum(plan_num_rand_blocks[: pl_id + 1])) + for h in range(num_heads): + single_block_row_attention = self._get_single_block_row_attention( + block_id=blk_rw_idx, + to_start_block_id=to_start_block_id, + to_end_block_id=plan_block_length[pl_id], + num_rand_blocks=plan_num_rand_blocks[pl_id], + window_block_left=window_block_left, + window_block_right=window_block_right, + global_block_left=global_block_left, + global_block_right=global_block_right, + indices_prng_key=indices_prng_key, + ) + rand_attn[h] = ( + rand_attn[h].at[blk_rw_idx, rnd_r_cnt:curr_r_cnt].set(single_block_row_attention) + ) + + if plan_num_rand_blocks[plan_idx] == 0: + continue + curr_r_cnt = int(sum(plan_num_rand_blocks[: plan_idx + 1])) + 
from_start_block_id = global_block_top + to_start_block_id = 0 + if plan_idx > 0: + rnd_r_cnt = int(sum(plan_num_rand_blocks[:plan_idx])) + from_start_block_id = plan_block_length[plan_idx - 1] + to_start_block_id = plan_block_length[plan_idx - 1] + for blk_rw_idx in range(from_start_block_id, plan_block_length[plan_idx]): + for h in range(num_heads): + single_block_row_attention = self._get_single_block_row_attention( + block_id=blk_rw_idx, + to_start_block_id=to_start_block_id, + to_end_block_id=plan_block_length[plan_idx], + num_rand_blocks=plan_num_rand_blocks[plan_idx], + window_block_left=window_block_left, + window_block_right=window_block_right, + global_block_left=global_block_left, + global_block_right=global_block_right, + indices_prng_key=indices_prng_key, + ) + rand_attn[h] = rand_attn[h].at[blk_rw_idx, rnd_r_cnt:curr_r_cnt].set(single_block_row_attention) + + for nh in range(num_heads): + rand_attn[nh] = rand_attn[nh][global_block_top : num_blocks - global_block_bottom, :] + return rand_attn + + @staticmethod + def _get_single_block_row_attention( + block_id, + to_start_block_id, + to_end_block_id, + num_rand_blocks, + indices_prng_key: Optional[jax.random.PRNGKey] = None, + window_block_left=1, + window_block_right=1, + global_block_left=1, + global_block_right=1, + ): + """ + For a single row block get random row attention. + + Args: + block_id: int. block id of row. + to_start_block_id: int. random attention column start id. + to_end_block_id: int. random attention column end id. + num_rand_blocks: int. number of random blocks to be selected. + indices_prng_key: jax.random.PRNGKey. PRNG key that is used to perform random jax operations + window_block_left: int. number of blocks of window to left of a block. + window_block_right: int. number of blocks of window to right of a block. + global_block_left: int. Number of blocks globally used to the left. + global_block_right: int. Number of blocks globally used to the right. + + Returns: + row containing the random attention vector of size num_rand_blocks. 
+ """ + # list of to_blocks from which to choose random attention + to_block_list = jnp.arange(to_start_block_id, to_end_block_id, dtype=jnp.int32) + # permute the blocks + perm_block = jax.random.permutation(indices_prng_key, to_block_list) + + # illegal blocks for the current block id, using window + illegal_blocks = list(range(block_id - window_block_left, block_id + window_block_right + 1)) + + # Add blocks at the start and at the end + illegal_blocks.extend(list(range(global_block_left))) + illegal_blocks.extend(list(range(to_end_block_id - global_block_right, to_end_block_id))) + + # The second from_block cannot choose random attention on second last to_block + if block_id == 1: + illegal_blocks.append(to_end_block_id - 2) + + # The second last from_block cannot choose random attention on second to_block + if block_id == to_end_block_id - 2: + illegal_blocks.append(1) + + selected_random_blocks = [] + + for i in range(to_end_block_id - to_start_block_id): + if perm_block[i] not in illegal_blocks: + selected_random_blocks.append(perm_block[i]) + if len(selected_random_blocks) == num_rand_blocks: + break + return jnp.array(selected_random_blocks, dtype=jnp.int32) + + +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertSelfOutput with Bert->BigBird +class FlaxBigBirdSelfOutput(nn.Module): + config: BigBirdConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.dense = nn.Dense( + self.config.hidden_size, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + dtype=self.dtype, + ) + self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype) + self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob) + + def __call__(self, hidden_states, input_tensor, deterministic: bool = True): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states, deterministic=deterministic) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class FlaxBigBirdAttention(nn.Module): + config: BigBirdConfig + layer_id: Optional[int] = None + causal: bool = False + dtype: jnp.dtype = jnp.float32 + + def setup(self): + if self.config.attention_type == "original_full": + self.self = FlaxBigBirdSelfAttention(self.config, causal=self.causal, dtype=self.dtype) + elif self.config.attention_type == "block_sparse": + self.self = FlaxBigBirdBlockSparseAttention(self.config, block_sparse_seed=self.layer_id, dtype=self.dtype) + else: + raise ValueError( + f"Your `config.attention_type` is {self.config.attention_type} but it can either be `original_full` or" + " `block_sparse`" + ) + + self.output = FlaxBigBirdSelfOutput(self.config, dtype=self.dtype) + + def __call__( + self, + hidden_states, + attention_mask, + layer_head_mask, + key_value_states=None, + init_cache=False, + deterministic=True, + output_attentions: bool = False, + ): + # Attention mask comes in as attention_mask.shape == (*batch_sizes, kv_length) + # FLAX expects: attention_mask.shape == (*batch_sizes, 1, 1, kv_length) such that it is broadcastable + # with attn_weights.shape == (*batch_sizes, num_heads, q_length, kv_length) + if self.config.attention_type == "original_full": + attn_outputs = self.self( + hidden_states, + attention_mask, + layer_head_mask=layer_head_mask, + key_value_states=key_value_states, + init_cache=init_cache, + deterministic=deterministic, + output_attentions=output_attentions, + ) + else: + attn_outputs = self.self( + hidden_states, + attention_mask, + 
deterministic=deterministic, + output_attentions=output_attentions, + ) + attn_output = attn_outputs[0] + hidden_states = self.output(attn_output, hidden_states, deterministic=deterministic) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_outputs[1],) + + return outputs + + +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertIntermediate with Bert->BigBird +class FlaxBigBirdIntermediate(nn.Module): + config: BigBirdConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.dense = nn.Dense( + self.config.intermediate_size, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + dtype=self.dtype, + ) + self.activation = ACT2FN[self.config.hidden_act] + + def __call__(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.activation(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertOutput with Bert->BigBird +class FlaxBigBirdOutput(nn.Module): + config: BigBirdConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.dense = nn.Dense( + self.config.hidden_size, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + dtype=self.dtype, + ) + self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob) + self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype) + + def __call__(self, hidden_states, attention_output, deterministic: bool = True): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states, deterministic=deterministic) + hidden_states = self.LayerNorm(hidden_states + attention_output) + return hidden_states + + +class FlaxBigBirdLayer(nn.Module): + config: BigBirdConfig + layer_id: Optional[int] = None + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.attention = FlaxBigBirdAttention( + self.config, layer_id=self.layer_id, causal=self.config.is_decoder, dtype=self.dtype + ) + self.intermediate = FlaxBigBirdIntermediate(self.config, dtype=self.dtype) + self.output = FlaxBigBirdOutput(self.config, dtype=self.dtype) + if self.config.add_cross_attention: + self.crossattention = FlaxBigBirdAttention(self.config, causal=False, dtype=self.dtype) + + # Copied from transformers.models.bert.modeling_flax_bert.FlaxBertLayer.__call__ with Bert->BigBird + def __call__( + self, + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states: Optional[jnp.ndarray] = None, + encoder_attention_mask: Optional[jnp.ndarray] = None, + init_cache: bool = False, + deterministic: bool = True, + output_attentions: bool = False, + ): + # Self Attention + attention_outputs = self.attention( + hidden_states, + attention_mask, + layer_head_mask=layer_head_mask, + init_cache=init_cache, + deterministic=deterministic, + output_attentions=output_attentions, + ) + attention_output = attention_outputs[0] + + # Cross-Attention Block + if encoder_hidden_states is not None: + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask=encoder_attention_mask, + layer_head_mask=layer_head_mask, + key_value_states=encoder_hidden_states, + deterministic=deterministic, + output_attentions=output_attentions, + ) + attention_output = cross_attention_outputs[0] + + hidden_states = self.intermediate(attention_output) + hidden_states = self.output(hidden_states, attention_output, deterministic=deterministic) + + outputs = (hidden_states,) + 
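+ # NOTE (descriptive comment, added for clarity): `outputs` currently holds only
+ # (hidden_states,); it is optionally extended below with the self-attention weights and,
+ # when encoder_hidden_states is given, the cross-attention weights.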
+ if output_attentions: + outputs += (attention_outputs[1],) + if encoder_hidden_states is not None: + outputs += (cross_attention_outputs[1],) + return outputs + + +class FlaxBigBirdLayerCollection(nn.Module): + config: BigBirdConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + gradient_checkpointing: bool = False + + def setup(self): + if self.gradient_checkpointing: + FlaxBigBirdCheckpointLayer = remat(FlaxBigBirdLayer, static_argnums=(5, 6, 7)) + self.layers = [ + FlaxBigBirdCheckpointLayer(self.config, layer_id=i, name=str(i), dtype=self.dtype) + for i in range(self.config.num_hidden_layers) + ] + else: + self.layers = [ + FlaxBigBirdLayer(self.config, layer_id=i, name=str(i), dtype=self.dtype) + for i in range(self.config.num_hidden_layers) + ] + + # Copied from transformers.models.bert.modeling_flax_bert.FlaxBertLayerCollection.__call__ with Bert->BigBird + def __call__( + self, + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states: Optional[jnp.ndarray] = None, + encoder_attention_mask: Optional[jnp.ndarray] = None, + init_cache: bool = False, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + all_attentions = () if output_attentions else None + all_hidden_states = () if output_hidden_states else None + all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + + # Check if head_mask has a correct number of layers specified if desired + if head_mask is not None: + if head_mask.shape[0] != (len(self.layers)): + raise ValueError( + f"The head_mask should be specified for {len(self.layers)} layers, but it is for " + f" {head_mask.shape[0]}." + ) + + for i, layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + layer_outputs = layer( + hidden_states, + attention_mask, + head_mask[i] if head_mask is not None else None, + encoder_hidden_states, + encoder_attention_mask, + init_cache, + deterministic, + output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions += (layer_outputs[1],) + + if encoder_hidden_states is not None: + all_cross_attentions += (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states += (hidden_states,) + + outputs = (hidden_states, all_hidden_states, all_attentions, all_cross_attentions) + + if not return_dict: + return tuple(v for v in outputs if v is not None) + + return FlaxBaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_attentions, + cross_attentions=all_cross_attentions, + ) + + +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertEncoder with Bert->BigBird +class FlaxBigBirdEncoder(nn.Module): + config: BigBirdConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + gradient_checkpointing: bool = False + + def setup(self): + self.layer = FlaxBigBirdLayerCollection( + self.config, + dtype=self.dtype, + gradient_checkpointing=self.gradient_checkpointing, + ) + + def __call__( + self, + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states: Optional[jnp.ndarray] = None, + encoder_attention_mask: Optional[jnp.ndarray] = None, + init_cache: bool = False, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + return self.layer( + hidden_states, + attention_mask, + head_mask=head_mask, + 
encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + init_cache=init_cache, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertPredictionHeadTransform with Bert->BigBird +class FlaxBigBirdPredictionHeadTransform(nn.Module): + config: BigBirdConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.dense = nn.Dense(self.config.hidden_size, dtype=self.dtype) + self.activation = ACT2FN[self.config.hidden_act] + self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype) + + def __call__(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.activation(hidden_states) + return self.LayerNorm(hidden_states) + + +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertLMPredictionHead with Bert->BigBird, np.ndarray->jnp.ndarray +class FlaxBigBirdLMPredictionHead(nn.Module): + config: BigBirdConfig + dtype: jnp.dtype = jnp.float32 + bias_init: Callable[..., jnp.ndarray] = jax.nn.initializers.zeros + + def setup(self): + self.transform = FlaxBigBirdPredictionHeadTransform(self.config, dtype=self.dtype) + self.decoder = nn.Dense(self.config.vocab_size, dtype=self.dtype, use_bias=False) + self.bias = self.param("bias", self.bias_init, (self.config.vocab_size,)) + + def __call__(self, hidden_states, shared_embedding=None): + hidden_states = self.transform(hidden_states) + + if shared_embedding is not None: + hidden_states = self.decoder.apply({"params": {"kernel": shared_embedding.T}}, hidden_states) + else: + hidden_states = self.decoder(hidden_states) + + bias = jnp.asarray(self.bias, self.dtype) + hidden_states += bias + return hidden_states + + +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertOnlyMLMHead with Bert->BigBird +class FlaxBigBirdOnlyMLMHead(nn.Module): + config: BigBirdConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.predictions = FlaxBigBirdLMPredictionHead(self.config, dtype=self.dtype) + + def __call__(self, hidden_states, shared_embedding=None): + hidden_states = self.predictions(hidden_states, shared_embedding=shared_embedding) + return hidden_states + + +class FlaxBigBirdPreTrainingHeads(nn.Module): + config: BigBirdConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.predictions = FlaxBigBirdLMPredictionHead(self.config, dtype=self.dtype) + self.seq_relationship = nn.Dense(2, dtype=self.dtype) + + def __call__(self, hidden_states, pooled_output, shared_embedding=None): + prediction_scores = self.predictions(hidden_states, shared_embedding=shared_embedding) + seq_relationship_score = self.seq_relationship(pooled_output) + return prediction_scores, seq_relationship_score + + +class FlaxBigBirdPreTrainedModel(FlaxPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = BigBirdConfig + base_model_prefix = "bert" + module_class: nn.Module = None + + def __init__( + self, + config: BigBirdConfig, + input_shape: Optional[tuple] = None, + seed: int = 0, + dtype: jnp.dtype = jnp.float32, + _do_init: bool = True, + gradient_checkpointing: bool = False, + **kwargs, + ): + module = self.module_class(config=config, dtype=dtype, gradient_checkpointing=gradient_checkpointing, **kwargs) + if config.attention_type == "block_sparse" and input_shape is None: + input_shape = (1, 12 * config.block_size) + elif input_shape is None: + input_shape = (1, 1) + + super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) + + # Copied from transformers.models.bert.modeling_flax_bert.FlaxBertPreTrainedModel.enable_gradient_checkpointing + def enable_gradient_checkpointing(self): + self._module = self.module_class( + config=self.config, + dtype=self.dtype, + gradient_checkpointing=True, + ) + + def init_weights(self, rng: jax.random.PRNGKey, input_shape: tuple, params: FrozenDict = None) -> FrozenDict: + # init input tensors + input_ids = jnp.zeros(input_shape, dtype="i4") + token_type_ids = jnp.zeros_like(input_ids) + position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_shape) + attention_mask = jnp.ones_like(input_ids) + head_mask = jnp.ones((self.config.num_hidden_layers, self.config.num_attention_heads)) + + params_rng, dropout_rng, indices_rng = jax.random.split(rng, num=3) + rngs = {"params": params_rng, "dropout": dropout_rng, "indices": indices_rng} + + if self.config.add_cross_attention: + encoder_hidden_states = jnp.zeros(input_shape + (self.config.hidden_size,)) + encoder_attention_mask = attention_mask + module_init_outputs = self.module.init( + rngs, + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + return_dict=False, + ) + else: + module_init_outputs = self.module.init( + rngs, + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + return_dict=False, + ) + + random_params = module_init_outputs["params"] + + if params is not None: + random_params = flatten_dict(unfreeze(random_params)) + params = flatten_dict(unfreeze(params)) + for missing_key in self._missing_keys: + params[missing_key] = random_params[missing_key] + self._missing_keys = set() + return freeze(unflatten_dict(params)) + else: + return random_params + + # Copied from transformers.models.bart.modeling_flax_bart.FlaxBartDecoderPreTrainedModel.init_cache + def init_cache(self, batch_size, max_length): + r""" + Args: + batch_size (`int`): + batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache. + max_length (`int`): + maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized + cache. 
+ """ + # init input variables to retrieve cache + input_ids = jnp.ones((batch_size, max_length), dtype="i4") + attention_mask = jnp.ones_like(input_ids, dtype="i4") + position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape) + + init_variables = self.module.init( + jax.random.PRNGKey(0), input_ids, attention_mask, position_ids, return_dict=False, init_cache=True + ) + return unfreeze(init_variables["cache"]) + + @add_start_docstrings_to_model_forward(BIG_BIRD_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + def __call__( + self, + input_ids, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + params: Optional[dict] = None, + dropout_rng: Optional[jax.random.PRNGKey] = None, + indices_rng: Optional[jax.random.PRNGKey] = None, + train: bool = False, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + past_key_values: Optional[dict] = None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + # init input tensors if not passed + if token_type_ids is None: + token_type_ids = jnp.zeros_like(input_ids) + + if position_ids is None: + position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape) + + if attention_mask is None: + attention_mask = jnp.ones_like(input_ids) + + if head_mask is None: + head_mask = jnp.ones((self.config.num_hidden_layers, self.config.num_attention_heads)) + + # Handle any PRNG if needed + rngs = {} + if indices_rng is not None: + rngs["indices"] = indices_rng + + if dropout_rng is not None: + rngs["dropout"] = dropout_rng + + inputs = {"params": params or self.params} + + if self.config.add_cross_attention: + # if past_key_values are passed then cache is already initialized a private flag init_cache has to be passed + # down to ensure cache is used. 
It has to be made sure that cache is marked as mutable so that it can be + # changed by FlaxBigBirdAttention module + if past_key_values: + inputs["cache"] = past_key_values + mutable = ["cache"] + else: + mutable = False + + outputs = self.module.apply( + inputs, + jnp.array(input_ids, dtype="i4"), + jnp.array(attention_mask, dtype="i4"), + token_type_ids=jnp.array(token_type_ids, dtype="i4"), + position_ids=jnp.array(position_ids, dtype="i4"), + head_mask=jnp.array(head_mask, dtype="i4"), + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + deterministic=not train, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + rngs=rngs, + mutable=mutable, + ) + + # add updated cache to model output + if past_key_values is not None and return_dict: + outputs, past_key_values = outputs + outputs["past_key_values"] = unfreeze(past_key_values["cache"]) + return outputs + elif past_key_values is not None and not return_dict: + outputs, past_key_values = outputs + outputs = outputs[:1] + (unfreeze(past_key_values["cache"]),) + outputs[1:] + + else: + outputs = self.module.apply( + inputs, + jnp.array(input_ids, dtype="i4"), + jnp.array(attention_mask, dtype="i4"), + token_type_ids=jnp.array(token_type_ids, dtype="i4"), + position_ids=jnp.array(position_ids, dtype="i4"), + head_mask=jnp.array(head_mask, dtype="i4"), + deterministic=not train, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + rngs=rngs, + ) + + return outputs + + +class FlaxBigBirdModule(nn.Module): + config: BigBirdConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + add_pooling_layer: bool = True + gradient_checkpointing: bool = False + + def setup(self): + self.embeddings = FlaxBigBirdEmbeddings(self.config, dtype=self.dtype) + self.encoder = FlaxBigBirdEncoder( + self.config, dtype=self.dtype, gradient_checkpointing=self.gradient_checkpointing + ) + self.pooler = nn.Dense( + self.config.hidden_size, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + dtype=self.dtype, + ) + + def __call__( + self, + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + encoder_hidden_states: Optional[jnp.ndarray] = None, + encoder_attention_mask: Optional[jnp.ndarray] = None, + init_cache: bool = False, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + hidden_states = self.embeddings( + input_ids, token_type_ids, position_ids, attention_mask, deterministic=deterministic + ) + outputs = self.encoder( + hidden_states, + attention_mask, + head_mask=head_mask, + deterministic=deterministic, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + init_cache=init_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = outputs[0] + + pooled = nn.tanh(self.pooler(hidden_states[:, 0, :])) if self.add_pooling_layer else None + + if not return_dict: + # if pooled is None, don't return it + if pooled is None: + return (hidden_states,) + outputs[1:] + return (hidden_states, pooled) + outputs[1:] + + return FlaxBaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=hidden_states, + pooler_output=pooled, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + 
cross_attentions=outputs.cross_attentions, + ) + + +@add_start_docstrings( + "The bare BigBird Model transformer outputting raw hidden-states without any specific head on top.", + BIG_BIRD_START_DOCSTRING, +) +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertModel with Bert->BigBird +class FlaxBigBirdModel(FlaxBigBirdPreTrainedModel): + module_class = FlaxBigBirdModule + + +append_call_sample_docstring(FlaxBigBirdModel, _CHECKPOINT_FOR_DOC, FlaxBaseModelOutputWithPooling, _CONFIG_FOR_DOC) + + +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertForPreTrainingModule with Bert->BigBird +class FlaxBigBirdForPreTrainingModule(nn.Module): + config: BigBirdConfig + dtype: jnp.dtype = jnp.float32 + gradient_checkpointing: bool = False + + def setup(self): + self.bert = FlaxBigBirdModule( + config=self.config, + dtype=self.dtype, + gradient_checkpointing=self.gradient_checkpointing, + ) + self.cls = FlaxBigBirdPreTrainingHeads(config=self.config, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + # Model + outputs = self.bert( + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if self.config.tie_word_embeddings: + shared_embedding = self.bert.variables["params"]["embeddings"]["word_embeddings"]["embedding"] + else: + shared_embedding = None + + hidden_states = outputs[0] + pooled_output = outputs[1] + + prediction_scores, seq_relationship_score = self.cls( + hidden_states, pooled_output, shared_embedding=shared_embedding + ) + + if not return_dict: + return (prediction_scores, seq_relationship_score) + outputs[2:] + + return FlaxBigBirdForPreTrainingOutput( + prediction_logits=prediction_scores, + seq_relationship_logits=seq_relationship_score, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + BigBird Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next + sentence prediction (classification)` head. 
+ """, + BIG_BIRD_START_DOCSTRING, +) +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertForPreTraining with Bert->BigBird +class FlaxBigBirdForPreTraining(FlaxBigBirdPreTrainedModel): + module_class = FlaxBigBirdForPreTrainingModule + + +FLAX_BIG_BIRD_FOR_PRETRAINING_DOCSTRING = """ + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, FlaxBigBirdForPreTraining + + >>> tokenizer = AutoTokenizer.from_pretrained("google/bigbird-roberta-base") + >>> model = FlaxBigBirdForPreTraining.from_pretrained("google/bigbird-roberta-base") + + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="np") + >>> outputs = model(**inputs) + + >>> prediction_logits = outputs.prediction_logits + >>> seq_relationship_logits = outputs.seq_relationship_logits + ``` +""" + +overwrite_call_docstring( + FlaxBigBirdForPreTraining, + BIG_BIRD_INPUTS_DOCSTRING.format("batch_size, sequence_length") + FLAX_BIG_BIRD_FOR_PRETRAINING_DOCSTRING, +) +append_replace_return_docstrings( + FlaxBigBirdForPreTraining, output_type=FlaxBigBirdForPreTrainingOutput, config_class=_CONFIG_FOR_DOC +) + + +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertForMaskedLMModule with Bert->BigBird +class FlaxBigBirdForMaskedLMModule(nn.Module): + config: BigBirdConfig + dtype: jnp.dtype = jnp.float32 + gradient_checkpointing: bool = False + + def setup(self): + self.bert = FlaxBigBirdModule( + config=self.config, + add_pooling_layer=False, + dtype=self.dtype, + gradient_checkpointing=self.gradient_checkpointing, + ) + self.cls = FlaxBigBirdOnlyMLMHead(config=self.config, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + # Model + outputs = self.bert( + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + if self.config.tie_word_embeddings: + shared_embedding = self.bert.variables["params"]["embeddings"]["word_embeddings"]["embedding"] + else: + shared_embedding = None + + # Compute the prediction scores + logits = self.cls(hidden_states, shared_embedding=shared_embedding) + + if not return_dict: + return (logits,) + outputs[1:] + + return FlaxMaskedLMOutput( + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings("""BigBird Model with a `language modeling` head on top.""", BIG_BIRD_START_DOCSTRING) +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertForMaskedLM with Bert->BigBird +class FlaxBigBirdForMaskedLM(FlaxBigBirdPreTrainedModel): + module_class = FlaxBigBirdForMaskedLMModule + + +append_call_sample_docstring(FlaxBigBirdForMaskedLM, _CHECKPOINT_FOR_DOC, FlaxMaskedLMOutput, _CONFIG_FOR_DOC) + + +class FlaxBigBirdClassificationHead(nn.Module): + """Head for sentence-level classification tasks.""" + + config: BigBirdConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.dense = nn.Dense(self.config.hidden_size, dtype=self.dtype) + classifier_dropout = ( + self.config.classifier_dropout + if self.config.classifier_dropout is not None + else self.config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.out_proj = nn.Dense(self.config.num_labels, 
dtype=self.dtype) + + def __call__(self, features, deterministic=True): + x = features[:, 0, :] # take token (equiv. to [CLS]) + x = self.dropout(x, deterministic=deterministic) + x = self.dense(x) + x = ACT2FN[self.config.hidden_act](x) + x = self.dropout(x, deterministic=deterministic) + x = self.out_proj(x) + return x + + +class FlaxBigBirdForSequenceClassificationModule(nn.Module): + config: BigBirdConfig + dtype: jnp.dtype = jnp.float32 + gradient_checkpointing: bool = False + + def setup(self): + self.bert = FlaxBigBirdModule( + config=self.config, dtype=self.dtype, gradient_checkpointing=self.gradient_checkpointing + ) + self.classifier = FlaxBigBirdClassificationHead(self.config, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + # Model + outputs = self.bert( + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + logits = self.classifier(sequence_output, deterministic=deterministic) + + if not return_dict: + return (logits,) + outputs[2:] + + return FlaxSequenceClassifierOutput( + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + BigBird Model transformer with a sequence classification/regression head on top (a linear layer on top of the + pooled output) e.g. for GLUE tasks. + """, + BIG_BIRD_START_DOCSTRING, +) +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertForSequenceClassification with Bert->BigBird +class FlaxBigBirdForSequenceClassification(FlaxBigBirdPreTrainedModel): + module_class = FlaxBigBirdForSequenceClassificationModule + + +append_call_sample_docstring( + FlaxBigBirdForSequenceClassification, + _CHECKPOINT_FOR_DOC, + FlaxSequenceClassifierOutput, + _CONFIG_FOR_DOC, +) + + +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertForMultipleChoiceModule with Bert->BigBird +class FlaxBigBirdForMultipleChoiceModule(nn.Module): + config: BigBirdConfig + dtype: jnp.dtype = jnp.float32 + gradient_checkpointing: bool = False + + def setup(self): + self.bert = FlaxBigBirdModule( + config=self.config, + dtype=self.dtype, + gradient_checkpointing=self.gradient_checkpointing, + ) + self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob) + self.classifier = nn.Dense(1, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + num_choices = input_ids.shape[1] + input_ids = input_ids.reshape(-1, input_ids.shape[-1]) if input_ids is not None else None + attention_mask = attention_mask.reshape(-1, attention_mask.shape[-1]) if attention_mask is not None else None + token_type_ids = token_type_ids.reshape(-1, token_type_ids.shape[-1]) if token_type_ids is not None else None + position_ids = position_ids.reshape(-1, position_ids.shape[-1]) if position_ids is not None else None + + # Model + outputs = self.bert( + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic=deterministic, + output_attentions=output_attentions, + 
output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + pooled_output = self.dropout(pooled_output, deterministic=deterministic) + logits = self.classifier(pooled_output) + + reshaped_logits = logits.reshape(-1, num_choices) + + if not return_dict: + return (reshaped_logits,) + outputs[2:] + + return FlaxMultipleChoiceModelOutput( + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + BigBird Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a + softmax) e.g. for RocStories/SWAG tasks. + """, + BIG_BIRD_START_DOCSTRING, +) +class FlaxBigBirdForMultipleChoice(FlaxBigBirdPreTrainedModel): + module_class = FlaxBigBirdForMultipleChoiceModule + + def __init__( + self, + config: BigBirdConfig, + input_shape: Optional[tuple] = None, + seed: int = 0, + dtype: jnp.dtype = jnp.float32, + _do_init: bool = True, + **kwargs, + ): + if config.attention_type == "block_sparse" and input_shape is None: + input_shape = (1, 1, 12 * config.block_size) + elif input_shape is None: + input_shape = (1, 1) + super().__init__(config, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) + + +overwrite_call_docstring( + FlaxBigBirdForMultipleChoice, BIG_BIRD_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") +) +append_call_sample_docstring( + FlaxBigBirdForMultipleChoice, + _CHECKPOINT_FOR_DOC, + FlaxMultipleChoiceModelOutput, + _CONFIG_FOR_DOC, +) + + +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertForTokenClassificationModule with Bert->BigBird +class FlaxBigBirdForTokenClassificationModule(nn.Module): + config: BigBirdConfig + dtype: jnp.dtype = jnp.float32 + gradient_checkpointing: bool = False + + def setup(self): + self.bert = FlaxBigBirdModule( + config=self.config, + dtype=self.dtype, + add_pooling_layer=False, + gradient_checkpointing=self.gradient_checkpointing, + ) + classifier_dropout = ( + self.config.classifier_dropout + if self.config.classifier_dropout is not None + else self.config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(rate=classifier_dropout) + self.classifier = nn.Dense(self.config.num_labels, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + # Model + outputs = self.bert( + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + hidden_states = self.dropout(hidden_states, deterministic=deterministic) + logits = self.classifier(hidden_states) + + if not return_dict: + return (logits,) + outputs[1:] + + return FlaxTokenClassifierOutput( + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + BigBird Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for + Named-Entity-Recognition (NER) tasks. 
+ """, + BIG_BIRD_START_DOCSTRING, +) +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertForTokenClassification with Bert->BigBird +class FlaxBigBirdForTokenClassification(FlaxBigBirdPreTrainedModel): + module_class = FlaxBigBirdForTokenClassificationModule + + +append_call_sample_docstring( + FlaxBigBirdForTokenClassification, + _CHECKPOINT_FOR_DOC, + FlaxTokenClassifierOutput, + _CONFIG_FOR_DOC, +) + + +class FlaxBigBirdForQuestionAnsweringHead(nn.Module): + config: BigBirdConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob) + self.intermediate = FlaxBigBirdIntermediate(self.config, dtype=self.dtype) + self.output = FlaxBigBirdOutput(self.config, dtype=self.dtype) + self.qa_outputs = nn.Dense(self.config.num_labels, dtype=self.dtype) + + def __call__(self, encoder_output, deterministic=True): + hidden_states = self.dropout(encoder_output, deterministic=deterministic) + hidden_states = self.intermediate(hidden_states) + hidden_states = self.output(hidden_states, encoder_output) + hidden_states = self.qa_outputs(hidden_states) + return hidden_states + + +class FlaxBigBirdForQuestionAnsweringModule(nn.Module): + config: BigBirdConfig + dtype: jnp.dtype = jnp.float32 + add_pooling_layer: bool = False + gradient_checkpointing: bool = False + + def setup(self): + self.config.num_labels = 2 + self.bert = FlaxBigBirdModule( + self.config, + dtype=self.dtype, + add_pooling_layer=self.add_pooling_layer, + gradient_checkpointing=self.gradient_checkpointing, + ) + self.qa_classifier = FlaxBigBirdForQuestionAnsweringHead(self.config, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + logits_mask=None, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + # Model + outputs = self.bert( + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + pooled_output = outputs[1] if self.add_pooling_layer else None + logits = self.qa_classifier(hidden_states, deterministic=deterministic) + + if logits_mask is not None: + # removing question tokens from the competition + logits = logits - logits_mask * 1e6 + + start_logits, end_logits = jnp.split(logits, self.config.num_labels, axis=-1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) + + if not return_dict: + return (start_logits, end_logits) + outputs[1:] + + return FlaxBigBirdForQuestionAnsweringModelOutput( + start_logits=start_logits, + end_logits=end_logits, + pooled_output=pooled_output, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + BigBird Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layers on top of the hidden-states output to compute `span start logits` and `span end logits`). 
+ """, + BIG_BIRD_START_DOCSTRING, +) +class FlaxBigBirdForQuestionAnswering(FlaxBigBirdPreTrainedModel): + module_class = FlaxBigBirdForQuestionAnsweringModule + + @add_start_docstrings_to_model_forward(BIG_BIRD_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + def __call__( + self, + input_ids, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + question_lengths=None, + params: Optional[dict] = None, + dropout_rng: Optional[jax.random.PRNGKey] = None, + indices_rng: Optional[jax.random.PRNGKey] = None, + train: bool = False, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + if position_ids is None: + position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape) + + if attention_mask is None: + attention_mask = jnp.ones_like(input_ids) + + if head_mask is None: + head_mask = jnp.ones((self.config.num_hidden_layers, self.config.num_attention_heads)) + + if question_lengths is None and input_ids is not None: + # assuming input_ids format: context + question_lengths = jnp.argmax((input_ids == self.config.sep_token_id).astype("i4"), axis=-1) + 1 + question_lengths = jnp.expand_dims(question_lengths, axis=1) + + seqlen = input_ids.shape[1] + + logits_mask = None + if question_lengths is not None: + # setting lengths logits to `-inf` + logits_mask = self.prepare_question_mask(question_lengths, seqlen) + if token_type_ids is None: + token_type_ids = (~logits_mask).astype("i4") + logits_mask = jnp.expand_dims(logits_mask, axis=2) + logits_mask = logits_mask.at[:, 0].set(False) + + # init input tensors if not passed + if token_type_ids is None: + token_type_ids = jnp.zeros_like(input_ids) + + # Handle any PRNG if needed + rngs = {} + if dropout_rng is not None: + rngs["dropout"] = dropout_rng + + if indices_rng is not None: + rngs["indices"] = indices_rng + + return self.module.apply( + {"params": params or self.params}, + jnp.array(input_ids, dtype="i4"), + jnp.array(attention_mask, dtype="i4"), + token_type_ids, + jnp.array(position_ids, dtype="i4"), + jnp.array(head_mask, dtype="i4"), + logits_mask, + not train, + output_attentions, + output_hidden_states, + return_dict, + rngs=rngs, + ) + + @staticmethod + def prepare_question_mask(q_lengths, maxlen: int): + # q_lengths -> (bz, 1) + mask = jnp.arange(0, maxlen) + mask = jnp.expand_dims(mask, axis=0) < q_lengths + return mask + + +append_call_sample_docstring( + FlaxBigBirdForQuestionAnswering, + _CHECKPOINT_FOR_DOC, + FlaxBigBirdForQuestionAnsweringModelOutput, + _CONFIG_FOR_DOC, +) + + +class FlaxBigBirdForCausalLMModule(nn.Module): + config: BigBirdConfig + dtype: jnp.dtype = jnp.float32 + gradient_checkpointing: bool = False + + def setup(self): + self.bert = FlaxBigBirdModule( + config=self.config, + add_pooling_layer=False, + dtype=self.dtype, + gradient_checkpointing=self.gradient_checkpointing, + ) + self.cls = FlaxBigBirdOnlyMLMHead(config=self.config, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask, + position_ids, + token_type_ids: Optional[jnp.ndarray] = None, + head_mask: Optional[jnp.ndarray] = None, + encoder_hidden_states: 
Optional[jnp.ndarray] = None, + encoder_attention_mask: Optional[jnp.ndarray] = None, + init_cache: bool = False, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + # Model + outputs = self.bert( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + init_cache=init_cache, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + if self.config.tie_word_embeddings: + shared_embedding = self.bert.variables["params"]["embeddings"]["word_embeddings"]["embedding"] + else: + shared_embedding = None + + # Compute the prediction scores + logits = self.cls(hidden_states, shared_embedding=shared_embedding) + + if not return_dict: + return (logits,) + outputs[1:] + + return FlaxCausalLMOutputWithCrossAttentions( + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + +@add_start_docstrings( + """ + BigBird Model with a language modeling head on top (a linear layer on top of the hidden-states output) e.g for + autoregressive tasks. + """, + BIG_BIRD_START_DOCSTRING, +) +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertForCausalLM with Bert->BigBird +class FlaxBigBirdForCausalLM(FlaxBigBirdPreTrainedModel): + module_class = FlaxBigBirdForCausalLMModule + + def prepare_inputs_for_generation(self, input_ids, max_length, attention_mask: Optional[jax.Array] = None): + # initializing the cache + batch_size, seq_length = input_ids.shape + + past_key_values = self.init_cache(batch_size, max_length) + # Note that usually one would have to put 0's in the attention_mask for x > input_ids.shape[-1] and x < cache_length. + # But since the decoder uses a causal mask, those positions are masked anyway. 
+ # Thus, we can create a single static attention_mask here, which is more efficient for compilation + extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4") + if attention_mask is not None: + position_ids = attention_mask.cumsum(axis=-1) - 1 + extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, attention_mask, (0, 0)) + else: + position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length)) + + return { + "past_key_values": past_key_values, + "attention_mask": extended_attention_mask, + "position_ids": position_ids, + } + + def update_inputs_for_generation(self, model_outputs, model_kwargs): + model_kwargs["past_key_values"] = model_outputs.past_key_values + model_kwargs["position_ids"] = model_kwargs["position_ids"][:, -1:] + 1 + return model_kwargs + + +append_call_sample_docstring( + FlaxBigBirdForCausalLM, + _CHECKPOINT_FOR_DOC, + FlaxCausalLMOutputWithCrossAttentions, + _CONFIG_FOR_DOC, +) + + +__all__ = [ + "FlaxBigBirdForCausalLM", + "FlaxBigBirdForMaskedLM", + "FlaxBigBirdForMultipleChoice", + "FlaxBigBirdForPreTraining", + "FlaxBigBirdForQuestionAnswering", + "FlaxBigBirdForSequenceClassification", + "FlaxBigBirdForTokenClassification", + "FlaxBigBirdModel", + "FlaxBigBirdPreTrainedModel", +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/big_bird/tokenization_big_bird.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/big_bird/tokenization_big_bird.py new file mode 100644 index 0000000000000000000000000000000000000000..56680972a63e6360eaaefc023e57bd6beff71b9d --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/big_bird/tokenization_big_bird.py @@ -0,0 +1,303 @@ +# coding=utf-8 +# Copyright 2021 Google Research and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for BigBird.""" + +import os +import re +from shutil import copyfile +from typing import Any, Optional + +import sentencepiece as spm + +from ...tokenization_utils import AddedToken, PreTrainedTokenizer +from ...utils import logging +from ...utils.import_utils import requires + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"} + + +@requires(backends=("sentencepiece",)) +class BigBirdTokenizer(PreTrainedTokenizer): + """ + Construct a BigBird tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece). + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that + contains the vocabulary necessary to instantiate a tokenizer. + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. 
A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + bos_token (`str`, *optional*, defaults to `""`): + The begin of sequence token. + eos_token (`str`, *optional*, defaults to `""`): + The end of sequence token. + pad_token (`str`, *optional*, defaults to `""`): + The token used for padding, for example when batching sequences of different lengths. + sep_token (`str`, *optional*, defaults to `"[SEP]"`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + mask_token (`str`, *optional*, defaults to `"[MASK]"`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + cls_token (`str`, *optional*, defaults to `"[CLS]"`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + sp_model_kwargs (`dict`, *optional*): + Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for + SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, + to set: + + - `enable_sampling`: Enable subword regularization. + - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - `nbest_size = {0,1}`: No sampling is performed. + - `nbest_size > 1`: samples from the nbest_size results. + - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. + """ + + vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "attention_mask"] + prefix_tokens: list[int] = [] + + def __init__( + self, + vocab_file, + unk_token="", + bos_token="", + eos_token="", + pad_token="", + sep_token="[SEP]", + mask_token="[MASK]", + cls_token="[CLS]", + sp_model_kwargs: Optional[dict[str, Any]] = None, + **kwargs, + ) -> None: + bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token + eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token + unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token + pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token + cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token + sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token + + # Mask token behave like a normal word, i.e. 
include the space before it + mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + + self.vocab_file = vocab_file + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(vocab_file) + + super().__init__( + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + pad_token=pad_token, + sep_token=sep_token, + mask_token=mask_token, + cls_token=cls_token, + sp_model_kwargs=self.sp_model_kwargs, + **kwargs, + ) + + @property + def vocab_size(self): + return self.sp_model.get_piece_size() + + def get_vocab(self): + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def __getstate__(self): + state = self.__dict__.copy() + state["sp_model"] = None + return state + + def __setstate__(self, d): + self.__dict__ = d + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(self.vocab_file) + + def _tokenize(self, text: str) -> list[str]: + """Take as input a string and return a list of strings (tokens) for words/sub-words""" + return self.sp_model.encode(text, out_type=str) + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.sp_model.piece_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + token = self.sp_model.IdToPiece(index) + return token + + # Copied from transformers.models.albert.tokenization_albert.AlbertTokenizer.convert_tokens_to_string + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + current_sub_tokens = [] + out_string = "" + prev_is_special = False + for token in tokens: + # make sure that special tokens are not decoded using sentencepiece model + if token in self.all_special_tokens: + if not prev_is_special: + out_string += " " + out_string += self.sp_model.decode(current_sub_tokens) + token + prev_is_special = True + current_sub_tokens = [] + else: + current_sub_tokens.append(token) + prev_is_special = False + out_string += self.sp_model.decode(current_sub_tokens) + return out_string.strip() + + def _decode( + self, + token_ids: list[int], + skip_special_tokens: bool = False, + clean_up_tokenization_spaces: Optional[bool] = None, + spaces_between_special_tokens: bool = True, + **kwargs, + ) -> str: + self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False) + + filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens) + + # To avoid mixing byte-level and unicode for byte-level BPT + # we need to build string separately for added tokens and byte-level tokens + # cf. 
https://github.com/huggingface/transformers/issues/1133 + sub_texts = [] + current_sub_text = [] + for token in filtered_tokens: + if skip_special_tokens and token in self.all_special_ids: + continue + if token in self.added_tokens_encoder: + if current_sub_text: + sub_texts.append(self.convert_tokens_to_string(current_sub_text)) + current_sub_text = [] + sub_texts.append(token) + else: + current_sub_text.append(token) + if current_sub_text: + sub_texts.append(self.convert_tokens_to_string(current_sub_text)) + + # Mimic the behavior of the Rust tokenizer: + # No space before [MASK] and [SEP] + if spaces_between_special_tokens: + text = re.sub(r" (\[(MASK|SEP)\])", r"\1", " ".join(sub_texts)) + else: + text = "".join(sub_texts) + + clean_up_tokenization_spaces = ( + clean_up_tokenization_spaces + if clean_up_tokenization_spaces is not None + else self.clean_up_tokenization_spaces + ) + if clean_up_tokenization_spaces: + clean_text = self.clean_up_tokenization(text) + return clean_text + else: + return text + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file): + copyfile(self.vocab_file, out_vocab_file) + elif not os.path.isfile(self.vocab_file): + with open(out_vocab_file, "wb") as fi: + content_spiece_model = self.sp_model.serialized_model_proto() + fi.write(content_spiece_model) + + return (out_vocab_file,) + + def build_inputs_with_special_tokens( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A Big Bird sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + token_ids_1 + sep + + def get_special_tokens_mask( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False + ) -> list[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. 
+ """ + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + + +__all__ = ["BigBirdTokenizer"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/big_bird/tokenization_big_bird_fast.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/big_bird/tokenization_big_bird_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..6148585a40b10385098c69714f650e1f42c22b4f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/big_bird/tokenization_big_bird_fast.py @@ -0,0 +1,198 @@ +# coding=utf-8 +# Copyright 2018 Google AI, Google Brain and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for Big Bird model.""" + +import os +from shutil import copyfile +from typing import Optional + +from ...tokenization_utils import AddedToken +from ...tokenization_utils_fast import PreTrainedTokenizerFast +from ...utils import is_sentencepiece_available, logging + + +if is_sentencepiece_available(): + from .tokenization_big_bird import BigBirdTokenizer +else: + BigBirdTokenizer = None + +logger = logging.get_logger(__name__) +VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"} + + +SPIECE_UNDERLINE = "▁" + + +class BigBirdTokenizerFast(PreTrainedTokenizerFast): + """ + Construct a "fast" BigBird tokenizer (backed by HuggingFace's *tokenizers* library). Based on + [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models). This + tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods + + Args: + vocab_file (`str`): + [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that + contains the vocabulary necessary to instantiate a tokenizer. + bos_token (`str`, *optional*, defaults to `""`): + The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. + + + + When building a sequence using special tokens, this is not the token that is used for the beginning of + sequence. The token used is the `cls_token`. + + + + eos_token (`str`, *optional*, defaults to `""`): + The end of sequence token. .. note:: When building a sequence using special tokens, this is not the token + that is used for the end of sequence. The token used is the `sep_token`. + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. 
+ sep_token (`str`, *optional*, defaults to `"[SEP]"`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + pad_token (`str`, *optional*, defaults to `""`): + The token used for padding, for example when batching sequences of different lengths. + cls_token (`str`, *optional*, defaults to `"[CLS]"`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + mask_token (`str`, *optional*, defaults to `"[MASK]"`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + """ + + vocab_files_names = VOCAB_FILES_NAMES + slow_tokenizer_class = BigBirdTokenizer + model_input_names = ["input_ids", "attention_mask"] + prefix_tokens: list[int] = [] + + def __init__( + self, + vocab_file=None, + tokenizer_file=None, + unk_token="", + bos_token="", + eos_token="", + pad_token="", + sep_token="[SEP]", + mask_token="[MASK]", + cls_token="[CLS]", + **kwargs, + ): + bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token + eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token + unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token + pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token + cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token + sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token + + # Mask token behave like a normal word, i.e. include the space before it + mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + + super().__init__( + vocab_file, + tokenizer_file=tokenizer_file, + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + **kwargs, + ) + + self.vocab_file = vocab_file + + def build_inputs_with_special_tokens( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. An BigBird sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens. 
+ """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + if token_ids_1 is None: + return cls + token_ids_0 + sep + return cls + token_ids_0 + sep + token_ids_1 + sep + + def get_special_tokens_mask( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False + ) -> list[int]: + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`List[int]`): + List of ids. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Set to True if the token list is already formatted with special tokens for the model + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formatted with special tokens for the model." + ) + return [1 if x in [self.sep_token_id, self.cls_token_id] else 0 for x in token_ids_0] + + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + if not self.can_save_slow_tokenizer: + raise ValueError( + "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow " + "tokenizer." + ) + + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): + copyfile(self.vocab_file, out_vocab_file) + + return (out_vocab_file,) + + +__all__ = ["BigBirdTokenizerFast"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py new file mode 100644 index 0000000000000000000000000000000000000000..e419af75da3878553dc755776550d305e4d717ba --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py @@ -0,0 +1,3033 @@ +# coding=utf-8 +# Copyright 2021 Google Research The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""PyTorch BigBirdPegasus model.""" + +import math +from typing import Callable, Optional, Union + +import numpy as np +import torch +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache +from ...generation import GenerationMixin +from ...modeling_attn_mask_utils import ( + AttentionMaskConverter, + _prepare_4d_attention_mask, + _prepare_4d_attention_mask_for_sdpa, +) +from ...modeling_flash_attention_utils import FlashAttentionKwargs +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + Seq2SeqLMOutput, + Seq2SeqModelOutput, + Seq2SeqQuestionAnsweringModelOutput, + Seq2SeqSequenceClassifierOutput, +) +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...processing_utils import Unpack +from ...utils import auto_docstring, is_torch_flex_attn_available, is_torchdynamo_compiling, logging +from ...utils.deprecation import deprecate_kwarg +from .configuration_bigbird_pegasus import BigBirdPegasusConfig + + +if is_torch_flex_attn_available(): + from ...integrations.flex_attention import BlockMask, make_flex_block_causal_mask + + +logger = logging.get_logger(__name__) + +_EXPECTED_OUTPUT_SHAPE = [1, 7, 1024] + + +def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int): + """ + Shift input ids one token to the right. + """ + shifted_input_ids = input_ids.new_zeros(input_ids.shape) + shifted_input_ids[:, 1:] = input_ids[:, :-1].clone() + shifted_input_ids[:, 0] = decoder_start_token_id + + if pad_token_id is None: + raise ValueError("self.model.config.pad_token_id has to be defined.") + # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) + + return shifted_input_ids + + +class BigBirdPegasusLearnedPositionalEmbedding(nn.Embedding): + """ + This module learns positional embeddings up to a fixed maximum size. + """ + + def __init__(self, num_embeddings: int, embedding_dim: int): + super().__init__(num_embeddings, embedding_dim) + + def forward( + self, input_ids_shape: torch.Size, past_key_values_length: int = 0, position_ids: Optional[torch.Tensor] = None + ): + """`input_ids' shape is expected to be [bsz x seqlen].""" + + if position_ids is None: + bsz, seq_len = input_ids_shape[:2] + position_ids = torch.arange( + past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device + ) + return super().forward(position_ids) + + +# Copied from transformers.models.bart.modeling_bart.BartScaledWordEmbedding with Bart->BigBirdPegasus +class BigBirdPegasusScaledWordEmbedding(nn.Embedding): + """ + This module overrides nn.Embeddings' forward by multiplying with embeddings scale. 
+ """ + + def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, embed_scale: Optional[float] = 1.0): + super().__init__(num_embeddings, embedding_dim, padding_idx) + self.embed_scale = embed_scale + + def forward(self, input_ids: torch.Tensor): + return super().forward(input_ids) * self.embed_scale + + +# Copied from transformers.models.big_bird.modeling_big_bird.BigBirdSelfAttention with BigBird->BigBirdPegasus +class BigBirdPegasusSelfAttention(nn.Module): + def __init__(self, config, layer_idx=None): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.use_bias) + self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.use_bias) + self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.use_bias) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.is_decoder = config.is_decoder + self.layer_idx = layer_idx + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + output_attentions=False, + cache_position=None, + ): + batch_size, seq_length, _ = hidden_states.shape + query_layer = ( + self.query(hidden_states) + .view(batch_size, -1, self.num_attention_heads, self.attention_head_size) + .transpose(1, 2) + ) + + is_cross_attention = encoder_hidden_states is not None + current_states = encoder_hidden_states if is_cross_attention else hidden_states + attention_mask = encoder_attention_mask if is_cross_attention else attention_mask + if is_cross_attention and past_key_values is not None and past_key_values.get_seq_length(self.layer_idx) > 0: + # reuse k,v, cross_attentions + key_layer = past_key_values.layers[self.layer_idx].keys + value_layer = past_key_values.layers[self.layer_idx].values + else: + key_layer = ( + self.key(current_states) + .view(batch_size, -1, self.num_attention_heads, self.attention_head_size) + .transpose(1, 2) + ) + value_layer = ( + self.value(current_states) + .view(batch_size, -1, self.num_attention_heads, self.attention_head_size) + .transpose(1, 2) + ) + + if past_key_values is not None: + # save all key/value_layer to cache to be re-used for fast auto-regressive generation + key_layer, value_layer = past_key_values.update( + key_layer, + value_layer, + self.layer_idx, + ) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BigBirdPegasusModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. 
+ attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + + return context_layer, attention_probs + + +# Copied from transformers.models.big_bird.modeling_big_bird.BigBirdBlockSparseAttention with BigBird->BigBirdPegasus +class BigBirdPegasusBlockSparseAttention(nn.Module): + def __init__(self, config, seed=None): + super().__init__() + + self.max_seqlen = config.max_position_embeddings + self.seed = seed + + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + f"The hidden size {config.hidden_size} is not a multiple of the number of attention " + f"heads {config.num_attention_heads}." + ) + + self.num_attention_heads = config.num_attention_heads + self.num_random_blocks = config.num_random_blocks + self.block_size = config.block_size + + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.use_bias) + self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.use_bias) + self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.use_bias) + + def forward( + self, + hidden_states, + band_mask=None, + from_mask=None, + to_mask=None, + from_blocked_mask=None, + to_blocked_mask=None, + output_attentions=None, + ): + # Currently this `class` can't be used in decoder. 
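+ # Note (illustrative, numbers are not taken from any config): block-sparse attention requires the
+ # (padded) sequence length to be a multiple of `block_size`, which the checks below enforce; for
+ # example, with block_size=64 a 1024-token input is split into 16 query/key blocks.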
+ + batch_size, seqlen, _ = hidden_states.size() + to_seq_length = from_seq_length = seqlen + from_block_size = to_block_size = self.block_size + + if from_seq_length % from_block_size != 0: + raise ValueError("Query sided sequence length must be multiple of block size") + + if to_seq_length % to_block_size != 0: + raise ValueError("Key/Value sided sequence length must be multiple of block size") + + query_layer = ( + self.query(hidden_states) + .view(batch_size, -1, self.num_attention_heads, self.attention_head_size) + .transpose(1, 2) + ) + key_layer = ( + self.key(hidden_states) + .view(batch_size, -1, self.num_attention_heads, self.attention_head_size) + .transpose(1, 2) + ) + value_layer = ( + self.value(hidden_states) + .view(batch_size, -1, self.num_attention_heads, self.attention_head_size) + .transpose(1, 2) + ) + + context_layer, attention_probs = self.bigbird_block_sparse_attention( + query_layer, + key_layer, + value_layer, + band_mask, + from_mask, + to_mask, + from_blocked_mask, + to_blocked_mask, + self.num_attention_heads, + self.num_random_blocks, + self.attention_head_size, + from_block_size, + to_block_size, + batch_size, + from_seq_length, + to_seq_length, + seed=self.seed, + plan_from_length=None, + plan_num_rand_blocks=None, + output_attentions=output_attentions, + ) + + context_layer = context_layer.contiguous().view(batch_size, from_seq_length, -1) + return context_layer, attention_probs + + @staticmethod + def torch_bmm_nd(inp_1, inp_2, ndim=None): + """Fast nd matrix multiplication""" + # faster replacement of torch.einsum ("bhqk,bhkd->bhqd") + return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view( + inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 1]) + ) + + @staticmethod + def torch_bmm_nd_transpose(inp_1, inp_2, ndim=None): + """Fast nd matrix multiplication with transpose""" + # faster replacement of torch.einsum (bhqd,bhkd->bhqk) + return torch.bmm( + inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2) + ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2])) + + def bigbird_block_sparse_attention( + self, + query_layer, + key_layer, + value_layer, + band_mask, + from_mask, + to_mask, + from_blocked_mask, + to_blocked_mask, + n_heads, + n_rand_blocks, + attention_head_size, + from_block_size, + to_block_size, + batch_size, + from_seq_len, + to_seq_len, + seed, + plan_from_length, + plan_num_rand_blocks, + output_attentions, + ): + # BigBirdPegasus block-sparse attention as suggested in paper + + # ITC: + # global tokens: 2 x block_size + # window tokens: 3 x block_size + # random tokens: num_rand_tokens x block_size + + # ETC: + # global tokens: extra_globals_tokens + 2 x block_size + # window tokens: 3 x block_size + # random tokens: num_rand_tokens x block_size + + # Note: + # 1) Currently, ETC is not supported. + # 2) Window size is fixed to 3 blocks & it can be changed only by + # changing `block_size`. + # 3) Number of global blocks are fixed (2 blocks here) & global tokens can be + # controlled only by `block_size`. + + # attention is calculated separately for q[0], q[1], q[2:-2], q[-2], q[-1] in order to use special trick of shifting tokens (for calculating sliding attention) + # hence following code can be divided into 5 parts. 
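+ # Worked example with illustrative numbers (not from any config): from_seq_len=1024 and
+ # block_size=64 give 16 query blocks. q[0] and q[-1] attend to every key (global, parts 1 and 5),
+ # q[1] and q[-2] get the special handling in parts 2 and 4, and each of the 12 middle blocks in
+ # part 3 scores against 2 global key blocks + a 3-block sliding window + `n_rand_blocks` gathered
+ # random blocks, i.e. (5 + n_rand_blocks) * to_block_size key positions per query row.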
+ + if from_seq_len // from_block_size != to_seq_len // to_block_size: + raise ValueError("Error the number of blocks needs to be same!") + + rsqrt_d = 1 / math.sqrt(attention_head_size) + bsz = batch_size + attn_mask_penalty = -10000.0 + + # generate random attention and corresponding masks + np.random.seed(seed) + if from_seq_len in [1024, 3072, 4096]: # old plans used in paper + rand_attn = [ + self._bigbird_block_rand_mask( + self.max_seqlen, self.max_seqlen, from_block_size, to_block_size, n_rand_blocks, last_idx=1024 + )[: (from_seq_len // from_block_size - 2)] + for _ in range(n_heads) + ] + else: + if plan_from_length is None: + plan_from_length, plan_num_rand_blocks = self._get_rand_attn_plan( + from_seq_len, from_block_size, n_rand_blocks + ) + + rand_attn = self._bigbird_block_rand_mask_with_head( + from_seq_length=from_seq_len, + to_seq_length=to_seq_len, + from_block_size=from_block_size, + to_block_size=to_block_size, + num_heads=n_heads, + plan_from_length=plan_from_length, + plan_num_rand_blocks=plan_num_rand_blocks, + ) + + rand_attn = np.stack(rand_attn, axis=0) + rand_attn = torch.tensor(rand_attn, device=query_layer.device, dtype=torch.long) + rand_attn.unsqueeze_(0) + rand_attn = torch.cat([rand_attn for _ in range(batch_size)], dim=0) + + rand_mask = self._create_rand_mask_from_inputs( + from_blocked_mask, to_blocked_mask, rand_attn, n_heads, n_rand_blocks, bsz, from_seq_len, from_block_size + ) + + blocked_query_matrix = query_layer.view(bsz, n_heads, from_seq_len // from_block_size, from_block_size, -1) + blocked_key_matrix = key_layer.view(bsz, n_heads, to_seq_len // to_block_size, to_block_size, -1) + blocked_value_matrix = value_layer.view(bsz, n_heads, to_seq_len // to_block_size, to_block_size, -1) + + # preparing block for randn attn + gathered_key = self.torch_gather_b2(blocked_key_matrix, rand_attn) + gathered_key = gathered_key.view( + bsz, n_heads, to_seq_len // to_block_size - 2, n_rand_blocks * to_block_size, -1 + ) # [bsz, n_heads, to_seq_len//to_block_size-2, n_rand_blocks, to_block_size, -1] + gathered_value = self.torch_gather_b2(blocked_value_matrix, rand_attn) + gathered_value = gathered_value.view( + bsz, n_heads, to_seq_len // to_block_size - 2, n_rand_blocks * to_block_size, -1 + ) # [bsz, n_heads, to_seq_len//to_block_size-2, n_rand_blocks, to_block_size, -1] + + # 1st PART + # 1st block (global block) attention scores + # q[0] x (k[0], k[1], k[2], k[3], k[4] .... 
) + + # [bsz, n_heads, from_block_size, -1] x [bsz, n_heads, to_seq_len, -1] ==> [bsz, n_heads, from_block_size, to_seq_len] + first_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, 0], key_layer, ndim=4) + + first_product = first_product * rsqrt_d + first_product += (1.0 - to_mask) * attn_mask_penalty + first_attn_weights = nn.functional.softmax( + first_product, dim=-1 + ) # [bsz, n_heads, from_block_size, to_seq_len] + + # [bsz, n_heads, from_block_size, to_seq_len] x [bsz, n_heads, to_seq_len, -1] ==> [bsz, n_heads, from_block_size, -1] + first_context_layer = self.torch_bmm_nd(first_attn_weights, value_layer, ndim=4) + first_context_layer.unsqueeze_(2) + + # 2nd PART + # 2nd block attention scores + # q[1] x (sliding_keys, random_keys, global_keys) + # sliding key blocks -> 2nd, 3rd blocks + # global key blocks -> 1st block + + second_key_mat = torch.cat( + [ + blocked_key_matrix[:, :, 0], + blocked_key_matrix[:, :, 1], + blocked_key_matrix[:, :, 2], + blocked_key_matrix[:, :, -1], + gathered_key[:, :, 0], + ], + dim=2, + ) # [bsz, n_heads, (4+n_rand_blocks)*to_block_size, -1] + second_value_mat = torch.cat( + [ + blocked_value_matrix[:, :, 0], + blocked_value_matrix[:, :, 1], + blocked_value_matrix[:, :, 2], + blocked_value_matrix[:, :, -1], + gathered_value[:, :, 0], + ], + dim=2, + ) # [bsz, n_heads, (4+n_rand_blocks)*to_block_size, -1] + + # [bsz, n_heads, from_block_size, -1] x [bsz, n_heads, (4+n_rand_blocks)*to_block_size, -1] ==> [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size] + second_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, 1], second_key_mat, ndim=4) + second_seq_pad = torch.cat( + [ + to_mask[:, :, :, : 3 * to_block_size], + to_mask[:, :, :, -to_block_size:], + to_mask.new_ones([bsz, 1, 1, n_rand_blocks * to_block_size]), + ], + dim=3, + ) + second_rand_pad = torch.cat( + [ + rand_mask.new_ones([bsz, n_heads, from_block_size, 4 * to_block_size]), + rand_mask[:, :, 0], + ], + dim=3, + ) + second_product = second_product * rsqrt_d + second_product += (1.0 - torch.minimum(second_seq_pad, second_rand_pad)) * attn_mask_penalty + second_attn_weights = nn.functional.softmax( + second_product, dim=-1 + ) # [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size] + + # [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size] x [bsz, n_heads, (4+n_rand_blocks)*to_block_size, -1] ==> [bsz, n_heads, from_block_size, -1] + second_context_layer = self.torch_bmm_nd(second_attn_weights, second_value_mat, ndim=4) + + second_context_layer.unsqueeze_(2) + + # 3rd PART + # Middle blocks attention scores + # q[-2:2] x (sliding_keys, random_keys, global_keys) + # sliding attn is calculated using special trick of shifting tokens as discussed in paper + # random keys are generated by taking random indices as per `rand_attn` + # global keys -> 1st & last block + + exp_blocked_key_matrix = torch.cat( + [blocked_key_matrix[:, :, 1:-3], blocked_key_matrix[:, :, 2:-2], blocked_key_matrix[:, :, 3:-1]], dim=3 + ) # [bsz, n_heads, from_seq_len//from_block_size-4, 3*to_block_size, -1] + exp_blocked_value_matrix = torch.cat( + [blocked_value_matrix[:, :, 1:-3], blocked_value_matrix[:, :, 2:-2], blocked_value_matrix[:, :, 3:-1]], + dim=3, + ) # [bsz, n_heads, from_seq_len//from_block_size-4, 3*to_block_size, -1] + middle_query_matrix = blocked_query_matrix[:, :, 2:-2] + + # sliding attention scores for q[-2:2] + # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] x [b, n_heads, from_seq_len//from_block_size-4, 
3*to_block_size, -1] + inner_band_product = self.torch_bmm_nd_transpose(middle_query_matrix, exp_blocked_key_matrix, ndim=5) + # ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, 3*to_block_size] + inner_band_product = inner_band_product * rsqrt_d + + # randn attention scores for q[-2:2] + # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] x [bsz, n_heads, from_seq_len//from_block_size-4, n_rand_blocks*to_block_size, -1] + rand_band_product = self.torch_bmm_nd_transpose(middle_query_matrix, gathered_key[:, :, 1:-1], ndim=5) + # ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, n_rand_blocks*to_block_size] + rand_band_product = rand_band_product * rsqrt_d + + # Including 1st block (since it's global) + first_band_product = torch.einsum( + "bhlqd,bhkd->bhlqk", middle_query_matrix, blocked_key_matrix[:, :, 0] + ) # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] x [bsz, n_heads, to_block_size, -1] ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, to_block_size] + first_band_product = first_band_product * rsqrt_d + + # Including last block (since it's global) + last_band_product = torch.einsum( + "bhlqd,bhkd->bhlqk", middle_query_matrix, blocked_key_matrix[:, :, -1] + ) # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] x [bsz, n_heads, to_block_size, -1] ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, to_block_size] + last_band_product = last_band_product * rsqrt_d + + # masking padded tokens + inner_band_product += (1.0 - band_mask) * attn_mask_penalty + first_band_product += (1.0 - to_mask[:, :, :, :to_block_size].unsqueeze(3)) * attn_mask_penalty + last_band_product += (1.0 - to_mask[:, :, :, -to_block_size:].unsqueeze(3)) * attn_mask_penalty + rand_band_product += (1.0 - rand_mask[:, :, 1:-1]) * attn_mask_penalty + + # completing attention scores matrix for all q[-2:2] + band_product = torch.cat( + [first_band_product, inner_band_product, rand_band_product, last_band_product], dim=-1 + ) # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, (5+n_rand_blocks)*to_block_size] + + # safely doing softmax since attention matrix is completed + attn_weights = nn.functional.softmax( + band_product, dim=-1 + ) # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, (5+n_rand_blocks)*to_block_size] + + # contribution of sliding keys + # [bsz, n_heads, m//from_block_size-4, from_block_size, 3*to_block_size] x [bsz, n_heads, from_seq_len//from_block_size-4, 3*to_block_size, -1] + context_layer = self.torch_bmm_nd( + attn_weights[:, :, :, :, to_block_size : 4 * to_block_size], exp_blocked_value_matrix, ndim=5 + ) + # ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] + + # adding contribution of random keys + # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, n_rand_blocks*to_block_size] x [bsz, n_heads, from_seq_len//from_block_size-4, n_rand_blocks*to_block_size, -1] + context_layer += self.torch_bmm_nd( + attn_weights[:, :, :, :, 4 * to_block_size : -to_block_size], gathered_value[:, :, 1:-1], ndim=5 + ) + # ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] + + # adding contribution of global keys + context_layer += torch.einsum( + "bhlqk,bhkd->bhlqd", attn_weights[:, :, :, :, :to_block_size], blocked_value_matrix[:, :, 0] + ) # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, to_block_size] x [bsz, n_heads, to_block_size, -1] ==> [bsz, n_heads, 
from_seq_len//from_block_size-4, from_block_size, -1] + context_layer += torch.einsum( + "bhlqk,bhkd->bhlqd", attn_weights[:, :, :, :, -to_block_size:], blocked_value_matrix[:, :, -1] + ) # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, to_block_size] x [bsz, n_heads, to_block_size, -1] ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] + + # 4th PART + # last 2nd token attention scores + # q[-2] x (sliding_keys, random_keys, global_keys) + # sliding key blocks -> last 3 blocks + # global key block -> 1st block + # random key block -> based on indices stored in `randn_attn` + + second_last_key_mat = torch.cat( + [ + blocked_key_matrix[:, :, 0], + blocked_key_matrix[:, :, -3], + blocked_key_matrix[:, :, -2], + blocked_key_matrix[:, :, -1], + gathered_key[:, :, -1], + ], + dim=2, + ) # [bsz, n_heads, (4+n_random_blocks)*to_block_size, -1] + second_last_value_mat = torch.cat( + [ + blocked_value_matrix[:, :, 0], + blocked_value_matrix[:, :, -3], + blocked_value_matrix[:, :, -2], + blocked_value_matrix[:, :, -1], + gathered_value[:, :, -1], + ], + dim=2, + ) # [bsz, n_heads, (4+r)*to_block_size, -1] + + # [bsz, n_heads, from_block_size, -1] x [bsz, n_heads, (4+n_rand_blocks)*to_block_size, -1] ==> [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size] + second_last_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, -2], second_last_key_mat, ndim=4) + second_last_seq_pad = torch.cat( + [ + to_mask[:, :, :, :to_block_size], + to_mask[:, :, :, -3 * to_block_size :], + to_mask.new_ones([bsz, 1, 1, n_rand_blocks * to_block_size]), + ], + dim=3, + ) + second_last_rand_pad = torch.cat( + [ + rand_mask.new_ones([bsz, n_heads, from_block_size, 4 * to_block_size]), + rand_mask[:, :, -1], + ], + dim=3, + ) + second_last_product = second_last_product * rsqrt_d + second_last_product += (1.0 - torch.minimum(second_last_seq_pad, second_last_rand_pad)) * attn_mask_penalty + second_last_attn_weights = nn.functional.softmax( + second_last_product, dim=-1 + ) # [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size] + + # [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size] x [bsz, n_heads, (4+n_rand_blocks)*to_block_size, -1] ==> [bsz, n_heads, from_block_size, -1] + second_last_context_layer = self.torch_bmm_nd(second_last_attn_weights, second_last_value_mat, ndim=4) + second_last_context_layer.unsqueeze_(2) + + # 5th PART + # last block (global) attention scores + # q[-1] x (k[0], k[1], k[2], k[3], .... 
) + + # [bsz, n_heads, from_block_size, -1] x [bsz, n_heads, to_seq_len, -1] ==> [bsz, n_heads, from_block_size, to_seq_len] + last_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, -1], key_layer, ndim=4) + last_product = last_product * rsqrt_d + last_product += (1.0 - to_mask) * attn_mask_penalty + last_attn_weights = nn.functional.softmax(last_product, dim=-1) # [bsz, n_heads, from_block_size, n] + + # [bsz, n_heads, from_block_size, to_seq_len] x [bsz, n_heads, to_seq_len, -1] ==> [bsz, n_heads, from_block_size, -1] + last_context_layer = self.torch_bmm_nd(last_attn_weights, value_layer, ndim=4) + last_context_layer.unsqueeze_(2) + + # combining representations of all tokens + context_layer = torch.cat( + [first_context_layer, second_context_layer, context_layer, second_last_context_layer, last_context_layer], + dim=2, + ) + context_layer = context_layer.view((bsz, n_heads, from_seq_len, -1)) * from_mask + context_layer = torch.transpose(context_layer, 1, 2) + + # this is just for visualizing; forward pass doesn't depend on following code + if output_attentions: + # TODO(PVP): need to verify if below code is correct + attention_probs = torch.zeros( + bsz, n_heads, from_seq_len, to_seq_len, dtype=torch.float, device=context_layer.device + ) + + # 1st query block + # corresponding to `first_context_layer` + attention_probs[:, :, :from_block_size, :] = first_attn_weights # all keys global + + # 2nd query block + # corresponding to `second_context_layer` + attention_probs[:, :, from_block_size : 2 * from_block_size, : 3 * to_block_size] = second_attn_weights[ + :, :, :, : 3 * to_block_size + ] # 1st three key blocks (global + sliding) + attention_probs[:, :, from_block_size : 2 * from_block_size, -to_block_size:] = second_attn_weights[ + :, :, :, 3 * to_block_size : 4 * to_block_size + ] # last key block (global) + # random keys + for p1, i1, w1 in zip(range(bsz), rand_attn, second_attn_weights): + # p1, i1, w1 corresponds to batch_dim i.e. following operation is done for each sequence in batch + for p2, i2, w2 in zip(range(n_heads), i1, w1): + # p2, i2, w2 corresponds to head_dim i.e. 
following operation is done for each heads + attn_probs_view = attention_probs.view( + bsz, + n_heads, + from_seq_len // from_block_size, + from_block_size, + to_seq_len // to_block_size, + to_block_size, + ) + right_slice = w2[:, 4 * to_block_size :] + attn_probs_view[p1, p2, 1, :, i2[0]] = right_slice.view( + from_block_size, n_rand_blocks, to_block_size + ) + + # Middle query blocks + # corresponding to `context_layer` + # sliding keys + for q_idx in range(from_seq_len // from_block_size - 4): + attn_probs_view = attention_probs.view( + bsz, + n_heads, + from_seq_len // from_block_size, + from_block_size, + to_seq_len // to_block_size, + to_block_size, + )[:, :, 2:-2, :, 1:-1, :] + right_slice = attn_weights[:, :, q_idx, :, to_block_size : 4 * to_block_size] + attn_probs_view[:, :, q_idx, :, q_idx : q_idx + 3, :] = right_slice.view( + bsz, n_heads, from_block_size, 3, to_block_size + ) # inner_band_product + # global keys (corresponding to 1st key block) + attention_probs[:, :, 2 * from_block_size : -2 * from_block_size, :to_block_size] = attn_weights[ + :, :, :, :, :to_block_size + ].view(bsz, n_heads, -1, to_block_size) # first_band_product + # global keys (corresponding to last key block) + attention_probs[:, :, 2 * from_block_size : -2 * from_block_size, -to_block_size:] = attn_weights[ + :, :, :, :, -to_block_size: + ].view(bsz, n_heads, -1, to_block_size) # last_band_product + # random keys + for p1, i1, w1 in zip(range(bsz), rand_attn, attn_weights): + # p1, i1, w1 corresponds to batch_dim i.e. following operation is done for each sequence in batch + for p2, i2, w2 in zip(range(n_heads), i1, w1): + # p2, i2, w2 corresponds to head_dim i.e. following operation is done for each heads + for q_idx in range(1, len(i2) - 1): + attn_probs_view = attention_probs.view( + bsz, + n_heads, + from_seq_len // from_block_size, + from_block_size, + to_seq_len // to_block_size, + to_block_size, + ) + right_slice = w2[q_idx - 1, :, 4 * to_block_size : -to_block_size] + attn_probs_view[p1, p2, q_idx + 1, :, i2[q_idx]] = right_slice.view( + from_block_size, n_rand_blocks, to_block_size + ) + + # Second-last query block + # corresponding to `second_last_context_layer` + attention_probs[:, :, -2 * from_block_size : -from_block_size, :to_block_size] = second_last_attn_weights[ + :, :, :, :to_block_size + ] # 1st key block (global) + attention_probs[:, :, -2 * from_block_size : -from_block_size, -3 * to_block_size :] = ( + second_last_attn_weights[:, :, :, to_block_size : 4 * to_block_size] + ) # last three blocks (global + sliding) + # random keys + for p1, i1, w1 in zip(range(bsz), rand_attn, second_last_attn_weights): + # p1, i1, w1 corresponds to batch_dim i.e. following operation is done for each sequence in batch + for p2, i2, w2 in zip(range(n_heads), i1, w1): + # p2, i2, w2 corresponds to head_dim i.e. 
following operation is done for each heads + attn_probs_view = attention_probs.view( + bsz, + n_heads, + from_seq_len // from_block_size, + from_block_size, + to_seq_len // to_block_size, + to_block_size, + ) + right_slice = w2[:, 4 * to_block_size :] + attn_probs_view[p1, p2, -2, :, i2[-1]] = right_slice.view( + from_block_size, n_rand_blocks, to_block_size + ) + + # last query block + # corresponding to `last_context_layer` + attention_probs[:, :, -from_block_size:, :] = last_attn_weights # all keys global + + else: + attention_probs = None + + return context_layer, attention_probs + + @staticmethod + def torch_gather_b2(params, indices): + # this operation is equivalent to tf.gather when batch_dims=2 + + if params.shape[:2] != indices.shape[:2]: + raise ValueError( + "Make sure that the first two dimensions of params and indices are identical, but" + f" they are params: {params.shape[:2]} vs. indices: {indices.shape[:2]}" + ) + num_indices_to_gather = indices.shape[-2] * indices.shape[-1] + num_indices_to_pick_from = params.shape[2] + + shift = torch.arange(indices.shape[0] * indices.shape[1] * num_indices_to_gather, device=indices.device) + indices_shift = torch.div(shift, num_indices_to_gather, rounding_mode="floor") * num_indices_to_pick_from + + flattened_indices = indices.view(-1) + indices_shift + flattened_params = params.reshape(-1, params.shape[-2], params.shape[-1]) + + out_flattened = flattened_params.index_select(0, flattened_indices) + + out = out_flattened.reshape(params.shape[:2] + (num_indices_to_gather,) + params.shape[3:]) + return out + + @staticmethod + def _create_rand_mask_from_inputs( + from_blocked_mask, + to_blocked_mask, + rand_attn, + num_attention_heads, + num_rand_blocks, + batch_size, + from_seq_length, + from_block_size, + ): + """ + Create 3D attention mask from a 2D tensor mask. + + Args: + from_blocked_mask: 2D Tensor of shape [batch_size, + from_seq_length//from_block_size, from_block_size]. + to_blocked_mask: int32 Tensor of shape [batch_size, + to_seq_length//to_block_size, to_block_size]. + rand_attn: [batch_size, num_attention_heads, + from_seq_length//from_block_size-2, num_rand_blocks] + num_attention_heads: int. Number of attention heads. + num_rand_blocks: int. Number of random chunks per row. + batch_size: int. Batch size for computation. + from_seq_length: int. length of from sequence. + from_block_size: int. size of block in from sequence. + + Returns: + float Tensor of shape [batch_size, num_attention_heads, from_seq_length//from_block_size-2, + from_block_size, num_rand_blocks*to_block_size]. + """ + num_windows = from_seq_length // from_block_size - 2 + rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)]) + rand_mask = rand_mask.view(batch_size, num_attention_heads, num_windows, num_rand_blocks * from_block_size) + rand_mask = torch.einsum("blq,bhlk->bhlqk", from_blocked_mask[:, 1:-1], rand_mask) + return rand_mask + + @staticmethod + def _get_rand_attn_plan(from_seq_length, from_block_size, num_rand_blocks): + """ + Gives the plan of where to put random attention. + + Args: + from_seq_length: int. length of from sequence. + from_block_size: int. size of block in from sequence. + num_rand_blocks: int. Number of random chunks per row. 
+
+
+        Returns:
+            plan_from_length: list. Ending token position of each plan segment.
+            plan_num_rand_blocks: list. Number of random blocks to place within each plan segment.
+        """
+
+        plan_from_length = []
+        plan_num_rand_blocks = []
+        if (2 * num_rand_blocks + 5) < (from_seq_length // from_block_size):
+            plan_from_length.append(int((2 * num_rand_blocks + 5) * from_block_size))
+            plan_num_rand_blocks.append(num_rand_blocks)
+            plan_from_length.append(from_seq_length)
+            plan_num_rand_blocks.append(0)
+        elif (num_rand_blocks + 5) < (from_seq_length // from_block_size):
+            plan_from_length.append(int((num_rand_blocks + 5) * from_block_size))
+            plan_num_rand_blocks.append(num_rand_blocks // 2)
+            plan_from_length.append(from_seq_length)
+            plan_num_rand_blocks.append(num_rand_blocks - (num_rand_blocks // 2))
+        else:
+            plan_from_length.append(from_seq_length)
+            plan_num_rand_blocks.append(num_rand_blocks)
+
+        return plan_from_length, plan_num_rand_blocks
+
+    def _bigbird_block_rand_mask(
+        self, from_seq_length, to_seq_length, from_block_size, to_block_size, num_rand_blocks, last_idx=-1
+    ):
+        """
+        Create adjacency list of random attention.
+
+        Args:
+            from_seq_length: int. length of from sequence.
+            to_seq_length: int. length of to sequence.
+            from_block_size: int. size of block in from sequence.
+            to_block_size: int. size of block in to sequence.
+            num_rand_blocks: int. Number of random chunks per row.
+            last_idx: if -1 then num_rand_blocks blocks chosen anywhere in to sequence,
+            if positive then num_rand_blocks blocks chosen only up to last_idx.
+
+        Returns:
+            adjacency list of size from_seq_length//from_block_size-2 by num_rand_blocks
+        """
+        # using this method when from_seq_length in [1024, 3072, 4096]
+
+        if from_seq_length // from_block_size != to_seq_length // to_block_size:
+            raise ValueError("The number of blocks needs to be the same!")
+
+        rand_attn = np.zeros((from_seq_length // from_block_size - 2, num_rand_blocks), dtype=np.int32)
+        # During inference (eval) no randomness
+        if not self.training:
+            return rand_attn
+        middle_seq = np.arange(1, to_seq_length // to_block_size - 1, dtype=np.int32)
+        last = to_seq_length // to_block_size - 1
+        if last_idx > (2 * to_block_size):
+            last = (last_idx // to_block_size) - 1
+
+        r = num_rand_blocks  # shorthand
+        for i in range(1, from_seq_length // from_block_size - 1):
+            start = i - 2
+            end = i
+            if i == 1:
+                rand_attn[i - 1, :] = np.random.permutation(middle_seq[2:last])[:r]
+            elif i == 2:
+                rand_attn[i - 1, :] = np.random.permutation(middle_seq[3:last])[:r]
+            elif i == from_seq_length // from_block_size - 3:
+                rand_attn[i - 1, :] = np.random.permutation(middle_seq[:last])[:r]
+            # Missing -3: should have been sliced till last-3
+            elif i == from_seq_length // from_block_size - 2:
+                rand_attn[i - 1, :] = np.random.permutation(middle_seq[:last])[:r]
+            # Missing -4: should have been sliced till last-4
+            else:
+                if start > last:
+                    start = last
+                    rand_attn[i - 1, :] = np.random.permutation(middle_seq[:start])[:r]
+                elif (end + 1) == last:
+                    rand_attn[i - 1, :] = np.random.permutation(middle_seq[:start])[:r]
+                else:
+                    rand_attn[i - 1, :] = np.random.permutation(
+                        np.concatenate((middle_seq[:start], middle_seq[end + 1 : last]))
+                    )[:r]
+        return rand_attn
+
+    def _bigbird_block_rand_mask_with_head(
+        self,
+        from_seq_length,
+        to_seq_length,
+        from_block_size,
+        to_block_size,
+        num_heads,
+        plan_from_length,
+        plan_num_rand_blocks,
+        window_block_left=1,
+        window_block_right=1,
+        global_block_top=1,
+        global_block_bottom=1,
+        global_block_left=1,
+        global_block_right=1,
+    ):
+        """
+        
Create adjacency list of random attention.
+
+        Args:
+            from_seq_length: int. length of from sequence.
+            to_seq_length: int. length of to sequence.
+            from_block_size: int. size of block in from sequence.
+            to_block_size: int. size of block in to sequence.
+            num_heads: int. total number of heads.
+            plan_from_length: list. plan from length where num_random_blocks are chosen from.
+            plan_num_rand_blocks: list. number of rand blocks within the plan.
+            window_block_left: int. number of blocks of window to left of a block.
+            window_block_right: int. number of blocks of window to right of a block.
+            global_block_top: int. number of blocks at the top.
+            global_block_bottom: int. number of blocks at the bottom.
+            global_block_left: int. Number of blocks globally used to the left.
+            global_block_right: int. Number of blocks globally used to the right.
+
+        Returns:
+            adjacency list of size num_head where each element is of size from_seq_length//from_block_size-2 by
+            num_rand_blocks
+        """
+        # using this method when from_seq_length not in [1024, 3072, 4096]
+
+        if from_seq_length // from_block_size != to_seq_length // to_block_size:
+            raise ValueError("The number of blocks needs to be the same!")
+
+        if from_seq_length not in plan_from_length:
+            raise ValueError("The from sequence length is not in the plan!")
+
+        # Total number of blocks in the mask
+        num_blocks = from_seq_length // from_block_size
+        # Number of blocks per plan
+        plan_block_length = np.array(plan_from_length) // from_block_size
+        # till when to follow plan
+        max_plan_idx = plan_from_length.index(from_seq_length)
+
+        # Random Attention adjacency list
+        rand_attn = [
+            np.zeros((num_blocks, np.sum(plan_num_rand_blocks[: max_plan_idx + 1])), dtype=np.int32)
+            for i in range(num_heads)
+        ]
+        # During inference (eval) no randomness
+        if not self.training:
+            for nh in range(num_heads):
+                rand_attn[nh] = rand_attn[nh][global_block_top : num_blocks - global_block_bottom, :]
+            return rand_attn
+
+        # We will go iteratively over the plan blocks and pick a random number of
+        # attention blocks from the legally allowed blocks
+        for plan_idx in range(max_plan_idx + 1):
+            rnd_r_cnt = 0
+            if plan_idx > 0:
+                # set the row for all from_blocks starting from 0 to
+                # plan_block_length[plan_idx-1]
+                # column indices start from plan_block_length[plan_idx-1] and end at
+                # plan_block_length[plan_idx]
+                if plan_num_rand_blocks[plan_idx] > 0:
+                    rnd_r_cnt = int(np.sum(plan_num_rand_blocks[:plan_idx]))
+                    curr_r_cnt = int(np.sum(plan_num_rand_blocks[: plan_idx + 1]))
+                    for blk_rw_idx in range(global_block_top, plan_block_length[plan_idx - 1]):
+                        for h in range(num_heads):
+                            rand_attn[h][blk_rw_idx, rnd_r_cnt:curr_r_cnt] = self._get_single_block_row_attention(
+                                block_id=blk_rw_idx,
+                                to_start_block_id=plan_block_length[plan_idx - 1],
+                                to_end_block_id=plan_block_length[plan_idx],
+                                num_rand_blocks=plan_num_rand_blocks[plan_idx],
+                                window_block_left=window_block_left,
+                                window_block_right=window_block_right,
+                                global_block_left=global_block_left,
+                                global_block_right=global_block_right,
+                            )
+
+                for pl_id in range(plan_idx):
+                    if plan_num_rand_blocks[pl_id] == 0:
+                        continue
+                    for blk_rw_idx in range(plan_block_length[plan_idx - 1], plan_block_length[plan_idx]):
+                        rnd_r_cnt = 0
+                        to_start_block_id = 0
+                        if pl_id > 0:
+                            rnd_r_cnt = int(np.sum(plan_num_rand_blocks[:pl_id]))
+                            to_start_block_id = plan_block_length[pl_id - 1]
+                        curr_r_cnt = int(np.sum(plan_num_rand_blocks[: pl_id + 1]))
+                        for h in range(num_heads):
+                            rand_attn[h][blk_rw_idx, rnd_r_cnt:curr_r_cnt] = 
self._get_single_block_row_attention( + block_id=blk_rw_idx, + to_start_block_id=to_start_block_id, + to_end_block_id=plan_block_length[pl_id], + num_rand_blocks=plan_num_rand_blocks[pl_id], + window_block_left=window_block_left, + window_block_right=window_block_right, + global_block_left=global_block_left, + global_block_right=global_block_right, + ) + + if plan_num_rand_blocks[plan_idx] == 0: + continue + curr_r_cnt = int(np.sum(plan_num_rand_blocks[: plan_idx + 1])) + from_start_block_id = global_block_top + to_start_block_id = 0 + if plan_idx > 0: + rnd_r_cnt = int(np.sum(plan_num_rand_blocks[:plan_idx])) + from_start_block_id = plan_block_length[plan_idx - 1] + to_start_block_id = plan_block_length[plan_idx - 1] + + for blk_rw_idx in range(from_start_block_id, plan_block_length[plan_idx]): + for h in range(num_heads): + rand_attn[h][blk_rw_idx, rnd_r_cnt:curr_r_cnt] = self._get_single_block_row_attention( + block_id=blk_rw_idx, + to_start_block_id=to_start_block_id, + to_end_block_id=plan_block_length[plan_idx], + num_rand_blocks=plan_num_rand_blocks[plan_idx], + window_block_left=window_block_left, + window_block_right=window_block_right, + global_block_left=global_block_left, + global_block_right=global_block_right, + ) + + for nh in range(num_heads): + rand_attn[nh] = rand_attn[nh][global_block_top : num_blocks - global_block_bottom, :] + + return rand_attn + + @staticmethod + def _get_single_block_row_attention( + block_id, + to_start_block_id, + to_end_block_id, + num_rand_blocks, + window_block_left=1, + window_block_right=1, + global_block_left=1, + global_block_right=1, + ): + """ + For a single row block get random row attention. + + Args: + block_id: int. block id of row. + to_start_block_id: int. random attention column start id. + to_end_block_id: int. random attention column end id. + num_rand_blocks: int. number of random blocks to be selected. + window_block_left: int. number of blocks of window to left of a block. + window_block_right: int. number of blocks of window to right of a block. + global_block_left: int. Number of blocks globally used to the left. + global_block_right: int. Number of blocks globally used to the right. + + Returns: + row containing the random attention vector of size num_rand_blocks. 
+ """ + # list of to_blocks from which to choose random attention + to_block_list = np.arange(to_start_block_id, to_end_block_id, dtype=np.int32) + # permute the blocks + perm_block = np.random.permutation(to_block_list) + + # illegal blocks for the current block id, using window + illegal_blocks = list(range(block_id - window_block_left, block_id + window_block_right + 1)) + + # Add blocks at the start and at the end + illegal_blocks.extend(list(range(global_block_left))) + illegal_blocks.extend(list(range(to_end_block_id - global_block_right, to_end_block_id))) + + # The second from_block cannot choose random attention on second last to_block + if block_id == 1: + illegal_blocks.append(to_end_block_id - 2) + + # The second last from_block cannot choose random attention on second to_block + if block_id == to_end_block_id - 2: + illegal_blocks.append(1) + + selected_random_blocks = [] + + for i in range(to_end_block_id - to_start_block_id): + if perm_block[i] not in illegal_blocks: + selected_random_blocks.append(perm_block[i]) + if len(selected_random_blocks) == num_rand_blocks: + break + return np.array(selected_random_blocks, dtype=np.int32) + + +class BigBirdPegasusEncoderAttention(nn.Module): + def __init__(self, config, seed=None): + super().__init__() + self.config = config + self.seed = seed + + self.attention_type = config.attention_type + + if self.attention_type == "original_full": + self.self = BigBirdPegasusSelfAttention(config) + elif self.attention_type == "block_sparse": + self.self = BigBirdPegasusBlockSparseAttention(config, seed) + else: + raise ValueError( + f"attention_type can either be original_full or block_sparse, but is {self.config.attention_type}" + ) + + self.output = nn.Linear(config.hidden_size, config.hidden_size, bias=config.use_bias) + + def set_attention_type(self, value: str): + if value not in ["original_full", "block_sparse"]: + raise ValueError( + f"attention_type can only be set to either 'original_full' or 'block_sparse', but is {value}" + ) + # attention type is already correctly set + if value == self.attention_type: + return + + if value == "original_full": + # copy all weights to new full attention class + attn_weights = BigBirdPegasusSelfAttention(self.config) + else: + # copy all weights to new sparse attention class + attn_weights = BigBirdPegasusBlockSparseAttention(self.config, self.seed) + + attn_weights.query = self.self.query + attn_weights.value = self.self.value + attn_weights.key = self.self.key + self.self = attn_weights + self.attention_type = value + + if not self.training: + self.self.eval() + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + output_attentions=False, + band_mask=None, + from_mask=None, + to_mask=None, + from_blocked_mask=None, + to_blocked_mask=None, + ): + # Expand dims to enable multiplication in the self-attention module + head_mask = head_mask.reshape(1, -1, 1, 1) if head_mask is not None else None + + if self.attention_type == "original_full": + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + ) + else: + self_outputs = self.self( + hidden_states, band_mask, from_mask, to_mask, from_blocked_mask, to_blocked_mask, output_attentions + ) + + attention_output = self.output(self_outputs[0]) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.bart.modeling_bart.eager_attention_forward +def eager_attention_forward( + module: 
nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: Optional[float] = None, + dropout: float = 0.0, + head_mask: Optional[torch.Tensor] = None, + **kwargs, +): + if scaling is None: + scaling = query.size(-1) ** -0.5 + + attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling + if attention_mask is not None: + attn_weights = attn_weights + attention_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if head_mask is not None: + attn_weights = attn_weights * head_mask.view(1, -1, 1, 1) + + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + attn_output = torch.matmul(attn_weights, value) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +# Copied from transformers.models.bart.modeling_bart.BartAttention with BartConfig->BigBirdPegasusConfig, Bart->BigBirdPegasusDecoder +class BigBirdPegasusDecoderAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + is_decoder: bool = False, + bias: bool = True, + is_causal: bool = False, + config: Optional[BigBirdPegasusConfig] = None, + layer_idx: Optional[int] = None, + ): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + self.config = config + + if (self.head_dim * num_heads) != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" + f" and `num_heads`: {num_heads})." + ) + self.scaling = self.head_dim**-0.5 + self.is_decoder = is_decoder + self.is_causal = is_causal + self.layer_idx = layer_idx + if layer_idx is None and self.is_decoder: + logger.warning_once( + f"Instantiating a decoder {self.__class__.__name__} without passing `layer_idx` is not recommended and " + "will lead to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." 
+ ) + + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + key_value_states: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + cache_position: Optional[torch.Tensor] = None, + # TODO: we need a refactor so that the different attention modules can get their specific kwargs + # ATM, we have mixed things encoder, decoder, and encoder-decoder attn + **kwargs: Unpack[FlashAttentionKwargs], + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + + # determine input shapes + bsz, tgt_len = hidden_states.shape[:-1] + src_len = key_value_states.shape[1] if is_cross_attention else tgt_len + + q_input_shape = (bsz, tgt_len, -1, self.head_dim) + kv_input_shape = (bsz, src_len, -1, self.head_dim) + + # get query proj + query_states = self.q_proj(hidden_states).view(*q_input_shape).transpose(1, 2) + + is_updated = False + if past_key_values is not None: + if isinstance(past_key_values, EncoderDecoderCache): + is_updated = past_key_values.is_updated.get(self.layer_idx) + if is_cross_attention: + # after the first generated id, we can subsequently re-use all key/value_states from cache + curr_past_key_value = past_key_values.cross_attention_cache + else: + curr_past_key_value = past_key_values.self_attention_cache + else: + curr_past_key_value = past_key_values + + current_states = key_value_states if is_cross_attention else hidden_states + if is_cross_attention and past_key_values is not None and is_updated: + # reuse k,v, cross_attentions + key_states = curr_past_key_value.layers[self.layer_idx].keys + value_states = curr_past_key_value.layers[self.layer_idx].values + else: + key_states = self.k_proj(current_states) + value_states = self.v_proj(current_states) + key_states = key_states.view(*kv_input_shape).transpose(1, 2) + value_states = value_states.view(*kv_input_shape).transpose(1, 2) + + if past_key_values is not None: + # save all key/value_states to cache to be re-used for fast auto-regressive generation + cache_position = cache_position if not is_cross_attention else None + key_states, value_states = curr_past_key_value.update( + key_states, value_states, self.layer_idx, {"cache_position": cache_position} + ) + # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls + if is_cross_attention and isinstance(past_key_values, EncoderDecoderCache): + past_key_values.is_updated[self.layer_idx] = True + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=0.0 if not self.training else self.dropout, + scaling=self.scaling, + output_attentions=output_attentions, + head_mask=layer_head_mask, + 
**kwargs, + ) + + attn_output = attn_output.reshape(bsz, tgt_len, -1).contiguous() + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights + + +class BigBirdPegasusEncoderLayer(GradientCheckpointingLayer): + def __init__(self, config: BigBirdPegasusConfig, seed=None): + super().__init__() + self.attention_type = config.attention_type + self.embed_dim = config.d_model + self.self_attn = BigBirdPegasusEncoderAttention(config, seed=seed) + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim) + self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + layer_head_mask: torch.Tensor, + band_mask=None, + from_mask=None, + to_mask=None, + from_blocked_mask=None, + to_blocked_mask=None, + output_attentions: bool = False, + ): + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + self_attention_outputs = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + head_mask=layer_head_mask, + output_attentions=output_attentions, + band_mask=band_mask, + from_mask=from_mask, + to_mask=to_mask, + from_blocked_mask=from_blocked_mask, + to_blocked_mask=to_blocked_mask, + ) + hidden_states = self_attention_outputs[0] + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.final_layer_norm(hidden_states) + hidden_states = self.activation_fn(self.fc1(hidden_states)) + + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + + if hidden_states.dtype == torch.float16 and ( + torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any() + ): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attention_outputs[1],) + + return outputs + + def set_attention_type(self, value: str): + if value not in ["original_full", "block_sparse"]: + raise ValueError( + f"attention_type can only be set to either 'original_full' or 'block_sparse', but is {value}" + ) + # attention type is already correctly set + if value == self.attention_type: + return + self.attention_type = value + self.self_attn.set_attention_type(value) + + +class BigBirdPegasusDecoderLayer(GradientCheckpointingLayer): + def __init__(self, config: BigBirdPegasusConfig, layer_idx: Optional[int] = None): + super().__init__() + self.embed_dim = config.d_model + self.self_attn = BigBirdPegasusDecoderAttention( + embed_dim=self.embed_dim, + 
num_heads=config.decoder_attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + bias=config.use_bias, + config=config, + layer_idx=layer_idx, + ) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.encoder_attn = BigBirdPegasusDecoderAttention( + self.embed_dim, + config.decoder_attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + bias=config.use_bias, + config=config, + layer_idx=layer_idx, + ) + self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) + self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + # Copied from transformers.models.mbart.modeling_mbart.MBartDecoderLayer.forward + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + cross_attn_layer_head_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = True, + cache_position: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + encoder_hidden_states (`torch.FloatTensor`): + cross attention input to the layer of shape `(batch, seq_len, embed_dim)` + encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size + `(encoder_attention_heads,)`. + cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of + size `(decoder_attention_heads,)`. + past_key_values (`Cache`): cached past key and value projection states + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. It is used to update the + cache in the correct position and to infer the complete sequence length. 
+ """ + residual = hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights = self.self_attn( + hidden_states=hidden_states, + past_key_values=past_key_values, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + output_attentions=output_attentions, + cache_position=cache_position, + ) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + + # Cross-Attention Block + cross_attn_weights = None + if encoder_hidden_states is not None: + residual = hidden_states + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + hidden_states, cross_attn_weights = self.encoder_attn( + hidden_states=hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + layer_head_mask=cross_attn_layer_head_mask, + past_key_values=past_key_values, + output_attentions=output_attentions, + ) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.final_layer_norm(hidden_states) + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights, cross_attn_weights) + + return outputs + + +# Copied from transformers.models.bart.modeling_bart.BartClassificationHead with Bart->BigBirdPegasus +class BigBirdPegasusClassificationHead(nn.Module): + """Head for sentence-level classification tasks.""" + + def __init__( + self, + input_dim: int, + inner_dim: int, + num_classes: int, + pooler_dropout: float, + ): + super().__init__() + self.dense = nn.Linear(input_dim, inner_dim) + self.dropout = nn.Dropout(p=pooler_dropout) + self.out_proj = nn.Linear(inner_dim, num_classes) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dropout(hidden_states) + hidden_states = self.dense(hidden_states) + hidden_states = torch.tanh(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.out_proj(hidden_states) + return hidden_states + + +@auto_docstring +class BigBirdPegasusPreTrainedModel(PreTrainedModel): + config: BigBirdPegasusConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["BigBirdPegasusEncoderLayer", "BigBirdPegasusDecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_param_buffer_assignment = False + + _can_compile_fullgraph = True + + def _init_weights(self, module): + std = self.config.init_std + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.weight.data.fill_(1.0) + module.bias.data.zero_() + + @property + def dummy_inputs(self): + pad_token = self.config.pad_token_id + input_ids = torch.tensor([[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]], 
device=self.device)
+        dummy_inputs = {
+            "attention_mask": input_ids.ne(pad_token),
+            "input_ids": input_ids,
+        }
+        return dummy_inputs
+
+    # Copied from transformers.models.bart.modeling_bart.BartPreTrainedModel._update_causal_mask
+    def _update_causal_mask(
+        self,
+        attention_mask: Optional[Union[torch.Tensor, "BlockMask"]],
+        input_tensor: torch.Tensor,
+        cache_position: torch.Tensor,
+        past_key_values: Cache,
+    ):
+        if self.config._attn_implementation == "flex_attention":
+            if isinstance(attention_mask, torch.Tensor):
+                attention_mask = make_flex_block_causal_mask(attention_mask)
+            # Other attention flavors support in-built causal (when `mask is None`)
+            # while we need to create our specific block mask regardless
+            elif attention_mask is None:
+                attention_mask = make_flex_block_causal_mask(
+                    torch.ones(
+                        size=(input_tensor.shape[0], input_tensor.shape[1]),
+                        device=input_tensor.device,
+                    )
+                )
+            return attention_mask
+
+        if self.config._attn_implementation == "flash_attention_2":
+            if attention_mask is not None and (attention_mask == 0.0).any():
+                return attention_mask
+            return None
+
+        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
+        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
+        # to infer the attention mask.
+        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+        using_compilable_cache = past_key_values.is_compileable if past_key_values is not None else False
+
+        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
+        if self.config._attn_implementation == "sdpa" and not using_compilable_cache:
+            if AttentionMaskConverter._ignore_causal_mask_sdpa(
+                attention_mask,
+                inputs_embeds=input_tensor,
+                past_key_values_length=past_seen_tokens,
+                is_training=self.training,
+            ):
+                return None
+
+        dtype = input_tensor.dtype
+        sequence_length = input_tensor.shape[1]
+        if using_compilable_cache:
+            target_length = past_key_values.get_max_cache_shape()
+        else:
+            target_length = (
+                attention_mask.shape[-1]
+                if isinstance(attention_mask, torch.Tensor)
+                else past_seen_tokens + sequence_length + 1
+            )
+
+        # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
+        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
+            attention_mask,
+            sequence_length=sequence_length,
+            target_length=target_length,
+            dtype=dtype,
+            cache_position=cache_position,
+            batch_size=input_tensor.shape[0],
+        )
+
+        if (
+            self.config._attn_implementation == "sdpa"
+            and attention_mask is not None
+            and attention_mask.device.type in ["cuda", "xpu", "npu"]
+        ):
+            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
+            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. 
+ # Details: https://github.com/pytorch/pytorch/issues/110213 + min_dtype = torch.finfo(dtype).min + causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) + + return causal_mask + + @staticmethod + # Copied from transformers.models.gptj.modeling_gptj.GPTJModel._prepare_4d_causal_attention_mask_with_cache_position + def _prepare_4d_causal_attention_mask_with_cache_position( + attention_mask: torch.Tensor, + sequence_length: int, + target_length: int, + dtype: torch.dtype, + cache_position: torch.Tensor, + batch_size: int, + **kwargs, + ): + """ + Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape + `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. + + Args: + attention_mask (`torch.Tensor`): + A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape + `(batch_size, 1, query_length, key_value_length)`. + sequence_length (`int`): + The sequence length being processed. + target_length (`int`): + The target length: when generating with static cache, the mask should be as long as the static cache, + to account for the 0 padding, the part of the cache that is not filled yet. + dtype (`torch.dtype`): + The dtype to use for the 4D attention mask. + cache_position (`torch.Tensor`): + Indices depicting the position of the input sequence tokens in the sequence. + batch_size (`torch.Tensor`): + Batch size. + """ + if attention_mask is not None and attention_mask.dim() == 4: + # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. + causal_mask = attention_mask + else: + min_dtype = torch.finfo(dtype).min + causal_mask = torch.full( + (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device + ) + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to( + causal_mask.device + ) + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + + return causal_mask + + # Copied from transformers.models.bart.modeling_bart.BartPreTrainedModel._update_cross_attn_mask + def _update_cross_attn_mask( + self, + encoder_hidden_states: Union[torch.Tensor, None], + encoder_attention_mask: Union[torch.Tensor, None], + input_shape: torch.Size, + inputs_embeds: torch.Tensor, + ): + # expand encoder attention mask + if encoder_hidden_states is not None and encoder_attention_mask is not None: + if self.config._attn_implementation == "flash_attention_2": + encoder_attention_mask = encoder_attention_mask if 0 in encoder_attention_mask else None + elif self.config._attn_implementation == "sdpa": + # output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on + # the manual implementation that requires a 4D causal mask in all cases. 
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + encoder_attention_mask = _prepare_4d_attention_mask_for_sdpa( + encoder_attention_mask, + inputs_embeds.dtype, + tgt_len=input_shape[-1], + ) + elif self.config._attn_implementation == "flex_attention": + if isinstance(encoder_attention_mask, torch.Tensor): + encoder_attention_mask = make_flex_block_causal_mask( + encoder_attention_mask, + query_length=input_shape[-1], + is_causal=False, + ) + else: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + encoder_attention_mask = _prepare_4d_attention_mask( + encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1] + ) + + return encoder_attention_mask + + +class BigBirdPegasusEncoder(BigBirdPegasusPreTrainedModel): + """ + Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a + [`BigBirdPegasusEncoderLayer`]. + + Args: + config: BigBirdPegasusConfig + embed_tokens (nn.Embedding): output embedding + """ + + def __init__(self, config: BigBirdPegasusConfig, embed_tokens: Optional[nn.Embedding] = None): + super().__init__(config) + + self.attention_type = config.attention_type + self.block_size = config.block_size + + self.dropout = config.dropout + self.layerdrop = config.encoder_layerdrop + + embed_dim = config.d_model + self.padding_idx = config.pad_token_id + self.max_source_positions = config.max_position_embeddings + embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0 + + self.embed_tokens = BigBirdPegasusScaledWordEmbedding( + config.vocab_size, embed_dim, self.padding_idx, embed_scale=embed_scale + ) + + if embed_tokens is not None: + self.embed_tokens.weight = embed_tokens.weight + + self.embed_positions = BigBirdPegasusLearnedPositionalEmbedding( + config.max_position_embeddings, + embed_dim, + ) + self.layers = nn.ModuleList([BigBirdPegasusEncoderLayer(config, seed=i) for i in range(config.encoder_layers)]) + self.layernorm_embedding = nn.LayerNorm(embed_dim) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. 
+ output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + embed_pos = self.embed_positions(input_shape) + + hidden_states = inputs_embeds + embed_pos + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + if attention_mask is None: + attention_mask = torch.ones(input_shape, device=hidden_states.device) + attention_mask = attention_mask.long() + + # in order to use block_sparse attention, sequence_length has to be at least + # bigger than all global attentions: 2 * block_size + # + sliding tokens: 3 * block_size + # + random tokens: 2 * num_random_blocks * block_size + max_tokens_to_attend = (5 + 2 * self.config.num_random_blocks) * self.config.block_size + if self.attention_type == "block_sparse" and input_shape[1] <= max_tokens_to_attend: + # change attention_type from block_sparse to original_full + sequence_length = input_shape[1] + logger.warning( + "Attention type 'block_sparse' is not possible if sequence_length: " + f"{sequence_length} <= num global tokens: 2 * config.block_size " + "+ min. num sliding tokens: 3 * config.block_size " + "+ config.num_random_blocks * config.block_size " + "+ additional buffer: config.num_random_blocks * config.block_size " + f"= {max_tokens_to_attend} with config.block_size " + f"= {self.config.block_size}, config.num_random_blocks " + f"= {self.config.num_random_blocks}. " + "Changing attention type to 'original_full'..." 
+ ) + self.set_attention_type("original_full") + + if self.attention_type == "block_sparse": + padding_len, hidden_states, attention_mask = self._pad_to_block_size(hidden_states, attention_mask) + else: + padding_len = 0 + + # expand attention_mask + if self.attention_type == "original_full": + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype) + blocked_encoder_mask = band_mask = from_mask = to_mask = None + elif self.attention_type == "block_sparse": + blocked_encoder_mask, band_mask, from_mask, to_mask = self.create_masks_for_block_sparse_attn( + attention_mask, self.block_size + ) + attention_mask = None + else: + raise ValueError( + f"attention_type can either be original_full or block_sparse, but is {self.attention_type}" + ) + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + # check if head_mask has a correct number of layers specified if desired + if head_mask is not None: + if head_mask.size()[0] != len(self.layers): + raise ValueError( + f"The head_mask should be specified for {len(self.layers)} layers, but it is for" + f" {head_mask.size()[0]}." + ) + + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description) + to_drop = False + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: # skip the layer + to_drop = True + + if to_drop: + layer_outputs = (None, None) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + layer_head_mask=(head_mask[idx] if head_mask is not None else None), + band_mask=band_mask, + from_mask=from_mask, + to_mask=to_mask, + from_blocked_mask=blocked_encoder_mask, + to_blocked_mask=blocked_encoder_mask, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + hidden_states = self.layernorm_embedding(hidden_states) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if padding_len > 0: + # unpad `sequence_output` because the calling function is expecting a length == input_ids.size(1) + hidden_states = hidden_states[:, :-padding_len] + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + def set_attention_type(self, value: str): + if value not in ["original_full", "block_sparse"]: + raise ValueError( + f"attention_type can only be set to either 'original_full' or 'block_sparse', but is {value}" + ) + # attention type is already correctly set + if value == self.attention_type: + return + self.attention_type = value + for layer in self.layers: + layer.set_attention_type(value) + + @staticmethod # Copied from transformers.models.big_bird.modeling_big_bird.BigBirdModel.create_masks_for_block_sparse_attn + def create_masks_for_block_sparse_attn(attention_mask: torch.Tensor, block_size: int): + batch_size, seq_length = attention_mask.size() + if seq_length % block_size != 0: + raise ValueError( + f"Sequence length must be multiple of block size, but sequence length is {seq_length}, while block" + f" size is {block_size}." 
+ ) + + def create_band_mask_from_inputs(from_blocked_mask, to_blocked_mask): + """ + Create 3D attention mask from a 2D tensor mask. + + Args: + from_blocked_mask: 2D Tensor of shape [batch_size, + from_seq_length//from_block_size, from_block_size]. + to_blocked_mask: int32 Tensor of shape [batch_size, + to_seq_length//to_block_size, to_block_size]. + + Returns: + float Tensor of shape [batch_size, 1, from_seq_length//from_block_size-4, from_block_size, + 3*to_block_size]. + """ + exp_blocked_to_pad = torch.cat( + [to_blocked_mask[:, 1:-3], to_blocked_mask[:, 2:-2], to_blocked_mask[:, 3:-1]], dim=2 + ) + band_mask = torch.einsum("blq,blk->blqk", from_blocked_mask[:, 2:-2], exp_blocked_to_pad) + band_mask.unsqueeze_(1) + return band_mask + + blocked_encoder_mask = attention_mask.view(batch_size, seq_length // block_size, block_size) + band_mask = create_band_mask_from_inputs(blocked_encoder_mask, blocked_encoder_mask) + + from_mask = attention_mask.view(batch_size, 1, seq_length, 1) + to_mask = attention_mask.view(batch_size, 1, 1, seq_length) + + return blocked_encoder_mask, band_mask, from_mask, to_mask + + def _pad_to_block_size(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor): + """A helper function to pad tokens and mask to work with implementation of BigBird block-sparse attention.""" + # padding + block_size = self.config.block_size + batch_size, seq_len = hidden_states.shape[:2] + + padding_len = (block_size - seq_len % block_size) % block_size + if padding_len > 0: + logger.warning_once( + f"Input ids are automatically padded from {seq_len} to {seq_len + padding_len} to be a multiple of " + f"`config.block_size`: {block_size}" + ) + pad_id = self.config.pad_token_id + device = hidden_states.device + input_ids_padding = torch.ones((batch_size, padding_len), dtype=torch.long, device=device) * pad_id + inputs_embeds_padding = self.embed_tokens(input_ids_padding) + hidden_states = torch.cat([hidden_states, inputs_embeds_padding], dim=-2) + + attention_mask = nn.functional.pad( + attention_mask, (0, padding_len), value=0 + ) # no attention on the padding tokens + + return padding_len, hidden_states, attention_mask + + +class BigBirdPegasusDecoder(BigBirdPegasusPreTrainedModel): + """ + Transformer decoder consisting of *config.decoder_layers* layers. 
Each layer is a [`BigBirdPegasusDecoderLayer`] + + Args: + config: BigBirdPegasusConfig + embed_tokens (nn.Embedding): output embedding + """ + + def __init__(self, config: BigBirdPegasusConfig, embed_tokens: Optional[nn.Embedding] = None): + super().__init__(config) + self.dropout = config.dropout + self.layerdrop = config.decoder_layerdrop + self.padding_idx = config.pad_token_id + self.max_target_positions = config.max_position_embeddings + embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 + + self.embed_tokens = BigBirdPegasusScaledWordEmbedding( + config.vocab_size, config.d_model, self.padding_idx, embed_scale=embed_scale + ) + + if embed_tokens is not None: + self.embed_tokens.weight = embed_tokens.weight + + self.embed_positions = BigBirdPegasusLearnedPositionalEmbedding( + config.max_position_embeddings, + config.d_model, + ) + self.layers = nn.ModuleList( + [BigBirdPegasusDecoderLayer(config, layer_idx=i) for i in range(config.decoder_layers)] + ) + self.layernorm_embedding = nn.LayerNorm(config.d_model) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.Tensor] = None, + ): + r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + of the decoder. + encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*): + Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values + selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. 
+ + cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules in decoder to avoid performing + cross-attention on hidden heads. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache). + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the + cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those + that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of + all `decoder_input_ids` of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. It is used to update the + cache in the correct position and to infer the complete sequence length. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+                )
+                use_cache = False
+
+        # retrieve input_ids and inputs_embeds
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
+        elif input_ids is not None:
+            input_shape = input_ids.size()
+            input_ids = input_ids.view(-1, input_shape[-1])
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+        else:
+            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        # initialize `past_key_values`
+        if use_cache and past_key_values is None:
+            past_key_values = (
+                EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config))
+                if encoder_hidden_states is not None
+                else DynamicCache(config=self.config)
+            )
+        if use_cache and isinstance(past_key_values, tuple):
+            logger.warning_once(
+                "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. "
+                "You should pass an instance of `EncoderDecoderCache` instead, e.g. "
+                "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`."
+            )
+            past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values)
+
+        batch_size, seq_length = inputs_embeds.size()[:-1]
+        past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0
+        if cache_position is None:
+            cache_position = torch.arange(
+                past_key_values_length, past_key_values_length + seq_length, device=inputs_embeds.device
+            )
+
+        if attention_mask is None and not is_torchdynamo_compiling():
+            # required mask seq length can be calculated via length of past cache
+            mask_seq_length = past_key_values_length + seq_length
+            attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device)
+
+        self_attn_cache = (
+            past_key_values.self_attention_cache
+            if isinstance(past_key_values, EncoderDecoderCache)
+            else past_key_values
+        )
+
+        attention_mask = self._update_causal_mask(
+            attention_mask,
+            inputs_embeds,
+            cache_position,
+            self_attn_cache,
+        )
+        encoder_attention_mask = self._update_cross_attn_mask(
+            encoder_hidden_states,
+            encoder_attention_mask,
+            input_shape,
+            inputs_embeds,
+        )
+
+        # embed positions
+        positions = self.embed_positions(input_shape, past_key_values_length, position_ids=cache_position)
+        positions = positions.to(inputs_embeds.device)
+
+        hidden_states = inputs_embeds + positions
+
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
+
+        # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
+        for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
+            if attn_mask is not None:
+                if attn_mask.size()[0] != len(self.layers):
+                    raise ValueError(
+                        f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
+                        f" {attn_mask.size()[0]}."
+ ) + for idx, decoder_layer in enumerate(self.layers): + # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description) + if output_hidden_states: + all_hidden_states += (hidden_states,) + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: + continue + + layer_outputs = decoder_layer( + hidden_states, + attention_mask, + encoder_hidden_states, # as a positional argument for gradient checkpointing + encoder_attention_mask=encoder_attention_mask, + layer_head_mask=(head_mask[idx] if head_mask is not None else None), + cross_attn_layer_head_mask=(cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None), + past_key_values=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + ) + hidden_states = layer_outputs[0] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if encoder_hidden_states is not None: + all_cross_attentions += (layer_outputs[2],) + + hidden_states = self.layernorm_embedding(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [hidden_states, past_key_values, all_hidden_states, all_self_attns, all_cross_attentions] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=past_key_values, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attentions, + ) + + +@auto_docstring +class BigBirdPegasusModel(BigBirdPegasusPreTrainedModel): + _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] + + def __init__(self, config: BigBirdPegasusConfig): + super().__init__(config) + + padding_idx, vocab_size = config.pad_token_id, config.vocab_size + embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 + self.shared = BigBirdPegasusScaledWordEmbedding( + vocab_size, config.d_model, padding_idx, embed_scale=embed_scale + ) + + self.encoder = BigBirdPegasusEncoder(config, self.shared) + self.decoder = BigBirdPegasusDecoder(config, self.shared) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.shared + + def set_input_embeddings(self, value): + self.shared = value + self.encoder.embed_tokens = self.shared + self.decoder.embed_tokens = self.shared + + def _tie_weights(self): + if self.config.tie_word_embeddings: + self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared) + self._tie_or_clone_weights(self.decoder.embed_tokens, self.shared) + + def get_encoder(self): + return self.encoder + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.Tensor] = None, + decoder_head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[list[torch.FloatTensor]] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + 
cache_position: Optional[torch.LongTensor] = None, + ) -> Union[tuple, Seq2SeqModelOutput]: + r""" + decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Provide for translation and summarization training. By default, the model will create this tensor by + shifting the `input_ids` to the right, following the paper. + decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also + be used by default. + + If you want to change padding behavior, you should read + [`modeling_bigbird_pegasus._prepare_decoder_attention_mask`] and modify to your needs. See diagram 1 in + [the paper](https://huggingface.co/papers/1910.13461) for more information on the default strategy. + decoder_head_mask (`torch.Tensor` of shape `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + """ + # different to other models, BigBirdPegasus automatically creates decoder_input_ids from + # input_ids if no decoder_input_ids are provided + if decoder_input_ids is None and decoder_inputs_embeds is None: + if input_ids is None: + raise ValueError( + "If no `decoder_input_ids` or `decoder_inputs_embeds` are " + "passed, `input_ids` cannot be `None`. Please pass either " + "`input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`." + ) + + decoder_input_ids = shift_tokens_right( + input_ids, self.config.pad_token_id, self.config.decoder_start_token_id + ) + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if encoder_outputs is None: + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + # decoder outputs consists of (dec_features, past_key_values, dec_hidden, dec_attn) + decoder_outputs = self.decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=attention_mask, + head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + ) + + if not return_dict: + return decoder_outputs + encoder_outputs + + return Seq2SeqModelOutput( + 
last_hidden_state=decoder_outputs.last_hidden_state, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + +@auto_docstring( + custom_intro=""" + The BigBirdPegasus Model with a language modeling head. Can be used for summarization. + """ +) +# Copied from transformers.models.bart.modeling_bart.BartForConditionalGeneration with Bart->BigBirdPegasus, BART->BIGBIRD_PEGASUS +class BigBirdPegasusForConditionalGeneration(BigBirdPegasusPreTrainedModel, GenerationMixin): + base_model_prefix = "model" + _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] + _keys_to_ignore_on_load_missing = ["final_logits_bias"] + + def __init__(self, config: BigBirdPegasusConfig): + super().__init__(config) + self.model = BigBirdPegasusModel(config) + self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings))) + self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_encoder(self): + return self.model.get_encoder() + + def get_decoder(self): + return self.model.get_decoder() + + def resize_token_embeddings( + self, new_num_tokens: int, pad_to_multiple_of: Optional[int] = None, mean_resizing: bool = True + ) -> nn.Embedding: + new_embeddings = super().resize_token_embeddings(new_num_tokens, pad_to_multiple_of, mean_resizing) + self._resize_final_logits_bias(new_embeddings.weight.shape[0]) + return new_embeddings + + def _resize_final_logits_bias(self, new_num_tokens: int) -> None: + old_num_tokens = self.final_logits_bias.shape[-1] + if new_num_tokens <= old_num_tokens: + new_bias = self.final_logits_bias[:, :new_num_tokens] + else: + extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device) + new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1) + self.register_buffer("final_logits_bias", new_bias) + + def _tie_weights(self): + if self.config.tie_word_embeddings: + self.model._tie_weights() + self._tie_or_clone_weights(self.lm_head, self.model.shared) + + @auto_docstring + # Ignore copy + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.Tensor] = None, + decoder_head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[list[torch.FloatTensor]] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + ) -> Union[tuple, Seq2SeqLMOutput]: + r""" + decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Provide for translation and summarization training. 
By default, the model will create this tensor by + shifting the `input_ids` to the right, following the paper. + decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also + be used by default. + + If you want to change padding behavior, you should read + [`modeling_bigbird_pegasus._prepare_decoder_attention_mask`] and modify to your needs. See diagram 1 in + [the paper](https://huggingface.co/papers/1910.13461) for more information on the default strategy. + decoder_head_mask (`torch.Tensor` of shape `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Example summarization: + + ```python + >>> from transformers import AutoTokenizer, BigBirdPegasusForConditionalGeneration + + >>> model = BigBirdPegasusForConditionalGeneration.from_pretrained("google/bigbird-pegasus-large-arxiv") + >>> tokenizer = AutoTokenizer.from_pretrained("google/bigbird-pegasus-large-arxiv") + + >>> ARTICLE_TO_SUMMARIZE = ( + ... "The dominant sequence transduction models are based on complex recurrent or convolutional neural " + ... "networks in an encoder-decoder configuration. The best performing models also connect the encoder " + ... "and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, " + ... "based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. " + ... "Experiments on two machine translation tasks show these models to be superior in quality " + ... "while being more parallelizable and requiring significantly less time to train." + ... ) + >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=4096, return_tensors="pt", truncation=True) + + >>> # Generate Summary + >>> summary_ids = model.generate(inputs["input_ids"], num_beams=4, max_length=15) + >>> tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + 'dominant sequence models are based on recurrent or convolutional neural networks .' 
+ ``` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if labels is not None: + if use_cache: + logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.") + use_cache = False + if decoder_input_ids is None and decoder_inputs_embeds is None: + decoder_input_ids = shift_tokens_right( + labels, self.config.pad_token_id, self.config.decoder_start_token_id + ) + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + encoder_outputs=encoder_outputs, + decoder_attention_mask=decoder_attention_mask, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + ) + + lm_logits = self.lm_head(outputs[0]) + lm_logits = lm_logits + self.final_logits_bias.to(lm_logits.device) + + masked_lm_loss = None + if labels is not None: + labels = labels.to(lm_logits.device) + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (lm_logits,) + outputs[1:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return Seq2SeqLMOutput( + loss=masked_lm_loss, + logits=lm_logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): + return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id) + + +@auto_docstring( + custom_intro=""" + BigBirdPegasus model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. + for GLUE tasks. 
+ """ +) +class BigBirdPegasusForSequenceClassification(BigBirdPegasusPreTrainedModel): + _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] + + def __init__(self, config: BigBirdPegasusConfig, **kwargs): + super().__init__(config, **kwargs) + self.model = BigBirdPegasusModel(config) + self.classification_head = BigBirdPegasusClassificationHead( + config.d_model, + config.d_model, + config.num_labels, + config.classifier_dropout, + ) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.Tensor] = None, + decoder_head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[list[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + ) -> Union[tuple, Seq2SeqSequenceClassifierOutput]: + r""" + decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Provide for translation and summarization training. By default, the model will create this tensor by + shifting the `input_ids` to the right, following the paper. + decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also + be used by default. + + If you want to change padding behavior, you should read + [`modeling_bigbird_pegasus._prepare_decoder_attention_mask`] and modify to your needs. See diagram 1 in + [the paper](https://huggingface.co/papers/1910.13461) for more information on the default strategy. + decoder_head_mask (`torch.Tensor` of shape `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + if input_ids is None and inputs_embeds is not None: + raise NotImplementedError( + f"Passing input embeddings is currently not supported for {self.__class__.__name__}" + ) + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + encoder_outputs=encoder_outputs, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + ) + hidden_states = outputs[0] # last hidden state + + eos_mask = input_ids.eq(self.config.eos_token_id).to(hidden_states.device) + + if len(torch.unique_consecutive(eos_mask.sum(1))) > 1: + raise ValueError("All examples must have the same number of tokens.") + sentence_representation = hidden_states[eos_mask, :].view(hidden_states.size(0), -1, hidden_states.size(-1))[ + :, -1, : + ] + logits = self.classification_head(sentence_representation) + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.config.num_labels == 1: + self.config.problem_type = "regression" + elif self.config.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.config.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return Seq2SeqSequenceClassifierOutput( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + +@auto_docstring +class BigBirdPegasusForQuestionAnswering(BigBirdPegasusPreTrainedModel): + _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] + + def __init__(self, config): + super().__init__(config) + + config.num_labels = 2 + self.num_labels = config.num_labels + + self.model = BigBirdPegasusModel(config) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + head_mask: 
Optional[torch.Tensor] = None, + decoder_head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[list[torch.FloatTensor]] = None, + start_positions: Optional[torch.LongTensor] = None, + end_positions: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + ) -> Union[tuple, Seq2SeqQuestionAnsweringModelOutput]: + r""" + decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Provide for translation and summarization training. By default, the model will create this tensor by + shifting the `input_ids` to the right, following the paper. + decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also + be used by default. + + If you want to change padding behavior, you should read + [`modeling_bigbird_pegasus._prepare_decoder_attention_mask`] and modify to your needs. See diagram 1 in + [the paper](https://huggingface.co/papers/1910.13461) for more information on the default strategy. + decoder_head_mask (`torch.Tensor` of shape `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if start_positions is not None and end_positions is not None: + use_cache = False + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + encoder_outputs=encoder_outputs, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = ( + start_logits, + end_logits, + ) + 
outputs[1:] + return ((total_loss,) + output) if total_loss is not None else output + + return Seq2SeqQuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + +# Copied from transformers.models.pegasus.modeling_pegasus.PegasusDecoderWrapper with Pegasus->BigBirdPegasus +class BigBirdPegasusDecoderWrapper(BigBirdPegasusPreTrainedModel): + """ + This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is + used in combination with the [`EncoderDecoderModel`] framework. + """ + + def __init__(self, config): + super().__init__(config) + self.decoder = BigBirdPegasusDecoder(config) + + def forward(self, *args, **kwargs): + return self.decoder(*args, **kwargs) + + +class BigBirdPegasusForCausalLM(BigBirdPegasusPreTrainedModel, GenerationMixin): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + config.is_decoder = True + config.is_encoder_decoder = False + super().__init__(config) + self.model = BigBirdPegasusDecoderWrapper(config) + + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.decoder.embed_tokens + + def set_input_embeddings(self, value): + self.model.decoder.embed_tokens = value + + def set_decoder(self, decoder): + self.model.decoder = decoder + + def get_decoder(self): + return self.model.decoder + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + ) -> Union[tuple, CausalLMOutputWithCrossAttentions]: + r""" + cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. 
+ + Example: + + ```python + >>> from transformers import AutoTokenizer, BigBirdPegasusForCausalLM + + >>> tokenizer = AutoTokenizer.from_pretrained("google/bigbird-pegasus-large-arxiv") + >>> model = BigBirdPegasusForCausalLM.from_pretrained( + ... "google/bigbird-pegasus-large-arxiv", add_cross_attention=False + ... ) + >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder." + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> logits = outputs.logits + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model.decoder( + input_ids=input_ids, + attention_mask=attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + head_mask=head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + ) + + logits = self.lm_head(outputs[0]) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + +__all__ = [ + "BigBirdPegasusForCausalLM", + "BigBirdPegasusForConditionalGeneration", + "BigBirdPegasusForQuestionAnswering", + "BigBirdPegasusForSequenceClassification", + "BigBirdPegasusModel", + "BigBirdPegasusPreTrainedModel", +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bitnet/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bitnet/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c3428dd3800b58e8052c53cbd71adb7dce8ea451 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bitnet/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2025 The BitNet Team and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
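+
+# This stub follows the lazy-import pattern used across transformers model packages:
+# under TYPE_CHECKING the public symbols are imported eagerly for static analysis,
+# while at runtime the module object is replaced by a _LazyModule so that
+# configuration_bitnet and modeling_bitnet are only imported on first attribute access.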
+from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_bitnet import * + from .modeling_bitnet import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bitnet/configuration_bitnet.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bitnet/configuration_bitnet.py new file mode 100644 index 0000000000000000000000000000000000000000..6df31b5b8200e3b30eff59a270708984a90c35ca --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bitnet/configuration_bitnet.py @@ -0,0 +1,147 @@ +# coding=utf-8 +# Copyright 2025 The BitNet Team and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +"""BitNet model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class BitNetConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`BitNetModel`]. It is used to instantiate an BitNet + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of + BitNet b1.58 2B4T [microsoft/bitnet-b1.58-2B-4T](https://huggingface.co/microsoft/bitnet-b1.58-2B-4T). + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 128256): + Vocabulary size of the BitNet model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`BitNetModel`] + hidden_size (`int`, *optional*, defaults to 2560): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 6912): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 30): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 20): + Number of attention heads for each attention layer in the Transformer decoder. + num_key_value_heads (`int`, *optional*, defaults to 5): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details, check out [this + paper](https://huggingface.co/papers/2305.13245). 
If it is not specified, will default to + `num_attention_heads`. + hidden_act (`str` or `function`, *optional*, defaults to `"relu2"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + pad_token_id (`int`, *optional*): + Padding token id. + bos_token_id (`int`, *optional*, defaults to 128000): + Beginning of stream token id. + eos_token_id (`int`, *optional*, defaults to 128001): + End of stream token id. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + rope_theta (`float`, *optional*, defaults to 500000.0): + The base period of the RoPE embeddings. + attention_bias (`bool`, *optional*, defaults to `False`): + Whether to use a bias in the query, key, value and output projection layers during self-attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + + ```python + >>> from transformers import BitNetModel, BitNetConfig + + >>> # Initializing a BitNet style configuration + >>> configuration = BitNetConfig() + + >>> # Initializing a model from the BitNet style configuration + >>> model = BitNetModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "bitnet" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=128256, + hidden_size=2560, + intermediate_size=6912, + num_hidden_layers=30, + num_attention_heads=20, + num_key_value_heads=5, + hidden_act="relu2", + max_position_embeddings=2048, + initializer_range=0.02, + rms_norm_eps=1e-5, + use_cache=True, + pad_token_id=None, + bos_token_id=128000, + eos_token_id=128001, + tie_word_embeddings=False, + rope_theta=500000.0, + attention_bias=False, + attention_dropout=0.0, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + +__all__ = ["BitNetConfig"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bitnet/modeling_bitnet.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bitnet/modeling_bitnet.py new file mode 100644 index 
0000000000000000000000000000000000000000..0af7c794b155fa989da445960376069885b30bf6 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bitnet/modeling_bitnet.py @@ -0,0 +1,488 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/bitnet/modular_bitnet.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_bitnet.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2025 The BitNet Team and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and + +from typing import Callable, Optional, Union + +import torch +from torch import nn + +from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache +from ...generation import GenerationMixin +from ...integrations import use_kernel_forward_from_hub +from ...masking_utils import create_causal_mask +from ...modeling_flash_attention_utils import FlashAttentionKwargs +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast +from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...processing_utils import Unpack +from ...utils import TransformersKwargs, auto_docstring, can_return_tuple +from ...utils.deprecation import deprecate_kwarg +from ...utils.generic import check_model_inputs +from .configuration_bitnet import BitNetConfig + + +@use_kernel_forward_from_hub("RMSNorm") +class BitNetRMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + BitNetRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + def extra_repr(self): + return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" + + +class BitNetMLP(nn.Module): + def __init__(self, config: BitNetConfig): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.act_fn = ACT2FN[config.hidden_act] + self.ffn_sub_norm = BitNetRMSNorm(config.intermediate_size, eps=config.rms_norm_eps) + + def forward(self, x): + down_proj = 
self.down_proj(self.ffn_sub_norm(self.act_fn(self.gate_proj(x)) * self.up_proj(x))) + return down_proj + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). 
The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + dropout: float = 0.0, + **kwargs: Unpack[TransformersKwargs], +): + key_states = repeat_kv(key, module.num_key_value_groups) + value_states = repeat_kv(value, module.num_key_value_groups) + + attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling + if attention_mask is not None: + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + attn_output = torch.matmul(attn_weights, value_states) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +class BitNetAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: BitNetConfig, layer_idx: int): + super().__init__() + self.config = config + self.layer_idx = layer_idx + self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) + self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads + self.scaling = self.head_dim**-0.5 + self.attention_dropout = config.attention_dropout + self.is_causal = True + + self.q_proj = nn.Linear( + config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias + ) + self.k_proj = nn.Linear( + config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias + ) + self.v_proj = nn.Linear( + config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias + ) + self.o_proj = nn.Linear( + config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias + ) + self.attn_sub_norm = BitNetRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + attention_mask: Optional[torch.Tensor], + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[FlashAttentionKwargs], + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, self.head_dim) + + query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2) + key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2) + value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_values is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": 
cache_position} + key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs) + + attention_interface: Callable = eager_attention_forward + + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=0.0 if not self.training else self.attention_dropout, + scaling=self.scaling, + **kwargs, + ) + + attn_output = attn_output.reshape(*input_shape, -1).contiguous() + attn_output = self.attn_sub_norm(attn_output) # diff with Llama + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights + + +class BitNetDecoderLayer(GradientCheckpointingLayer): + def __init__(self, config: BitNetConfig, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + + self.self_attn = BitNetAttention(config=config, layer_idx=layer_idx) + + self.mlp = BitNetMLP(config) + self.input_layernorm = BitNetRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = BitNetRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + **kwargs: Unpack[TransformersKwargs], + ) -> torch.Tensor: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + # Self Attention + hidden_states, _ = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + **kwargs, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + return hidden_states + + +class BitNetRotaryEmbedding(nn.Module): + inv_freq: torch.Tensor # fix linting for `register_buffer` + + def __init__(self, config: BitNetConfig, device=None): + super().__init__() + # BC: "rope_type" was originally "type" + if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): + self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) + else: + self.rope_type = "default" + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + + inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = self.inv_freq + + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) + def forward(self, x, position_ids): + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) + position_ids_expanded = position_ids[:, None, :].float() + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() * self.attention_scaling + sin = emb.sin() * self.attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +@auto_docstring +class BitNetPreTrainedModel(PreTrainedModel): + config: BitNetConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["BitNetDecoderLayer"] + _skip_keys_device_placement = ["past_key_values"] + _supports_flash_attn = True + _supports_sdpa = True + _supports_flex_attn = True + + _can_compile_fullgraph = True + _supports_attention_backend = True + _can_record_outputs = { + "hidden_states": BitNetDecoderLayer, + "attentions": BitNetAttention, + } + + +@auto_docstring +class BitNetModel(BitNetPreTrainedModel): + def __init__(self, config: BitNetConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList( + [BitNetDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.norm = BitNetRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.rotary_emb = BitNetRotaryEmbedding(config=config) + self.gradient_checkpointing = False + + # Initialize weights and apply final processing + self.post_init() + + @check_model_inputs + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + cache_position: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> BaseModelOutputWithPast: + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds: torch.Tensor = self.embed_tokens(input_ids) + + if use_cache and past_key_values is None: + past_key_values = DynamicCache(config=self.config) + + if cache_position is None: + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + cache_position: torch.Tensor = torch.arange( + past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device + ) + + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + causal_mask = create_causal_mask( + config=self.config, + input_embeds=inputs_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + past_key_values=past_key_values, + position_ids=position_ids, + ) + + hidden_states = inputs_embeds + position_embeddings = self.rotary_emb(hidden_states, position_ids) + + for decoder_layer in self.layers[: self.config.num_hidden_layers]: + hidden_states = decoder_layer( + hidden_states, + attention_mask=causal_mask, + position_ids=position_ids, + past_key_values=past_key_values, + 
cache_position=cache_position, + position_embeddings=position_embeddings, + **kwargs, + ) + + hidden_states = self.norm(hidden_states) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=past_key_values, + ) + + +@auto_docstring +class BitNetForCausalLM(BitNetPreTrainedModel, GenerationMixin): + _tied_weights_keys = ["lm_head.weight"] + _tp_plan = None + _pp_plan = None + + def __init__(self, config): + super().__init__(config) + self.model = BitNetModel(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **kwargs: Unpack[TransformersKwargs], + ) -> CausalLMOutputWithPast: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, transformers., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, transformers., config.vocab_size]`. + + Example: + + ```python + >>> from transformers import AutoTokenizer, BitNetForCausalLM + + >>> model = BitNetForCausalLM.from_pretrained("microsoft/bitnet-b1.58-2B-4T") + >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/bitnet-b1.58-2B-4T") + + >>> prompt = f'<|begin_of_text|>User: Hey, are you conscious? Can you talk to me?<|eot_id|>Assistant: ' + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=100) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "User: Hey, are you conscious? Can you talk to me?Assistant: No, I'm not conscious. I'm an artificial intelligence designed to assist with information and tasks. How can I help you today?" 
+ ```""" + outputs: BaseModelOutputWithPast = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs.last_hidden_state + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) + + loss = None + if labels is not None: + loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs) + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +__all__ = ["BitNetForCausalLM", "BitNetModel", "BitNetPreTrainedModel"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bitnet/modular_bitnet.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bitnet/modular_bitnet.py new file mode 100644 index 0000000000000000000000000000000000000000..92ad3a32145992eb5b8a6bbac0af97f079ef2352 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bitnet/modular_bitnet.py @@ -0,0 +1,155 @@ +# coding=utf-8 +# Copyright 2025 The BitNet Team and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +"""PyTorch BitNet model.""" + +from typing import Callable, Optional + +import torch + +from ...cache_utils import Cache +from ...modeling_flash_attention_utils import FlashAttentionKwargs +from ...modeling_outputs import CausalLMOutputWithPast +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS +from ...processing_utils import Unpack +from ...utils import logging +from ...utils.deprecation import deprecate_kwarg +from ..gemma.modeling_gemma import GemmaMLP +from ..llama.modeling_llama import ( + LlamaAttention, + LlamaDecoderLayer, + LlamaForCausalLM, + LlamaModel, + LlamaRMSNorm, + apply_rotary_pos_emb, + eager_attention_forward, +) +from .configuration_bitnet import BitNetConfig + + +logger = logging.get_logger(__name__) + + +class BitNetRMSNorm(LlamaRMSNorm): + pass + + +class BitNetMLP(GemmaMLP): + def __init__(self, config: BitNetConfig): + super().__init__(config) + self.ffn_sub_norm = BitNetRMSNorm(config.intermediate_size, eps=config.rms_norm_eps) + + def forward(self, x): + down_proj = self.down_proj(self.ffn_sub_norm(self.act_fn(self.gate_proj(x)) * self.up_proj(x))) + return down_proj + + +class BitNetAttention(LlamaAttention): + def __init__(self, config: BitNetConfig, layer_idx: int): + super().__init__(config, layer_idx) + self.attn_sub_norm = BitNetRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + attention_mask: Optional[torch.Tensor], + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[FlashAttentionKwargs], + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, self.head_dim) + + query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2) + key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2) + value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_values is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs) + + attention_interface: Callable = eager_attention_forward + + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=0.0 if not self.training else self.attention_dropout, + scaling=self.scaling, + **kwargs, + ) + + attn_output = attn_output.reshape(*input_shape, -1).contiguous() + attn_output = self.attn_sub_norm(attn_output) # diff with Llama + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights + + +class BitNetDecoderLayer(LlamaDecoderLayer): + pass + + +class BitNetModel(LlamaModel): + pass + + +class BitNetForCausalLM(LlamaForCausalLM): + _tied_weights_keys = ["lm_head.weight"] + _tp_plan = None + _pp_plan = None + + def forward( + self, + **super_kwargs, + ) -> CausalLMOutputWithPast: + r""" + labels 
(`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, transformers., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, transformers., config.vocab_size]`. + + Example: + + ```python + >>> from transformers import AutoTokenizer, BitNetForCausalLM + + >>> model = BitNetForCausalLM.from_pretrained("microsoft/bitnet-b1.58-2B-4T") + >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/bitnet-b1.58-2B-4T") + + >>> prompt = f'<|begin_of_text|>User: Hey, are you conscious? Can you talk to me?<|eot_id|>Assistant: ' + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=100) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "User: Hey, are you conscious? Can you talk to me?Assistant: No, I'm not conscious. I'm an artificial intelligence designed to assist with information and tasks. How can I help you today?" + ```""" + return super().forward(**super_kwargs) + + +__all__ = [ + "BitNetForCausalLM", + "BitNetModel", + "BitNetPreTrainedModel", # noqa: F822 +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/blenderbot/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/blenderbot/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..190652df0129cc8acdb70044bdf4a1e6d793e9d0 Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/blenderbot/__pycache__/__init__.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/blenderbot/__pycache__/configuration_blenderbot.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/blenderbot/__pycache__/configuration_blenderbot.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..57b4027c4fffbf6a3bb7c7b198d8548bf8521948 Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/blenderbot/__pycache__/configuration_blenderbot.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/blenderbot/__pycache__/modeling_blenderbot.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/blenderbot/__pycache__/modeling_blenderbot.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..aaa723ab104738840fba8126647d93110ce376c8 Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/blenderbot/__pycache__/modeling_blenderbot.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/blenderbot/__pycache__/modeling_flax_blenderbot.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/blenderbot/__pycache__/modeling_flax_blenderbot.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..52bc2cd56729cf24b04e306fd137e2271bd86aae Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/blenderbot/__pycache__/modeling_flax_blenderbot.cpython-312.pyc differ diff --git 
a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/blenderbot/__pycache__/modeling_tf_blenderbot.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/blenderbot/__pycache__/modeling_tf_blenderbot.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f0fff07467cffcd1e4ff96be171e21d88c225d05 Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/blenderbot/__pycache__/modeling_tf_blenderbot.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/blenderbot/__pycache__/tokenization_blenderbot.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/blenderbot/__pycache__/tokenization_blenderbot.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ee296b52e776b45e25b4cdcd2452e1c79b4bae67 Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/blenderbot/__pycache__/tokenization_blenderbot.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/blenderbot/__pycache__/tokenization_blenderbot_fast.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/blenderbot/__pycache__/tokenization_blenderbot_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..45817c6ecea929a41191924b5a84f304ceaf29cc Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/blenderbot/__pycache__/tokenization_blenderbot_fast.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/blip/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/blip/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..952de2f855a71591e950b31751af1fe6cfbae20a --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/blip/__init__.py @@ -0,0 +1,33 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_blip import * + from .image_processing_blip import * + from .image_processing_blip_fast import * + from .modeling_blip import * + from .modeling_blip_text import * + from .modeling_tf_blip import * + from .modeling_tf_blip_text import * + from .processing_blip import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/blip/configuration_blip.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/blip/configuration_blip.py new file mode 100644 index 0000000000000000000000000000000000000000..6e0a5590c3f769896675e801f76ecfa9234eb8be --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/blip/configuration_blip.py @@ -0,0 +1,317 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Blip model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class BlipTextConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`BlipTextModel`]. It is used to instantiate a BLIP + text model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the `BlipText` used by the [base + architectures](https://huggingface.co/Salesforce/blip-vqa-base). + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 30524): + Vocabulary size of the `Blip` text model. Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling [`BlipModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + encoder_hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers from the vision model. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer encoder. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). 
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + hidden_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + bos_token_id (`int`, *optional*, defaults to 30522): + The id of the `beginning-of-sequence` token. + eos_token_id (`int`, *optional*, defaults to 2): + The id of the `end-of-sequence` token. + pad_token_id (`int`, *optional*, defaults to 0): + The id of the `padding` token. + sep_token_id (`int`, *optional*, defaults to 102): + The id of the `separator` token. + is_decoder (`bool`, *optional*, defaults to `True`): + Whether the model is used as a decoder. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). + label_smoothing (`float`, *optional*, defaults to 0.0): + A float in [0.0, 1.0]. Specifies the amount of smoothing when computing the loss, where 0.0 means no smoothing. The targets + become a mixture of the original ground truth and a uniform distribution as described in + *Rethinking the Inception Architecture for Computer Vision*. + + Example: + + ```python + >>> from transformers import BlipTextConfig, BlipTextModel + + >>> # Initializing a BlipTextConfig with Salesforce/blip-vqa-base style configuration + >>> configuration = BlipTextConfig() + + >>> # Initializing a BlipTextModel (with random weights) from the Salesforce/blip-vqa-base style configuration + >>> model = BlipTextModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "blip_text_model" + base_config_key = "text_config" + + def __init__( + self, + vocab_size=30524, + hidden_size=768, + encoder_hidden_size=768, + intermediate_size=3072, + projection_dim=768, + num_hidden_layers=12, + num_attention_heads=8, + max_position_embeddings=512, + hidden_act="gelu", + layer_norm_eps=1e-12, + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + initializer_range=0.02, + bos_token_id=30522, + eos_token_id=2, + pad_token_id=0, + sep_token_id=102, + is_decoder=True, + use_cache=True, + label_smoothing=0.0, + **kwargs, + ): + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + sep_token_id=sep_token_id, + **kwargs, + ) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.encoder_hidden_size = encoder_hidden_size + self.intermediate_size = intermediate_size + self.projection_dim = projection_dim + self.hidden_dropout_prob = hidden_dropout_prob + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.max_position_embeddings = max_position_embeddings + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.is_decoder = is_decoder +
self.use_cache = use_cache + self.label_smoothing = label_smoothing + + +class BlipVisionConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`BlipVisionModel`]. It is used to instantiate a + BLIP vision model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the Blip-base + [Salesforce/blip-vqa-base](https://huggingface.co/Salesforce/blip-vqa-base) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + image_size (`int`, *optional*, defaults to 384): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 16): + The size (resolution) of each patch. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + layer_norm_eps (`float`, *optional*, defaults to 1e-5): + The epsilon used by the layer normalization layers. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 1e-10): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + + Example: + + ```python + >>> from transformers import BlipVisionConfig, BlipVisionModel + + >>> # Initializing a BlipVisionConfig with Salesforce/blip-vqa-base style configuration + >>> configuration = BlipVisionConfig() + + >>> # Initializing a BlipVisionModel (with random weights) from the Salesforce/blip-vqa-base style configuration + >>> model = BlipVisionModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "blip_vision_model" + base_config_key = "vision_config" + + def __init__( + self, + hidden_size=768, + intermediate_size=3072, + projection_dim=512, + num_hidden_layers=12, + num_attention_heads=12, + image_size=384, + patch_size=16, + hidden_act="gelu", + layer_norm_eps=1e-5, + attention_dropout=0.0, + initializer_range=1e-10, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.projection_dim = projection_dim + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.patch_size = patch_size + self.image_size = image_size + self.initializer_range = initializer_range + self.attention_dropout = attention_dropout + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + + +class BlipConfig(PretrainedConfig): + r""" + [`BlipConfig`] is the configuration class to store the configuration of a [`BlipModel`].
It is used to instantiate + a BLIP model according to the specified arguments, defining the text model and vision model configs. Instantiating + a configuration with the defaults will yield a similar configuration to that of the BLIP-base + [Salesforce/blip-vqa-base](https://huggingface.co/Salesforce/blip-vqa-base) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + text_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`BlipTextConfig`]. + vision_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`BlipVisionConfig`]. + projection_dim (`int`, *optional*, defaults to 512): + Dimensionality of text and vision projection layers. + logit_scale_init_value (`float`, *optional*, defaults to 2.6592): + The initial value of the *logit_scale* parameter. Default is used as per the original BLIP implementation. + image_text_hidden_size (`int`, *optional*, defaults to 256): + Dimensionality of the hidden state of the image-text fusion layer. + label_smoothing (`float`, *optional*, defaults to 0.0): + A float in [0.0, 1.0]. Specifies the amount of smoothing when computing the loss, where 0.0 means no smoothing. The targets + become a mixture of the original ground truth and a uniform distribution as described in + *Rethinking the Inception Architecture for Computer Vision*. + kwargs (*optional*): + Dictionary of keyword arguments. + + Example: + + ```python + >>> from transformers import BlipConfig, BlipModel + + >>> # Initializing a BlipConfig with Salesforce/blip-vqa-base style configuration + >>> configuration = BlipConfig() + + >>> # Initializing a BlipModel (with random weights) from the Salesforce/blip-vqa-base style configuration + >>> model = BlipModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + + >>> # We can also initialize a BlipConfig from a BlipTextConfig and a BlipVisionConfig + + >>> # Initializing a BLIPText and BLIPVision configuration + >>> config_text = BlipTextConfig() + >>> config_vision = BlipVisionConfig() + + >>> config = BlipConfig.from_text_vision_configs(config_text, config_vision) + ```""" + + model_type = "blip" + sub_configs = {"text_config": BlipTextConfig, "vision_config": BlipVisionConfig} + + def __init__( + self, + text_config=None, + vision_config=None, + projection_dim=512, + logit_scale_init_value=2.6592, + image_text_hidden_size=256, + label_smoothing=0.0, + **kwargs, + ): + super().__init__(**kwargs) + + if text_config is None: + text_config = {} + logger.info("`text_config` is `None`. Initializing the `BlipTextConfig` with default values.") + + if vision_config is None: + vision_config = {} + logger.info("`vision_config` is `None`.
Initializing the `BlipVisionConfig` with default values.") + + self.text_config = BlipTextConfig(**text_config) + self.vision_config = BlipVisionConfig(**vision_config) + + self.text_config.encoder_hidden_size = self.vision_config.hidden_size + + self.projection_dim = projection_dim + self.logit_scale_init_value = logit_scale_init_value + self.initializer_factor = 1.0 + self.initializer_range = 0.02 + self.image_text_hidden_size = image_text_hidden_size + self.label_smoothing = label_smoothing + + +__all__ = ["BlipConfig", "BlipTextConfig", "BlipVisionConfig"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/blip/image_processing_blip.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/blip/image_processing_blip.py new file mode 100644 index 0000000000000000000000000000000000000000..78a152374fd0089846f4431c8b408da4c5c9ff54 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/blip/image_processing_blip.py @@ -0,0 +1,297 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Image processor class for BLIP.""" + +from typing import Optional, Union + +import numpy as np + +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ...image_transforms import convert_to_rgb, resize, to_channel_dimension_format +from ...image_utils import ( + OPENAI_CLIP_MEAN, + OPENAI_CLIP_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + infer_channel_dimension_format, + is_scaled_image, + make_flat_list_of_images, + to_numpy_array, + valid_images, + validate_preprocess_arguments, +) +from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging + + +if is_vision_available(): + import PIL + + +logger = logging.get_logger(__name__) + + +class BlipImageProcessor(BaseImageProcessor): + r""" + Constructs a BLIP image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the + `do_resize` parameter in the `preprocess` method. + size (`dict`, *optional*, defaults to `{"height": 384, "width": 384}`): + Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess` + method. + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): + Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be + overridden by the `resample` parameter in the `preprocess` method. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the + `do_rescale` parameter in the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Only has an effect if `do_rescale` is set to `True`. 
Can be + overridden by the `rescale_factor` parameter in the `preprocess` method. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` + method. Can be overridden by the `do_normalize` parameter in the `preprocess` method. + image_mean (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. Can be + overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. + Can be overridden by the `image_std` parameter in the `preprocess` method. + do_convert_rgb (`bool`, *optional*, defaults to `True`): + Whether to convert the image to RGB. + """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: Optional[dict[str, int]] = None, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + do_convert_rgb: bool = True, + **kwargs, + ) -> None: + super().__init__(**kwargs) + size = size if size is not None else {"height": 384, "width": 384} + size = get_size_dict(size, default_to_square=True) + + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN + self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD + self.do_convert_rgb = do_convert_rgb + + # Copied from transformers.models.vit.image_processing_vit.ViTImageProcessor.resize with PILImageResampling.BILINEAR->PILImageResampling.BICUBIC + def resize( + self, + image: np.ndarray, + size: dict[str, int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize an image to `(size["height"], size["width"])`. + + Args: + image (`np.ndarray`): + Image to resize. + size (`dict[str, int]`): + Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`. + data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. 
+ input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + + Returns: + `np.ndarray`: The resized image. + """ + size = get_size_dict(size) + if "height" not in size or "width" not in size: + raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}") + output_size = (size["height"], size["width"]) + return resize( + image, + size=output_size, + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + **kwargs, + ) + + @filter_out_non_signature_kwargs() + def preprocess( + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Optional[dict[str, int]] = None, + resample: Optional[PILImageResampling] = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + do_convert_rgb: Optional[bool] = None, + data_format: ChannelDimension = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> PIL.Image.Image: + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_rescale=False`. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`dict[str, int]`, *optional*, defaults to `self.size`): + Controls the size of the image after `resize`. The shortest edge of the image is resized to + `size["shortest_edge"]` whilst preserving the aspect ratio. If the longest edge of this resized image + is > `int(size["shortest_edge"] * (1333 / 800))`, then the image is resized again to make the longest + edge equal to `int(size["shortest_edge"] * (1333 / 800))`. + resample (`PILImageResampling`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image values between [0 - 1]. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`): + Image mean to normalize the image by if `do_normalize` is set to `True`. + image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to normalize the image by if `do_normalize` is set to `True`. + do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): + Whether to convert the image to RGB. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. 
+ - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + """ + do_resize = do_resize if do_resize is not None else self.do_resize + resample = resample if resample is not None else self.resample + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + + size = size if size is not None else self.size + size = get_size_dict(size, default_to_square=False) + images = self.fetch_images(images) + images = make_flat_list_of_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + + validate_preprocess_arguments( + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_resize=do_resize, + size=size, + resample=resample, + ) + # PIL RGBA images are converted to RGB + if do_convert_rgb: + images = [convert_to_rgb(image) for image in images] + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if do_rescale and is_scaled_image(images[0]): + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. 
+ input_data_format = infer_channel_dimension_format(images[0]) + + if do_resize: + images = [ + self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format) + for image in images + ] + + if do_rescale: + images = [ + self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) + for image in images + ] + + if do_normalize: + images = [ + self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format) + for image in images + ] + + images = [ + to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images + ] + + encoded_outputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors) + + return encoded_outputs + + +__all__ = ["BlipImageProcessor"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/blip/image_processing_blip_fast.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/blip/image_processing_blip_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..2b1d14f139049162590bcaabf923a08b194aaae5 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/blip/image_processing_blip_fast.py @@ -0,0 +1,36 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Fast Image processor class for BLIP.""" + +from ...image_processing_utils_fast import BaseImageProcessorFast +from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, PILImageResampling +from ...utils import auto_docstring + + +@auto_docstring +class BlipImageProcessorFast(BaseImageProcessorFast): + # To be checked against the slow image processor + # None values left after checking can be removed + resample = PILImageResampling.BICUBIC + image_mean = OPENAI_CLIP_MEAN + image_std = OPENAI_CLIP_STD + size = {"height": 384, "width": 384} + do_resize = True + do_rescale = True + do_normalize = True + do_convert_rgb = True + + +__all__ = ["BlipImageProcessorFast"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/blip/modeling_blip.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/blip/modeling_blip.py new file mode 100644 index 0000000000000000000000000000000000000000..8a58add38af3a68befc369376f4023b461c512b4 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/blip/modeling_blip.py @@ -0,0 +1,1308 @@ +# coding=utf-8 +# Copyright 2022 The Salesforce Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch BLIP model.""" + +import warnings +from dataclasses import dataclass +from typing import Any, Optional, Union + +import torch +from torch import nn +from torch.nn.functional import normalize + +from ...activations import ACT2FN +from ...generation import GenerationMixin +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling +from ...modeling_utils import PreTrainedModel +from ...processing_utils import Unpack +from ...utils import ModelOutput, TransformersKwargs, auto_docstring, can_return_tuple, logging, torch_int +from ...utils.generic import check_model_inputs +from .configuration_blip import BlipConfig, BlipTextConfig, BlipVisionConfig +from .modeling_blip_text import BlipTextLMHeadModel, BlipTextModel + + +logger = logging.get_logger(__name__) + + +# Copied from transformers.models.clip.modeling_clip.contrastive_loss +def contrastive_loss(logits: torch.Tensor) -> torch.Tensor: + return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device)) + + +# Copied from transformers.models.clip.modeling_clip.clip_loss with clip->blip +def blip_loss(similarity: torch.Tensor) -> torch.Tensor: + caption_loss = contrastive_loss(similarity) + image_loss = contrastive_loss(similarity.t()) + return (caption_loss + image_loss) / 2.0 + + +@dataclass +@auto_docstring( + custom_intro=""" + Adapted from the base class for vision model's outputs that also contains image embeddings of the pooling of the + last hidden states. This class also adds the loss term from the text decoder. + """ +) +class BlipForConditionalGenerationModelOutput(ModelOutput): + r""" + loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): + Language modeling loss from the text decoder. + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`, *optional*): + Prediction scores of the language modeling head of the text decoder model. + image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*): + The image embeddings obtained after applying the Vision Transformer model to the input image. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. 
+ """ + + loss: Optional[tuple[torch.FloatTensor]] = None + logits: Optional[tuple[torch.FloatTensor]] = None + image_embeds: Optional[torch.FloatTensor] = None + last_hidden_state: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None + attentions: Optional[tuple[torch.FloatTensor, ...]] = None + + @property + def decoder_logits(self): + warnings.warn( + "`decoder_logits` attribute is deprecated and will be removed in version 5 of Transformers." + " Please use the `logits` attribute to retrieve the final output instead.", + FutureWarning, + ) + return self.logits + + +@dataclass +@auto_docstring( + custom_intro=""" + Adapted from the base class for vision model's outputs that also contains image embeddings of the pooling of the + last hidden states. This class also adds the loss term from the text decoder. + """ +) +class BlipTextVisionModelOutput(ModelOutput): + r""" + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss from the text decoder. + image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): + The image embeddings obtained by applying the projection layer to the pooler_output. + """ + + loss: Optional[torch.FloatTensor] = None + image_embeds: Optional[torch.FloatTensor] = None + last_hidden_state: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None + attentions: Optional[tuple[torch.FloatTensor, ...]] = None + + +@dataclass +@auto_docstring( + custom_intro=""" + Adapted from the base class for vision model's outputs that also contains image embeddings of the pooling of the + last hidden states. This class also adds the loss term from the text decoder as well as the image-text similarity + scores. + """ +) +class BlipImageTextMatchingModelOutput(ModelOutput): + r""" + itm_score (`torch.FloatTensor`): + The image-text similarity scores. + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss from the text decoder. + image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): + The image embeddings obtained by applying the projection layer to the pooler_output. + vision_pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*): + Last layer hidden-state of the vision of the vision-only branch of the model. + question_embeds (`torch.FloatTensor`): + The question embeddings obtained by the text projection layer. + """ + + itm_score: Optional[torch.FloatTensor] = None + loss: Optional[torch.FloatTensor] = None + image_embeds: Optional[torch.FloatTensor] = None + last_hidden_state: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None + vision_pooler_output: Optional[torch.FloatTensor] = None + attentions: Optional[tuple[torch.FloatTensor, ...]] = None + question_embeds: Optional[tuple[torch.FloatTensor]] = None + + +@dataclass +@auto_docstring +class BlipOutput(ModelOutput): + r""" + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`): + Contrastive loss for image-text similarity. + logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`): + The scaled dot product scores between `image_embeds` and `text_embeds`. 
This represents the image-text + similarity scores. + logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`): + The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image + similarity scores. + text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`): + The text embeddings obtained by applying the projection layer to the pooled output of [`BlipTextModel`]. + image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`): + The image embeddings obtained by applying the projection layer to the pooled output of [`BlipVisionModel`]. + text_model_output (`BaseModelOutputWithPooling`): + The output of the [`BlipTextModel`]. + vision_model_output (`BaseModelOutputWithPooling`): + The output of the [`BlipVisionModel`]. + """ + + loss: Optional[torch.FloatTensor] = None + logits_per_image: Optional[torch.FloatTensor] = None + logits_per_text: Optional[torch.FloatTensor] = None + text_embeds: Optional[torch.FloatTensor] = None + image_embeds: Optional[torch.FloatTensor] = None + text_model_output: BaseModelOutputWithPooling = None + vision_model_output: BaseModelOutputWithPooling = None + + def to_tuple(self) -> tuple[Any]: + return tuple( + self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple() + for k in self.keys() + ) + + +class BlipVisionEmbeddings(nn.Module): + def __init__(self, config: BlipVisionConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.class_embedding = nn.Parameter(torch.randn(1, 1, self.embed_dim)) + + self.patch_embedding = nn.Conv2d( + in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size + ) + + self.num_patches = (self.image_size // self.patch_size) ** 2 + self.num_positions = self.num_patches + 1 + + self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim)) + + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution + images. This method is also adapted to support torch.jit tracing. 
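+        For example, with `image_size=384` and `patch_size=16`, a 480x480 input produces a 30x30 patch grid, so the
+        pretrained 24x24 grid of patch position embeddings is resized with bicubic interpolation before the
+        class-token embedding is prepended again.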
+ + Adapted from: + - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and + - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 + """ + + num_patches = embeddings.shape[1] - 1 + num_positions = self.position_embedding.shape[1] - 1 + + # always interpolate when tracing to ensure the exported model works for dynamic input shapes + if not torch.jit.is_tracing() and num_patches == num_positions and height == width: + return self.position_embedding + + class_pos_embed = self.position_embedding[:, :1] + patch_pos_embed = self.position_embedding[:, 1:] + + dim = embeddings.shape[-1] + + new_height = height // self.patch_size + new_width = width // self.patch_size + + sqrt_num_positions = torch_int(num_positions**0.5) + patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed, + size=(new_height, new_width), + mode="bicubic", + align_corners=False, + ) + + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + + return torch.cat((class_pos_embed, patch_pos_embed), dim=1) + + def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding: bool = False) -> torch.Tensor: + batch_size, _, height, width = pixel_values.shape + target_dtype = self.patch_embedding.weight.dtype + patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid] + patch_embeds = patch_embeds.flatten(2).transpose(1, 2) + class_embeds = self.class_embedding.expand(batch_size, 1, -1).to(target_dtype) + embeddings = torch.cat([class_embeds, patch_embeds], dim=1) + if interpolate_pos_encoding: + position_embedding = self.interpolate_pos_encoding(embeddings, height, width) + else: + position_embedding = self.position_embedding + embeddings = embeddings + position_embedding[:, : embeddings.size(1), :].to(target_dtype) + return embeddings + + +# Copied from transformers.models.clip.modeling_clip.CLIPTextEmbeddings with CLIP->Blip +class BlipTextEmbeddings(nn.Module): + def __init__(self, config: BlipTextConfig): + super().__init__() + embed_dim = config.hidden_size + + self.token_embedding = nn.Embedding(config.vocab_size, embed_dim) + self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + ) -> torch.Tensor: + seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2] + max_position_embedding = self.position_embedding.weight.shape[0] + + if seq_length > max_position_embedding: + raise ValueError( + f"Sequence length must be less than max_position_embeddings (got `sequence length`: " + f"{seq_length} and max_position_embeddings: {max_position_embedding}" + ) + + if position_ids is None: + position_ids = self.position_ids[:, :seq_length] + + if inputs_embeds is None: + inputs_embeds = self.token_embedding(input_ids) + + position_embeddings = self.position_embedding(position_ids) + embeddings = inputs_embeds + 
position_embeddings + + return embeddings + + +class BlipAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." + ) + self.scale = self.head_dim**-0.5 + self.dropout = nn.Dropout(config.attention_dropout) + + self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim) + + self.projection = nn.Linear(self.embed_dim, self.embed_dim) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> tuple[torch.Tensor, torch.Tensor]: + """Input shape: Batch x Time x Channel""" + + bsz, tgt_len, embed_dim = hidden_states.size() + + mixed_qkv = ( + self.qkv(hidden_states) + .reshape(bsz, tgt_len, 3, self.num_heads, embed_dim // self.num_heads) + .permute(2, 0, 3, 1, 4) + ) + query_states, key_states, value_states = mixed_qkv[0], mixed_qkv[1], mixed_qkv[2] + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_states, key_states.transpose(-1, -2)) + + attention_scores = attention_scores * self.scale + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
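+        # attention_probs has shape (batch_size, num_heads, seq_len, seq_len); each row sums to 1 before dropout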
+ attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_states).permute(0, 2, 1, 3) + + new_context_layer_shape = context_layer.size()[:-2] + (self.embed_dim,) + context_layer = context_layer.reshape(new_context_layer_shape) + + output = self.projection(context_layer) + + return output, attention_probs + + +# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Blip +class BlipMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) + self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +class BlipEncoderLayer(GradientCheckpointingLayer): + def __init__(self, config: BlipConfig): + super().__init__() + self.embed_dim = config.hidden_size + self.self_attn = BlipAttention(config) + self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) + self.mlp = BlipMLP(config) + self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) + + @auto_docstring + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + **kwargs: Unpack[TransformersKwargs], + ) -> torch.FloatTensor: + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states, _ = self.self_attn( + hidden_states=hidden_states, + head_mask=attention_mask, + **kwargs, + ) + hidden_states = hidden_states + residual + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + + hidden_states = hidden_states + residual + + return hidden_states + + +@auto_docstring +class BlipPreTrainedModel(PreTrainedModel): + config: BlipConfig + base_model_prefix = "blip" + supports_gradient_checkpointing = True + _no_split_modules = ["BlipEncoderLayer", "BlipTextEmbeddings"] + _skip_keys_device_placement = ["past_key_values"] + + def _init_weights(self, module): + """Initialize the weights""" + factor = self.config.initializer_range + if isinstance(module, (nn.Conv2d, nn.Embedding, nn.Linear)): + module.weight.data.normal_(mean=0.0, std=factor) + if hasattr(module, "bias") and module.bias is not None: + module.bias.data.zero_() + + if isinstance(module, BlipVisionEmbeddings): + if hasattr(self.config, "vision_config"): + factor = self.config.vision_config.initializer_range + nn.init.trunc_normal_( + module.position_embedding, + mean=0.0, + std=factor, + ) + + nn.init.trunc_normal_( + module.class_embedding, + mean=0.0, + std=factor, + ) + + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + +class BlipEncoder(nn.Module): + """ + Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a + [`BlipEncoderLayer`]. + + Args: + config (`BlipConfig`): + The corresponding vision configuration for the `BlipEncoder`. 
+ """ + + def __init__(self, config: BlipConfig): + super().__init__() + self.config = config + self.layers = nn.ModuleList([BlipEncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + @auto_docstring + def forward( + self, + inputs_embeds, + attention_mask: Optional[torch.Tensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> Union[tuple, BaseModelOutput]: + hidden_states = inputs_embeds + for encoder_layer in self.layers: + hidden_states = encoder_layer( + hidden_states, + attention_mask=attention_mask, + **kwargs, + ) + + return BaseModelOutput(last_hidden_state=hidden_states) + + +class BlipVisionModel(BlipPreTrainedModel): + main_input_name = "pixel_values" + config: BlipVisionConfig + _can_record_outputs = { + "hidden_states": BlipEncoderLayer, + "attentions": BlipAttention, + } + + def __init__(self, config: BlipVisionConfig): + super().__init__(config) + self.config = config + embed_dim = config.hidden_size + + self.embeddings = BlipVisionEmbeddings(config) + self.encoder = BlipEncoder(config) + self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + + self.post_init() + + @check_model_inputs(tie_last_hidden_states=False) + @auto_docstring + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + interpolate_pos_encoding: bool = False, + **kwargs: Unpack[TransformersKwargs], + ) -> Union[tuple, BaseModelOutputWithPooling]: + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) + + encoder_outputs: BaseModelOutput = self.encoder( + inputs_embeds=hidden_states, + **kwargs, + ) + + last_hidden_state = encoder_outputs.last_hidden_state + last_hidden_state = self.post_layernorm(last_hidden_state) + + pooled_output = last_hidden_state[:, 0, :] + pooled_output = self.post_layernorm(pooled_output) + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + ) + + def get_input_embeddings(self): + return self.embeddings + + +@auto_docstring( + custom_intro=""" + This model is going to be deprecated in future versions. Please use `BlipForConditionalGeneration`, `BlipForQuestionAnswering` or `BlipForImageTextRetrieval` depending on your usecase. + """ +) +class BlipModel(BlipPreTrainedModel): + config: BlipConfig + + def __init__(self, config: BlipConfig): + super().__init__(config) + + if not isinstance(config.text_config, BlipTextConfig): + raise TypeError( + "config.text_config is expected to be of type BlipTextConfig but is of type" + f" {type(config.text_config)}." + ) + + if not isinstance(config.vision_config, BlipVisionConfig): + raise TypeError( + "config.vision_config is expected to be of type BlipVisionConfig but is of type" + f" {type(config.vision_config)}." 
+ ) + + text_config = config.text_config + vision_config = config.vision_config + + self.projection_dim = config.projection_dim + self.text_embed_dim = text_config.hidden_size + self.vision_embed_dim = vision_config.hidden_size + + self.text_model = BlipTextModel(text_config) + self.vision_model = BlipVisionModel(vision_config) + + self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False) + self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False) + self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value)) + + logger.warning( + "`BlipModel` is going to be deprecated in future release, please use `BlipForConditionalGeneration`, `BlipForQuestionAnswering` or `BlipForImageTextRetrieval` depending on your usecase." + ) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.text_model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.text_model.set_input_embeddings(value) + + @auto_docstring + def get_text_features( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + ) -> torch.FloatTensor: + r""" + Returns: + text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by + applying the projection layer to the pooled output of [`BlipTextModel`]. + + Examples: + + ```python + >>> from transformers import AutoProcessor, BlipModel + + >>> model = BlipModel.from_pretrained("Salesforce/blip-image-captioning-base") + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base") + + >>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") + >>> text_features = model.get_text_features(**inputs) + ```""" + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + ) + + pooled_output = text_outputs[1] + text_features = self.text_projection(pooled_output) + + return text_features + + @auto_docstring + def get_image_features( + self, + pixel_values: Optional[torch.FloatTensor] = None, + interpolate_pos_encoding: bool = False, + ) -> torch.FloatTensor: + r""" + Returns: + image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by + applying the projection layer to the pooled output of [`BlipVisionModel`]. 
+ + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, BlipModel + + >>> model = BlipModel.from_pretrained("Salesforce/blip-image-captioning-base") + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pt") + + >>> image_features = model.get_image_features(**inputs) + ```""" + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + interpolate_pos_encoding=interpolate_pos_encoding, + ) + + pooled_output = vision_outputs[1] # pooled_output + image_features = self.visual_projection(pooled_output) + + return image_features + + @auto_docstring + def get_multimodal_features( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + interpolate_pos_encoding: bool = False, + ) -> torch.FloatTensor: + r""" + Returns: + multimodal_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The multimodal embeddings + obtained by applying the image embeddings to the text encoder using the cross-attention mechanism. + + Examples: + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, BlipModel + + >>> model = BlipModel.from_pretrained("Salesforce/blip-image-captioning-base") + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> texts = ["a photo of a cat", "a photo of a dog"] + >>> inputs = processor(images=image, text=texts, padding=True, return_tensors="pt") + + >>> multimodal_features = model.get_multimodal_features(**inputs) + ```""" + vision_outputs = self.vision_model( + pixel_values=pixel_values, + interpolate_pos_encoding=interpolate_pos_encoding, + ) + + image_embeds = vision_outputs[0] + image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long) + + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_atts, + ) + + pooled_output = text_outputs[1] # pooled_output + multimodal_features = self.text_projection(pooled_output) + + return multimodal_features + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + return_loss: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + **kwargs: Unpack[TransformersKwargs], + ) -> Union[tuple, BlipOutput]: + r""" + return_loss (`bool`, *optional*): + Whether or not to return the contrastive loss. + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, BlipModel + + >>> model = BlipModel.from_pretrained("Salesforce/blip-image-captioning-base") + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor( + ... 
text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True + ... ) + + >>> outputs = model(**inputs) + >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score + >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities + ```""" + vision_outputs = self.vision_model( + pixel_values=pixel_values, + interpolate_pos_encoding=interpolate_pos_encoding, + **kwargs, + ) + + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + **kwargs, + ) + + image_embeds = vision_outputs.pooler_output + image_embeds = self.visual_projection(image_embeds) + + text_embeds = text_outputs.pooler_output + text_embeds = self.text_projection(text_embeds) + + # normalized features + image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True) + text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True) + + # cosine similarity as logits + logit_scale = self.logit_scale.exp().to(device=text_embeds.device) + image_embeds = image_embeds.to(device=text_embeds.device, dtype=text_embeds.dtype) + logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale + logits_per_image = logits_per_text.t() + + loss = None + if return_loss: + loss = blip_loss(logits_per_text) + + return BlipOutput( + loss=loss, + logits_per_image=logits_per_image, + logits_per_text=logits_per_text, + text_embeds=text_embeds, + image_embeds=image_embeds, + text_model_output=text_outputs, + vision_model_output=vision_outputs, + ) + + +@auto_docstring( + custom_intro=""" + BLIP Model for image captioning. The model consists of a vision encoder and a text decoder. One can optionally pass + `input_ids` to the model, which serve as a text prompt, to make the text decoder continue the prompt. Otherwise, + the decoder starts generating text from the [BOS] (beginning-of-sequence) token. will start generating the caption + from the text input. If no text input is provided, the decoder will start with the [BOS] token only. 
+ """ +) +class BlipForConditionalGeneration(BlipPreTrainedModel, GenerationMixin): + config: BlipConfig + _tied_weights_keys = ["text_decoder.cls.predictions.decoder.bias"] + main_input_name = "pixel_values" + + def __init__(self, config: BlipConfig): + super().__init__(config) + + self.vision_model = BlipVisionModel(config.vision_config) + + self.text_decoder = BlipTextLMHeadModel(config.text_config) + + self.decoder_input_ids = config.text_config.bos_token_id + self.decoder_pad_token_id = config.text_config.pad_token_id + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.text_decoder.get_input_embeddings() + + def set_input_embeddings(self, value): + self.text_decoder.set_input_embeddings(value) + + @can_return_tuple + @auto_docstring + def forward( + self, + pixel_values: torch.FloatTensor, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + labels: Optional[torch.LongTensor] = None, + interpolate_pos_encoding: bool = False, + **kwargs: Unpack[TransformersKwargs], + ) -> Union[tuple, BlipForConditionalGenerationModelOutput]: + r""" + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, BlipForConditionalGeneration + + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base") + >>> model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> text = "A picture of" + + >>> inputs = processor(images=image, text=text, return_tensors="pt") + + >>> outputs = model(**inputs) + ```""" + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + interpolate_pos_encoding=interpolate_pos_encoding, + **kwargs, + ) + + image_embeds = vision_outputs.last_hidden_state + + outputs = self.text_decoder( + input_ids=input_ids, + attention_mask=attention_mask, + encoder_hidden_states=image_embeds, + labels=labels, + reduction="mean", + **kwargs, + ) + + return BlipForConditionalGenerationModelOutput( + loss=outputs.loss, + logits=outputs.logits, + image_embeds=image_embeds, + last_hidden_state=vision_outputs.last_hidden_state, + hidden_states=vision_outputs.hidden_states, + attentions=vision_outputs.attentions, + ) + + @torch.no_grad() + def generate( + self, + pixel_values: torch.FloatTensor, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + interpolate_pos_encoding: bool = False, + **generate_kwargs, + ) -> torch.LongTensor: + r""" + Overrides *generate* function to be able to use the model as a conditional generator + + Parameters: + pixel_values (*torch.FloatTensor* of shape *(batch_size, num_channels, image_height, image_width)*: + Input image to be processed + input_ids (*torch.LongTensor* of shape *(batch_size, sequence_length)*, *optional*): + The sequence used as a prompt for the generation. + attention_mask (*torch.LongTensor* of shape *(batch_size, sequence_length)*, *optional*): + Mask to avoid performing attention on padding token indices. 
Mask values selected in `[0, 1]`: + + + Examples: + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, BlipForConditionalGeneration + + >>> model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base") + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pt") + + >>> outputs = model.generate(**inputs) + >>> print(processor.decode(outputs[0], skip_special_tokens=True)) + two cats sleeping on a couch + ``` + """ + + batch_size = pixel_values.shape[0] + vision_outputs = self.vision_model( + pixel_values=pixel_values, + interpolate_pos_encoding=interpolate_pos_encoding, + ) + + image_embeds = vision_outputs[0] + + image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device) + + if isinstance(input_ids, list): + input_ids = torch.LongTensor(input_ids) + elif input_ids is None: + input_ids = ( + torch.LongTensor([[self.decoder_input_ids, self.config.text_config.eos_token_id]]) + .repeat(batch_size, 1) + .to(image_embeds.device) + ) + + input_ids[:, 0] = self.config.text_config.bos_token_id + attention_mask = attention_mask[:, :-1] if attention_mask is not None else None + + outputs = self.text_decoder.generate( + input_ids=input_ids[:, :-1], + eos_token_id=self.config.text_config.sep_token_id, + pad_token_id=self.config.text_config.pad_token_id, + attention_mask=attention_mask, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + **generate_kwargs, + ) + + return outputs + + +@auto_docstring( + custom_intro=""" + BLIP Model for visual question answering. The model consists of a vision encoder, a text encoder as well as a text + decoder. The vision encoder will encode the input image, the text encoder will encode the input question together + with the encoding of the image, and the text decoder will output the answer to the question. + """ +) +class BlipForQuestionAnswering(BlipPreTrainedModel, GenerationMixin): + config: BlipConfig + _tied_weights_keys = ["text_decoder.cls.predictions.decoder.bias"] + + def __init__(self, config: BlipConfig): + super().__init__(config) + + self.vision_model = BlipVisionModel(config.vision_config) + + self.text_encoder = BlipTextModel(config.text_config, add_pooling_layer=False) + + self.text_decoder = BlipTextLMHeadModel(config.text_config) + + self.decoder_pad_token_id = config.text_config.pad_token_id + self.decoder_start_token_id = config.text_config.bos_token_id + + # Initialize weights and apply final processing + self.post_init() + + def set_input_embeddings(self, value): + self.text_encoder.set_input_embeddings(value) + + def get_input_embeddings(self): + # This will return shared embeddings if they are shared else specific to encoder. 
+ return self.text_encoder.get_input_embeddings() + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: torch.LongTensor, + pixel_values: torch.FloatTensor, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + labels: Optional[torch.LongTensor] = None, + interpolate_pos_encoding: bool = False, + **kwargs: Unpack[TransformersKwargs], + ) -> Union[tuple, BlipTextVisionModelOutput]: + r""" + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, BlipForQuestionAnswering + + >>> model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base") + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> # training + >>> text = "How many cats are in the picture?" + >>> label = "2" + >>> inputs = processor(images=image, text=text, return_tensors="pt") + >>> labels = processor(text=label, return_tensors="pt").input_ids + + >>> inputs["labels"] = labels + >>> outputs = model(**inputs) + >>> loss = outputs.loss + >>> loss.backward() + + >>> # inference + >>> text = "How many cats are in the picture?" + >>> inputs = processor(images=image, text=text, return_tensors="pt") + >>> outputs = model.generate(**inputs) + >>> print(processor.decode(outputs[0], skip_special_tokens=True)) + 2 + ```""" + if labels is None and decoder_input_ids is None: + raise ValueError( + "Either `decoder_input_ids` or `labels` should be passed when calling `forward` with" + " `BlipForQuestionAnswering`. if you are training the model make sure that `labels` is passed, if you" + " are using the model for inference make sure that `decoder_input_ids` is passed or call `generate`" + ) + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + interpolate_pos_encoding=interpolate_pos_encoding, + **kwargs, + ) + + image_embeds = vision_outputs.last_hidden_state + image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long) + + question_embeds = self.text_encoder( + input_ids=input_ids, + attention_mask=attention_mask, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + **kwargs, + ) + + if labels is not None and decoder_input_ids is None: + # labels are already shifted right, see: https://github.com/huggingface/transformers/pull/23153 + decoder_input_ids = labels + + question_embeds = question_embeds[0] + + answer_output = self.text_decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + encoder_hidden_states=question_embeds, + encoder_attention_mask=attention_mask, + labels=labels, + reduction="mean", + **kwargs, + ) + + if labels is not None: + decoder_loss = answer_output.loss.mean() + else: + decoder_loss = None + + return BlipTextVisionModelOutput( + loss=decoder_loss, + image_embeds=image_embeds, + last_hidden_state=vision_outputs.last_hidden_state, + hidden_states=vision_outputs.hidden_states, + attentions=vision_outputs.attentions, + ) + + @torch.no_grad() + def generate( + self, + input_ids: torch.LongTensor, + pixel_values: torch.FloatTensor, + attention_mask: Optional[torch.LongTensor] = None, + interpolate_pos_encoding: bool = False, + **generate_kwargs, + ) -> torch.LongTensor: + r""" + Overrides *generate* function to be able to use the model as a conditional 
generator + + Parameters: + input_ids (*torch.LongTensor* of shape *(batch_size, sequence_length)*): + The sequence used as a prompt for the generation. + pixel_values (*torch.FloatTensor* of shape *(batch_size, num_channels, image_height, image_width)*: + Input image to be processed + attention_mask (*torch.LongTensor* of shape *(batch_size, sequence_length)*, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`. `1` for + tokens that are NOT MASKED, `0` for MASKED tokens. + **generate_kwargs: + Additional arguments passed to the *generate* function of the decoder + + + Examples: + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, BlipForQuestionAnswering + + >>> model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base") + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> text = "How many cats are in the picture?" + + >>> inputs = processor(images=image, text=text, return_tensors="pt") + + >>> outputs = model.generate(**inputs) + >>> print(processor.decode(outputs[0], skip_special_tokens=True)) + 2 + ``` + """ + vision_outputs = self.vision_model( + pixel_values=pixel_values, + interpolate_pos_encoding=interpolate_pos_encoding, + ) + + image_embeds = vision_outputs[0] + + image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device) + + if isinstance(input_ids, list): + input_ids = torch.LongTensor(input_ids) + + question_outputs = self.text_encoder( + input_ids=input_ids, + attention_mask=attention_mask, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + return_dict=False, + ) + + question_embeds = question_outputs[0] + + question_attention_mask = torch.ones( + question_embeds.size()[:-1], dtype=torch.long, device=question_embeds.device + ) + + bos_ids = torch.full( + (question_embeds.size(0), 1), fill_value=self.decoder_start_token_id, device=question_embeds.device + ) + + outputs = self.text_decoder.generate( + input_ids=bos_ids, + eos_token_id=self.config.text_config.sep_token_id, + pad_token_id=self.config.text_config.pad_token_id, + encoder_hidden_states=question_embeds, + encoder_attention_mask=question_attention_mask, + **generate_kwargs, + ) + + return outputs + + +@auto_docstring( + custom_intro=""" + BLIP Model with a vision and text projector, and a classification head on top. The model is used in the context of + image-text retrieval. Given an image and a text, the model returns the probability of the text being relevant to + the image. 
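+    With `use_itm_head=True` the returned `itm_score` holds two-way match/no-match logits computed from the
+    cross-attended multimodal [CLS] state; with `use_itm_head=False` it is the cosine similarity between the
+    separately projected image and text [CLS] features.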
+ """ +) +class BlipForImageTextRetrieval(BlipPreTrainedModel): + config: BlipConfig + + def __init__(self, config: BlipConfig): + super().__init__(config) + + self.vision_model = BlipVisionModel(config.vision_config) + + self.text_encoder = BlipTextModel(config.text_config, add_pooling_layer=False) + + # vision projection layer + self.vision_proj = nn.Linear(config.vision_config.hidden_size, config.image_text_hidden_size) + + # text projection layer + self.text_proj = nn.Linear(config.text_config.hidden_size, config.image_text_hidden_size) + + # image text matching head + self.itm_head = nn.Linear(config.text_config.hidden_size, 2) + + self.decoder_pad_token_id = ( + config.text_config.pad_token_id + if not hasattr(config, "decoder_pad_token_id") + else config.decoder_pad_token_id + ) + self.decoder_start_token_id = ( + config.text_config.bos_token_id + if not hasattr(config, "decoder_start_token_id") + else config.decoder_start_token_id + ) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.text_encoder.get_input_embeddings() + + def set_input_embeddings(self, value): + self.text_encoder.set_input_embeddings(value) + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: torch.LongTensor, + pixel_values: torch.FloatTensor, + use_itm_head: Optional[bool] = True, + attention_mask: Optional[torch.LongTensor] = None, + interpolate_pos_encoding: bool = False, + **kwargs: Unpack[TransformersKwargs], + ) -> Union[tuple, BlipTextVisionModelOutput]: + r""" + use_itm_head (`bool`, *optional*, defaults to `True`): + Whether or not to use the image-text matching head. + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, BlipForImageTextRetrieval + + >>> model = BlipForImageTextRetrieval.from_pretrained("Salesforce/blip-itm-base-coco") + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-itm-base-coco") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> text = "an image of a cat" + + >>> inputs = processor(images=image, text=text, return_tensors="pt") + >>> outputs = model(**inputs) + ``` + """ + vision_outputs = self.vision_model( + pixel_values=pixel_values, + interpolate_pos_encoding=interpolate_pos_encoding, + **kwargs, + ) + + image_embeds = vision_outputs.last_hidden_state + image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long) + + if use_itm_head: + question_embeds = self.text_encoder( + input_ids=input_ids, + attention_mask=attention_mask, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_atts, + **kwargs, + ) + question_embeds = question_embeds.last_hidden_state + + output = self.itm_head(question_embeds[:, 0, :]) + else: + question_embeds = self.text_encoder( + input_ids=input_ids, + attention_mask=attention_mask, + **kwargs, + ) + question_embeds = question_embeds.last_hidden_state + + image_feat = normalize(self.vision_proj(image_embeds[:, 0, :]), dim=-1) + text_feat = normalize(self.text_proj(question_embeds[:, 0, :]), dim=-1) + + output = image_feat @ text_feat.t() + + return BlipImageTextMatchingModelOutput( + itm_score=output, + last_hidden_state=vision_outputs.last_hidden_state, + hidden_states=vision_outputs.hidden_states, + attentions=vision_outputs.attentions, + question_embeds=question_embeds, + ) + + +__all__ = [ + "BlipModel", + "BlipPreTrainedModel", + "BlipForConditionalGeneration", + 
"BlipForQuestionAnswering", + "BlipVisionModel", + "BlipTextModel", + "BlipForImageTextRetrieval", +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/blip/modeling_blip_text.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/blip/modeling_blip_text.py new file mode 100644 index 0000000000000000000000000000000000000000..6f1f58c7533463ab9ba01f45a15ba51a294c0651 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/blip/modeling_blip_text.py @@ -0,0 +1,968 @@ +# coding=utf-8 +# Copyright 2022 The Salesforce Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the BSD-3-clause license (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import math +from typing import Optional, Union + +import torch +from torch import Tensor, device, nn +from torch.nn import CrossEntropyLoss + +from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache +from ...generation import GenerationMixin +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + CausalLMOutputWithCrossAttentions, +) +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import logging +from ...utils.deprecation import deprecate_kwarg +from .configuration_blip import BlipTextConfig + + +logger = logging.get_logger(__name__) + + +# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L52 +class BlipTextEmbeddings(nn.Module): + """Construct the embeddings from word and position embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + + self.config = config + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + past_key_values_length: int = 0, + ) -> torch.Tensor: + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, 
past_key_values_length : seq_length + past_key_values_length] + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + + embeddings = inputs_embeds + + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L97 +class BlipTextSelfAttention(nn.Module): + def __init__(self, config, is_cross_attention, layer_idx=None): + super().__init__() + self.config = config + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention heads (%d)" + % (config.hidden_size, config.num_attention_heads) + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.layer_idx = layer_idx + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + if is_cross_attention: + self.key = nn.Linear(config.encoder_hidden_size, self.all_head_size) + self.value = nn.Linear(config.encoder_hidden_size, self.all_head_size) + else: + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + + def save_attn_gradients(self, attn_gradients): + self.attn_gradients = attn_gradients + + def get_attn_gradients(self): + return self.attn_gradients + + def save_attention_map(self, attention_map): + self.attention_map = attention_map + + def get_attention_map(self): + return self.attention_map + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + cache_position: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor]: + batch_size, seq_length, _ = hidden_states.shape + query_layer = ( + self.query(hidden_states) + .view(batch_size, -1, self.num_attention_heads, self.attention_head_size) + .transpose(1, 2) + ) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. 
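+        # For cross-attention the keys and values below are projected from encoder_hidden_states (the image
+        # features); the queries always come from the text hidden states.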
+ is_cross_attention = encoder_hidden_states is not None + attention_mask = encoder_attention_mask if is_cross_attention else attention_mask + + is_updated = False + if past_key_values is not None: + if isinstance(past_key_values, EncoderDecoderCache): + is_updated = past_key_values.is_updated.get(self.layer_idx) + if is_cross_attention: + # after the first generated id, we can subsequently re-use all key/value_layer from cache + curr_past_key_value = past_key_values.cross_attention_cache + else: + curr_past_key_value = past_key_values.self_attention_cache + else: + curr_past_key_value = past_key_values + + current_states = encoder_hidden_states if is_cross_attention else hidden_states + if is_cross_attention and past_key_values is not None and is_updated: + # reuse k,v, cross_attentions + key_layer = curr_past_key_value.layers[self.layer_idx].keys + value_layer = curr_past_key_value.layers[self.layer_idx].values + else: + key_layer = ( + self.key(current_states) + .view(batch_size, -1, self.num_attention_heads, self.attention_head_size) + .transpose(1, 2) + ) + value_layer = ( + self.value(current_states) + .view(batch_size, -1, self.num_attention_heads, self.attention_head_size) + .transpose(1, 2) + ) + + if past_key_values is not None: + # save all key/value_layer to cache to be re-used for fast auto-regressive generation + cache_position = cache_position if not is_cross_attention else None + key_layer, value_layer = curr_past_key_value.update( + key_layer, value_layer, self.layer_idx, {"cache_position": cache_position} + ) + # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls + if is_cross_attention and isinstance(past_key_values, EncoderDecoderCache): + past_key_values.is_updated[self.layer_idx] = True + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BlipTextModel forward() function) + attention_scores = attention_scores + attention_mask.to(attention_scores.device) + + # Normalize the attention scores to probabilities. 
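+        # positions excluded by the additive attention_mask carry large negative scores, so the softmax assigns
+        # them (near-)zero probability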
+ attention_probs = nn.Softmax(dim=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs_dropped = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs_dropped = attention_probs_dropped * head_mask + + context_layer = torch.matmul(attention_probs_dropped, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + + return context_layer, attention_probs + + +# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert -> BlipText +class BlipTextSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#242 +class BlipTextAttention(nn.Module): + def __init__(self, config, is_cross_attention=False, layer_idx=None): + super().__init__() + self.self = BlipTextSelfAttention(config, is_cross_attention, layer_idx=layer_idx) + self.output = BlipTextSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + cache_position: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor]: + self_outputs = self.self( + hidden_states, + attention_mask=attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + past_key_values=past_key_values, + output_attentions=output_attentions, + cache_position=cache_position, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert -> BlipText +class BlipTextIntermediate(nn.Module): + def __init__(self, 
config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert -> BlipText +class BlipTextOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BlipTextLayer(GradientCheckpointingLayer): + def __init__(self, config, layer_num): + super().__init__() + self.config = config + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = BlipTextAttention(config, layer_idx=layer_num) + self.layer_num = layer_num + if self.config.is_decoder: + self.crossattention = BlipTextAttention( + config, is_cross_attention=self.config.is_decoder, layer_idx=layer_num + ) + self.intermediate = BlipTextIntermediate(config) + self.output = BlipTextOutput(config) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + cache_position: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor]: + self_attention_outputs = self.attention( + hidden_states, + attention_mask=attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + past_key_values=past_key_values, + cache_position=cache_position, + ) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:] + + if encoder_hidden_states is not None: + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask=encoder_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + past_key_values=past_key_values, + output_attentions=output_attentions, + cache_position=cache_position, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:] # add cross attentions if we output attention weights + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + return (layer_output,) + outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L386 +class BlipTextEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config 
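+        # one BlipTextLayer per hidden layer; when config.is_decoder is True each layer also carries a
+        # cross-attention block over the encoder (image) hidden states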
+ self.layer = nn.ModuleList([BlipTextLayer(config, i) for i in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Cache] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + return_dict: Optional[bool] = True, + cache_position: Optional[torch.Tensor] = None, + ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + if use_cache: + if isinstance(past_key_values, tuple): + logger.warning_once( + "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. " + "You should pass an instance of `EncoderDecoderCache` instead, e.g. " + "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`." + ) + past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values) + # The model acts as encoder decoder but is not an encoder decoder. So we cast all cache objects to + # `EncoderDecoderCache` type assuming that the incoming cache is from `self_attention` + elif isinstance(past_key_values, DynamicCache): + past_key_values = EncoderDecoderCache(past_key_values, DynamicCache(config=self.config)) + elif past_key_values is None: + past_key_values = EncoderDecoderCache( + DynamicCache(config=self.config), DynamicCache(config=self.config) + ) + + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and encoder_hidden_states is not None else None + + for i in range(self.config.num_hidden_layers): + layer_module = self.layer[i] + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_values, + output_attentions, + cache_position, + ) + + hidden_states = layer_outputs[0] + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if encoder_hidden_states is not None: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + past_key_values, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=past_key_values, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +# Copied from transformers.models.bert.modeling_bert.BertPooler with Bert->BlipText +class BlipTextPooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, 
hidden_states: torch.Tensor) -> torch.Tensor: + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +# Copied from transformers.models.bert.modeling_bert.BertPredictionHeadTransform with Bert->BlipText +class BlipTextPredictionHeadTransform(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertLMPredictionHead with Bert->BlipText +class BlipTextLMPredictionHead(nn.Module): + def __init__(self, config): + super().__init__() + self.transform = BlipTextPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def _tie_weights(self): + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertOnlyMLMHead with Bert->BlipText +class BlipTextOnlyMLMHead(nn.Module): + def __init__(self, config): + super().__init__() + self.predictions = BlipTextLMPredictionHead(config) + + def forward(self, sequence_output: torch.Tensor) -> torch.Tensor: + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L548 +class BlipTextPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config: BlipTextConfig + base_model_prefix = "bert" + _no_split_modules = [] + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Embedding)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + +# Adapted from https://github.com/salesforce/BLIP/blob/3a29b7410476bf5f2ba0955827390eb6ea1f4f9d/models/med.py#L571 +class BlipTextModel(BlipTextPreTrainedModel): + """ + The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of + cross-attention is added between the self-attention layers, following the architecture described in [Attention is + all you need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, + Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. argument and `is_decoder` set to `True`; an + `encoder_hidden_states` is then expected as an input to the forward pass. + """ + + def __init__(self, config, add_pooling_layer=True): + super().__init__(config) + self.config = config + + self.embeddings = BlipTextEmbeddings(config) + self.encoder = BlipTextEncoder(config) + self.pooler = BlipTextPooler(config) if add_pooling_layer else None + + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + # Copied from transformers.models.bert.modeling_bert.BertModel._prune_heads + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def get_extended_attention_mask( + self, attention_mask: Tensor, input_shape: tuple[int], device: device, is_decoder: bool + ) -> Tensor: + """ + Makes broadcastable attention and causal masks so that future and masked tokens are ignored. + + Arguments: + attention_mask (`torch.Tensor`): + Mask with ones indicating tokens to attend to, zeros for tokens to ignore. + input_shape (`tuple[int]`): + The shape of the input to the model. + device (`torch.device`): + The device of the input to the model. + + Returns: + `torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`. + """ + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
+ if attention_mask.dim() == 3: + extended_attention_mask = attention_mask[:, None, :, :] + elif attention_mask.dim() == 2: + # Provided a padding mask of dimensions [batch_size, seq_length] + # - if the model is a decoder, apply a causal mask in addition to the padding mask + # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] + if is_decoder: + batch_size, seq_length = input_shape + + seq_ids = torch.arange(seq_length, device=device) + causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None] + # in case past_key_values are used we need to add a prefix ones mask to the causal mask + causal_mask = causal_mask.to(attention_mask.dtype) + + if causal_mask.shape[1] < attention_mask.shape[1]: + prefix_seq_len = attention_mask.shape[1] - causal_mask.shape[1] + causal_mask = torch.cat( + [ + torch.ones( + (batch_size, seq_length, prefix_seq_len), device=device, dtype=causal_mask.dtype + ), + causal_mask, + ], + axis=-1, + ) + + extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :] + else: + extended_attention_mask = attention_mask[:, None, None, :] + else: + raise ValueError( + f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})" + ) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + extended_attention_mask = extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + return extended_attention_mask + + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + is_decoder: Optional[bool] = False, + cache_position: Optional[torch.Tensor] = None, + ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + r""" + encoder_hidden_states (`torch.FloatTensor`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (`Cache`, *optional*): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. 
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + batch_size, seq_length = input_shape + device = input_ids.device + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + batch_size, seq_length = input_shape + device = inputs_embeds.device + elif encoder_embeds is not None: + input_shape = encoder_embeds.size()[:-1] + batch_size, seq_length = input_shape + device = encoder_embeds.device + else: + raise ValueError("You have to specify either input_ids or inputs_embeds or encoder_embeds") + + past_key_values_length = 0 + if past_key_values is not None: + past_key_values_length = ( + past_key_values[0][0].shape[-2] + if not isinstance(past_key_values, Cache) + else past_key_values.get_seq_length() + ) + + if attention_mask is None: + attention_mask = torch.ones((batch_size, seq_length + past_key_values_length)).to(device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
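# --- Editor's aside (not part of the diff): why the all-ones default mask created above spans
# seq_length + past_key_values_length. During cached decoding only the newly generated ids are
# fed in, but attention still has to cover the cached prefix. Numbers are illustrative only.
past_length_example, new_tokens = 5, 1            # 5 cached positions, 1 freshly decoded token
mask_length = new_tokens + past_length_example    # one mask entry per key position, cache included
assert mask_length == 6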
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask( + attention_mask, input_shape, device, is_decoder + ) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if encoder_hidden_states is not None: + if isinstance(encoder_hidden_states, list): + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].size() + else: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + + if isinstance(encoder_attention_mask, list): + encoder_extended_attention_mask = [self.invert_attention_mask(mask) for mask in encoder_attention_mask] + elif encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + if encoder_embeds is None: + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + else: + embedding_output = encoder_embeds + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L811 +class BlipTextLMHeadModel(BlipTextPreTrainedModel, GenerationMixin): + _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"] + + def __init__(self, config): + super().__init__(config) + + self.bert = BlipTextModel(config, add_pooling_layer=False) + self.cls = BlipTextOnlyMLMHead(config) + self.label_smoothing = config.label_smoothing + + def get_input_embeddings(self): + return self.bert.get_input_embeddings() + + def set_input_embeddings(self, new_embeddings): + self.bert.set_input_embeddings(new_embeddings) + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + 
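# Editor's note (not part of the diff): the next line also re-points the head's bias at the
# incoming module's bias, so after set_output_embeddings the decoder projection and its bias
# stay consistent (mirroring the decoder.bias = self.bias link made in
# BlipTextLMPredictionHead.__init__ / _tie_weights above).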
self.cls.predictions.bias = new_embeddings.bias + + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + return_logits: Optional[bool] = False, + is_decoder: Optional[bool] = True, + reduction: Optional[str] = "mean", + cache_position: Optional[torch.Tensor] = None, + ) -> Union[tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: + r""" + encoder_hidden_states (`torch.FloatTensor`, *optional*): Sequence of + hidden-states at the output of the last layer of the encoder. Used in the cross-attention if the model is + configured as a decoder. + encoder_attention_mask (`torch.FloatTensor`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + labels (`torch.LongTensor`, *optional*): + Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in + `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are + ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]` + past_key_values (`Cache`, *optional*): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + is_decoder=is_decoder, + cache_position=cache_position, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + if return_logits: + return prediction_scores[:, :-1, :].contiguous() + + lm_loss = None + if labels is not None: + # we are doing next-token prediction; shift prediction scores and input ids by one + shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous() + labels = labels[:, 1:].contiguous().to(shifted_prediction_scores.device) + loss_fct = CrossEntropyLoss(reduction=reduction, label_smoothing=self.label_smoothing) + lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + if reduction == "none": + lm_loss = lm_loss.view(prediction_scores.size(0), -1).sum(1) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((lm_loss,) + output) if lm_loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=lm_loss, + logits=prediction_scores, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs): + # Overwrite -- hardcoded key return (`is_decoder=True`) + + model_inputs = super().prepare_inputs_for_generation( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + **model_kwargs, + ) + model_inputs["is_decoder"] = True + + return model_inputs + + +__all__ = ["BlipTextModel", "BlipTextLMHeadModel", "BlipTextPreTrainedModel"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/blip/modeling_tf_blip.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/blip/modeling_tf_blip.py new file mode 100644 index 0000000000000000000000000000000000000000..a1a1f7928273c69de30b4ba1c190435d91dfd509 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/blip/modeling_tf_blip.py @@ -0,0 +1,1709 @@ +# coding=utf-8 +# Copyright 2023 The Salesforce Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""TensorFlow BLIP model.""" + +from __future__ import annotations + +import warnings +from dataclasses import dataclass +from typing import Any + +import tensorflow as tf + +from ...modeling_tf_outputs import TFBaseModelOutput, TFBaseModelOutputWithPooling +from ...modeling_tf_utils import ( + TFPreTrainedModel, + get_initializer, + get_tf_activation, + keras, + keras_serializable, + shape_list, + unpack_inputs, +) +from ...tf_utils import check_embeddings_within_bounds, stable_softmax +from ...utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from .configuration_blip import BlipConfig, BlipTextConfig, BlipVisionConfig +from .modeling_tf_blip_text import BLIP_TEXT_INPUTS_DOCSTRING, TFBlipTextLMHeadModel, TFBlipTextModel + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "Salesforce/blip-vqa-base" + + +# Copied from transformers.models.clip.modeling_tf_clip.contrastive_loss +def contrastive_loss(logits: tf.Tensor) -> tf.Tensor: + return tf.math.reduce_mean( + keras.metrics.sparse_categorical_crossentropy( + y_true=tf.range(shape_list(logits)[0]), y_pred=logits, from_logits=True + ) + ) + + +# Copied from transformers.models.clip.modeling_tf_clip.clip_loss with clip->blip +def blip_loss(similarity: tf.Tensor) -> tf.Tensor: + caption_loss = contrastive_loss(similarity) + image_loss = contrastive_loss(tf.transpose(similarity)) + return (caption_loss + image_loss) / 2.0 + + +@dataclass +class TFBlipForConditionalGenerationModelOutput(ModelOutput): + """ + Adapted from the base class for vision model's outputs that also contains image embeddings of the pooling of the + last hidden states. This class also adds the loss term from the text decoder. + + Args: + loss (`tf.Tensor`, *optional*, returned when `labels` is provided, `tf.Tensor` of shape `(1,)`): + Language modeling loss from the text decoder. + logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`, *optional*): + Prediction scores of the language modeling head of the text decoder model. + image_embeds (`tf.Tensor` of shape `(batch_size, output_dim)`, *optional*): + The image embeddings obtained after applying the Vision Transformer model to the input image. + last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for + the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads.` + """ + + loss: tuple[tf.Tensor] | None = None + logits: tuple[tf.Tensor] | None = None + image_embeds: tf.Tensor | None = None + last_hidden_state: tf.Tensor | None = None + hidden_states: tuple[tf.Tensor, ...] | None = None + attentions: tuple[tf.Tensor, ...] 
| None = None + + @property + def decoder_logits(self): + warnings.warn( + "`decoder_logits` attribute is deprecated and will be removed in version 5 of Transformers." + " Please use the `logits` attribute to retrieve the final output instead.", + FutureWarning, + ) + return self.logits + + +@dataclass +class TFBlipTextVisionModelOutput(ModelOutput): + """ + Adapted from the base class for vision model's outputs that also contains image embeddings of the pooling of the + last hidden states. This class also adds the loss term from the text decoder. + + Args: + loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss from the text decoder. + image_embeds (`tf.Tensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): + The image embeddings obtained by applying the projection layer to the pooler_output. + last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for + the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: tf.Tensor | None = None + image_embeds: tf.Tensor | None = None + last_hidden_state: tf.Tensor | None = None + hidden_states: tuple[tf.Tensor, ...] | None = None + attentions: tuple[tf.Tensor, ...] | None = None + + +@dataclass +class TFBlipImageTextMatchingModelOutput(ModelOutput): + """ + Adapted from the base class for vision model's outputs that also contains image embeddings of the pooling of the + last hidden states. This class also adds the loss term from the text decoder as well as the image-text similarity + scores. + + Args: + itm_score (`tf.Tensor`): + The image-text similarity scores. + loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss from the text decoder. + image_embeds (`tf.Tensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): + The image embeddings obtained by applying the projection layer to the pooler_output. + last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for + the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. 
+ vision_pooler_output (`tf.Tensor` of shape `(batch_size, hidden_size)`, *optional*): + Last layer hidden-state of the vision of the vision-only branch of the model. + attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + question_embeds (`tf.Tensor`): + The question embeddings obtained by the text projection layer. + """ + + itm_score: tf.Tensor | None = None + loss: tf.Tensor | None = None + image_embeds: tf.Tensor | None = None + last_hidden_state: tf.Tensor | None = None + hidden_states: tuple[tf.Tensor, ...] | None = None + vision_pooler_output: tf.Tensor | None = None + attentions: tuple[tf.Tensor, ...] | None = None + question_embeds: tuple[tf.Tensor] | None = None + + +@dataclass +class TFBlipOutput(ModelOutput): + """ + Args: + loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`): + Contrastive loss for image-text similarity. + logits_per_image:(`tf.Tensor` of shape `(image_batch_size, text_batch_size)`): + The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text + similarity scores. + logits_per_text:(`tf.Tensor` of shape `(text_batch_size, image_batch_size)`): + The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image + similarity scores. + text_embeds(`tf.Tensor` of shape `(batch_size, output_dim`): + The text embeddings obtained by applying the projection layer to the pooled output of [`BlipTextModel`]. + image_embeds(`tf.Tensor` of shape `(batch_size, output_dim`): + The image embeddings obtained by applying the projection layer to the pooled output of [`BlipVisionModel`]. + text_model_output(`BaseModelOutputWithPooling`): + The output of the [`BlipTextModel`]. + vision_model_output(`BaseModelOutputWithPooling`): + The output of the [`BlipVisionModel`]. 
+ """ + + loss: tf.Tensor | None = None + logits_per_image: tf.Tensor | None = None + logits_per_text: tf.Tensor | None = None + text_embeds: tf.Tensor | None = None + image_embeds: tf.Tensor | None = None + text_model_output: TFBaseModelOutputWithPooling = None + vision_model_output: TFBaseModelOutputWithPooling = None + + def to_tuple(self) -> tuple[Any]: + return tuple( + self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple() + for k in self.keys() + ) + + +class TFBlipVisionEmbeddings(keras.layers.Layer): + def __init__(self, config: BlipVisionConfig, **kwargs): + super().__init__(**kwargs) + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.patch_embedding = keras.layers.Conv2D( + filters=self.embed_dim, + kernel_size=self.patch_size, + strides=self.patch_size, + kernel_initializer=get_initializer(self.config.initializer_range), + data_format="channels_last", + name="patch_embedding", + ) + + self.num_patches = (self.image_size // self.patch_size) ** 2 + self.num_positions = self.num_patches + 1 + + def build(self, input_shape=None): + self.class_embedding = self.add_weight( + shape=(1, 1, self.embed_dim), + initializer=get_initializer(self.config.initializer_range), + trainable=True, + name="class_embedding", + ) + + self.position_embedding = self.add_weight( + shape=(1, self.num_positions, self.embed_dim), + initializer=get_initializer(self.config.initializer_range), + trainable=True, + name="position_embedding", + ) + + if self.built: + return + self.built = True + if getattr(self, "patch_embedding", None) is not None: + with tf.name_scope(self.patch_embedding.name): + self.patch_embedding.build([None, None, None, 3]) + + def call(self, pixel_values: tf.Tensor) -> tf.Tensor: + # Input is channels-first, we transpose. PyTorch transposes after the conv because PyTorch + # likes channels-first convs. 
+ batch_size = tf.shape(pixel_values)[0] + pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1)) + patch_embeds = self.patch_embedding(pixel_values) + patch_embeds = tf.reshape(patch_embeds, (batch_size, self.num_patches, -1)) + + class_embeds = tf.broadcast_to(self.class_embedding, (batch_size, 1, self.embed_dim)) + embeddings = tf.concat([class_embeds, patch_embeds], axis=1) + embeddings = embeddings + self.position_embedding[:, : tf.shape(embeddings)[1], :] + return embeddings + + +# Copied from transformers.models.clip.modeling_tf_clip.TFCLIPTextEmbeddings with CLIP->Blip +class TFBlipTextEmbeddings(keras.layers.Layer): + def __init__(self, config: BlipTextConfig, **kwargs): + super().__init__(**kwargs) + + self.embed_dim = config.hidden_size + + self.config = config + + def build(self, input_shape: tf.TensorShape = None): + with tf.name_scope("token_embedding"): + self.weight = self.add_weight( + shape=(self.config.vocab_size, self.embed_dim), + initializer=get_initializer(self.config.initializer_factor * self.config.initializer_range), + trainable=True, + name="weight", + ) + + with tf.name_scope("position_embedding"): + self.position_embedding = self.add_weight( + shape=(self.config.max_position_embeddings, self.embed_dim), + initializer=get_initializer(self.config.initializer_factor * self.config.initializer_range), + trainable=True, + name="embeddings", + ) + + super().build(input_shape) + + def call( + self, + input_ids: tf.Tensor | None = None, + position_ids: tf.Tensor | None = None, + inputs_embeds: tf.Tensor | None = None, + ) -> tf.Tensor: + """ + Applies embedding based on inputs tensor. + + Returns: + final_embeddings (`tf.Tensor`): output embedding tensor. + """ + if input_ids is None and inputs_embeds is None: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if inputs_embeds is None: + check_embeddings_within_bounds(input_ids, self.config.vocab_size) + inputs_embeds = tf.gather(params=self.weight, indices=input_ids) + + input_shape = shape_list(inputs_embeds)[:-1] + + if position_ids is None: + position_ids = tf.expand_dims(tf.range(start=0, limit=input_shape[-1]), axis=0) + + position_embeds = tf.gather(params=self.position_embedding, indices=position_ids) + position_embeds = tf.tile(input=position_embeds, multiples=(input_shape[0], 1, 1)) + final_embeddings = inputs_embeds + position_embeds + + return final_embeddings + + +class TFBlipAttention(keras.layers.Layer): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." 
+ ) + self.scale = self.head_dim**-0.5 + self.dropout = keras.layers.Dropout(config.attention_dropout, name="dropout") + + self.qkv = keras.layers.Dense( + 3 * self.embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="qkv" + ) + + self.projection = keras.layers.Dense( + self.embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="projection" + ) + + def call( + self, + hidden_states: tf.Tensor, + head_mask: tf.Tensor | None = None, + output_attentions: bool | None = False, + training: bool | None = None, + ) -> tuple[tf.Tensor, tf.Tensor | None, tuple[tf.Tensor] | None]: + """Input shape: Batch x Time x Channel""" + + bsz, tgt_len, embed_dim = shape_list(hidden_states) + + mixed_qkv = self.qkv(hidden_states) + mixed_qkv = tf.reshape(mixed_qkv, (bsz, tgt_len, 3, self.num_heads, self.head_dim)) + mixed_qkv = tf.transpose(mixed_qkv, perm=(2, 0, 3, 1, 4)) + + query_states, key_states, value_states = mixed_qkv[0], mixed_qkv[1], mixed_qkv[2] + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = query_states @ tf.transpose(key_states, (0, 1, 3, 2)) + + attention_scores = attention_scores * self.scale + + # Normalize the attention scores to probabilities. + attention_probs = stable_softmax(attention_scores, axis=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs, training=training) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = tf.transpose(attention_probs @ value_states, perm=(0, 2, 1, 3)) + + new_context_layer_shape = shape_list(context_layer)[:-2] + [self.embed_dim] + context_layer = tf.reshape(context_layer, new_context_layer_shape) + + output = self.projection(context_layer) + + outputs = (output, attention_probs) if output_attentions else (output, None) + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dropout", None) is not None: + with tf.name_scope(self.dropout.name): + self.dropout.build(None) + if getattr(self, "qkv", None) is not None: + with tf.name_scope(self.qkv.name): + self.qkv.build([None, None, self.embed_dim]) + if getattr(self, "projection", None) is not None: + with tf.name_scope(self.projection.name): + self.projection.build([None, None, self.embed_dim]) + + +class TFBlipMLP(keras.layers.Layer): + def __init__(self, config: BlipConfig, **kwargs): + super().__init__(**kwargs) + + self.activation_fn = get_tf_activation(config.hidden_act) + + in_proj_std = (config.hidden_size**-0.5) * ((2 * config.num_hidden_layers) ** -0.5) + fc_std = (2 * config.hidden_size) ** -0.5 + + self.fc1 = keras.layers.Dense( + units=config.intermediate_size, kernel_initializer=get_initializer(fc_std), name="fc1" + ) + self.fc2 = keras.layers.Dense( + units=config.hidden_size, kernel_initializer=get_initializer(in_proj_std), name="fc2" + ) + self.config = config + + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + hidden_states = self.fc1(inputs=hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(inputs=hidden_states) + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "fc1", None) is not None: + with tf.name_scope(self.fc1.name): + self.fc1.build([None, None, 
self.config.hidden_size]) + if getattr(self, "fc2", None) is not None: + with tf.name_scope(self.fc2.name): + self.fc2.build([None, None, self.config.intermediate_size]) + + +class TFBlipEncoderLayer(keras.layers.Layer): + def __init__(self, config: BlipConfig, **kwargs): + super().__init__(**kwargs) + self.embed_dim = config.hidden_size + self.self_attn = TFBlipAttention(config, name="self_attn") + self.layer_norm1 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm1") + self.mlp = TFBlipMLP(config, name="mlp") + self.layer_norm2 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm2") + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + output_attentions: bool | None = False, + training: bool | None = None, + ) -> tuple[tf.Tensor]: + """ + Args: + hidden_states (`tf.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`tf.Tensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + `(config.encoder_attention_heads,)`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + head_mask=attention_mask, + output_attentions=output_attentions, + training=training, + ) + hidden_states = hidden_states + residual + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + + hidden_states = hidden_states + residual + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attn", None) is not None: + with tf.name_scope(self.self_attn.name): + self.self_attn.build(None) + if getattr(self, "layer_norm1", None) is not None: + with tf.name_scope(self.layer_norm1.name): + self.layer_norm1.build([None, None, self.embed_dim]) + if getattr(self, "mlp", None) is not None: + with tf.name_scope(self.mlp.name): + self.mlp.build(None) + if getattr(self, "layer_norm2", None) is not None: + with tf.name_scope(self.layer_norm2.name): + self.layer_norm2.build([None, None, self.embed_dim]) + + +class TFBlipPreTrainedModel(TFPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = BlipConfig + base_model_prefix = "blip" + _keys_to_ignore_on_load_missing = [r"position_ids"] + + +BLIP_START_DOCSTRING = r""" + This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`BlipConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. 
Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights. +""" + +BLIP_VISION_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using + [`BlipImageProcessor`]. See [`BlipImageProcessor.__call__`] for details. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + +BLIP_INPUTS_DOCSTRING = r""" + Args: + input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoProcessor`]. See [`BlipProcessor.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + position_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + pixel_values (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using + [`BlipImageProcessor`]. See [`BlipImageProcessor.__call__`] for details. + return_loss (`bool`, *optional*): + Whether or not to return the contrastive loss. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@keras_serializable +class TFBlipEncoder(keras.layers.Layer): + config_class = BlipConfig + """ + Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a + [`BlipEncoderLayer`]. + + Args: + config (`BlipConfig`): + The corresponding vision configuration for the `BlipEncoder`. 
+ """ + + def __init__(self, config: BlipConfig, **kwargs): + super().__init__(**kwargs) + self.config = config + self.layers = [TFBlipEncoderLayer(config, name=f"layers_._{i}") for i in range(config.num_hidden_layers)] + + @unpack_inputs + def call( + self, + inputs_embeds, + attention_mask: tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool | None = None, + ) -> tuple | TFBaseModelOutput: + r""" + Args: + inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Embedded representation of the inputs. Should be float, not int tokens. + attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + hidden_states = inputs_embeds + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + output_attentions=output_attentions, + training=training, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return TFBaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layers", None) is not None: + for layer in self.layers: + with tf.name_scope(layer.name): + layer.build(None) + + +class TFBlipVisionModel(TFBlipPreTrainedModel): + main_input_name = "pixel_values" + config_class = BlipVisionConfig + + def __init__(self, config: BlipVisionConfig, *args, **kwargs): + super().__init__(config, *args, **kwargs) + self.config = config + + self.embeddings = TFBlipVisionEmbeddings(config, name="embeddings") + self.encoder = TFBlipEncoder(config, name="encoder") + self.post_layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="post_layernorm") + self.embed_dim = config.hidden_size + + def serving_output(self, output: TFBaseModelOutputWithPooling) -> TFBaseModelOutputWithPooling: + hs = tf.convert_to_tensor(output.hidden_states) if 
self.config.output_hidden_states else None + attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None + + return TFBaseModelOutputWithPooling( + last_hidden_state=output.last_hidden_state, + pooler_output=output.pooler_output, + hidden_states=hs, + attentions=attns, + ) + + @unpack_inputs + @add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=TFBaseModelOutputWithPooling, config_class=BlipVisionConfig) + def call( + self, + pixel_values: tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool | None = None, + ) -> tuple | TFBaseModelOutputWithPooling: + r""" + Returns: + + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + hidden_states = self.embeddings(pixel_values) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + last_hidden_state = encoder_outputs[0] + last_hidden_state = self.post_layernorm(last_hidden_state) + + pooled_output = last_hidden_state[:, 0, :] + # TF gets confused if we call the layer with inputs of different ranks, so insert a singleton dimension + pooled_output = self.post_layernorm(tf.expand_dims(pooled_output, 1)) + pooled_output = tf.squeeze(pooled_output, 1) + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return TFBaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + def get_input_embeddings(self): + return self.embeddings + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embeddings", None) is not None: + with tf.name_scope(self.embeddings.name): + self.embeddings.build(None) + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "post_layernorm", None) is not None: + with tf.name_scope(self.post_layernorm.name): + self.post_layernorm.build([None, None, self.embed_dim]) + + +class TFBlipMainLayer(keras.layers.Layer): + config_class = BlipConfig + + def __init__(self, config: BlipConfig, *args, **kwargs): + super().__init__(*args, **kwargs) + + if not isinstance(config.text_config, BlipTextConfig): + raise TypeError( + "config.text_config is expected to be of type BlipTextConfig but is of type" + f" {type(config.text_config)}." + ) + + if not isinstance(config.vision_config, BlipVisionConfig): + raise TypeError( + "config.vision_config is expected to be of type BlipVisionConfig but is of type" + f" {type(config.vision_config)}." 
+ ) + + text_config = config.text_config + vision_config = config.vision_config + + self.projection_dim = config.projection_dim + self.text_embed_dim = text_config.hidden_size + self.vision_embed_dim = vision_config.hidden_size + + self.text_model = TFBlipTextModel(text_config, name="text_model") + self.vision_model = TFBlipVisionModel(vision_config, name="vision_model") + + self.visual_projection = keras.layers.Dense( + self.projection_dim, + use_bias=False, + kernel_initializer=get_initializer(config.initializer_range), + name="visual_projection", + ) + self.text_projection = keras.layers.Dense( + self.projection_dim, + use_bias=False, + kernel_initializer=get_initializer(config.initializer_range), + name="text_projection", + ) + + self.config = config + + def build(self, input_shape=None): + self.logit_scale = self.add_weight( + name="logit_scale", + shape=[], + initializer=keras.initializers.Constant(self.config.logit_scale_init_value), + trainable=True, + ) + + if self.built: + return + self.built = True + if getattr(self, "text_model", None) is not None: + with tf.name_scope(self.text_model.name): + self.text_model.build(None) + if getattr(self, "vision_model", None) is not None: + with tf.name_scope(self.vision_model.name): + self.vision_model.build(None) + if getattr(self, "visual_projection", None) is not None: + with tf.name_scope(self.visual_projection.name): + self.visual_projection.build([None, None, self.vision_embed_dim]) + if getattr(self, "text_projection", None) is not None: + with tf.name_scope(self.text_projection.name): + self.text_projection.build([None, None, self.text_embed_dim]) + + @unpack_inputs + def call( + self, + input_ids: tf.Tensor | None = None, + pixel_values: tf.Tensor | None = None, + attention_mask: tf.Tensor | None = None, + position_ids: tf.Tensor | None = None, + return_loss: bool | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool | None = None, + ) -> tuple | TFBlipOutput: + # Use BLIP model's config for some fields (if specified) instead of those of vision & text components. 
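# --- Editor's aside (not part of the diff): standalone sketch of the image-text contrastive
# step performed by the call() below, reusing the idea of contrastive_loss/blip_loss defined
# earlier in this file. Embeddings are random and the fixed logit_scale is illustrative only.
import tensorflow as tf

batch, dim = 4, 512
image_embeds = tf.math.l2_normalize(tf.random.normal((batch, dim)), axis=-1)
text_embeds = tf.math.l2_normalize(tf.random.normal((batch, dim)), axis=-1)

logit_scale = tf.exp(tf.constant(2.6592))  # learnable in the model; a constant stand-in here
logits_per_text = tf.matmul(text_embeds, image_embeds, transpose_b=True) * logit_scale

# Matching pairs sit on the diagonal, so row i is trained toward class i in both directions.
targets = tf.range(batch)
caption_loss = tf.reduce_mean(
    tf.keras.losses.sparse_categorical_crossentropy(targets, logits_per_text, from_logits=True)
)
image_loss = tf.reduce_mean(
    tf.keras.losses.sparse_categorical_crossentropy(targets, tf.transpose(logits_per_text), from_logits=True)
)
loss = (caption_loss + image_loss) / 2.0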
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + image_embeds = vision_outputs[1] + image_embeds = self.visual_projection(image_embeds) + + text_embeds = text_outputs[1] + text_embeds = self.text_projection(text_embeds) + + # normalized features + image_embeds = image_embeds / tf.norm(image_embeds, ord=2, axis=-1, keepdims=True) + text_embeds = text_embeds / tf.norm(text_embeds, ord=2, axis=-1, keepdims=True) + + # cosine similarity as logits + logit_scale = tf.exp(self.logit_scale) + logits_per_text = tf.matmul(text_embeds, image_embeds, transpose_b=True) * logit_scale + logits_per_image = tf.transpose(logits_per_text) + + loss = None + if return_loss: + loss = blip_loss(logits_per_text) + loss = tf.reshape(loss, (1,)) + + if not return_dict: + output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs) + return ((loss,) + output) if loss is not None else output + + return TFBlipOutput( + loss=loss, + logits_per_image=logits_per_image, + logits_per_text=logits_per_text, + text_embeds=text_embeds, + image_embeds=image_embeds, + text_model_output=text_outputs, + vision_model_output=vision_outputs, + ) + + +class TFBlipModel(TFBlipPreTrainedModel): + config_class = BlipConfig + _keys_to_ignore_on_load_missing = [r"text_decoder.cls.predictions.decoder.bias"] + main_input_name = "input_ids" + + def __init__(self, config: BlipConfig, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.blip = TFBlipMainLayer(config, name="blip") + + def serving_output(self, output: TFBlipOutput) -> TFBlipOutput: + return TFBlipOutput( + logits_per_image=output.logits_per_image, + logits_per_text=output.logits_per_text, + text_embeds=output.text_embeds, + image_embeds=output.image_embeds, + ) + + @unpack_inputs + @add_start_docstrings_to_model_forward(BLIP_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=TFBlipOutput, config_class=BlipConfig) + def call( + self, + input_ids: tf.Tensor | None = None, + pixel_values: tf.Tensor | None = None, + attention_mask: tf.Tensor | None = None, + position_ids: tf.Tensor | None = None, + return_loss: bool | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool | None = None, + ) -> tuple | TFBlipOutput: + r""" + Returns: + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, TFBlipModel + + >>> model = TFBlipModel.from_pretrained("Salesforce/blip-image-captioning-base") + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor( 
+ ... text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="tf", padding=True + ... ) + + >>> outputs = model(**inputs) + >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score + >>> probs = tf.nn.softmax(logits_per_image, axis=1) # we can take the softmax to get the label probabilities + ```""" + outputs = self.blip( + input_ids=input_ids, + pixel_values=pixel_values, + attention_mask=attention_mask, + position_ids=position_ids, + return_loss=return_loss, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + return outputs + + @add_start_docstrings_to_model_forward(BLIP_TEXT_INPUTS_DOCSTRING) + def get_text_features( + self, + input_ids: tf.Tensor | None = None, + attention_mask: tf.Tensor | None = None, + position_ids: tf.Tensor | None = None, + return_dict: bool | None = None, + ) -> tf.Tensor: + r""" + Returns: + text_features (`tf.Tensor` of shape `(batch_size, output_dim`): The text embeddings obtained by applying + the projection layer to the pooled output of [`TFBlipTextModel`]. + + Examples: + + ```python + >>> from transformers import AutoProcessor, TFBlipModel + + >>> model = TFBlipModel.from_pretrained("Salesforce/blip-image-captioning-base") + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base") + + >>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="tf") + >>> text_features = model.get_text_features(**inputs) + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + text_outputs = self.blip.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + return_dict=return_dict, + ) + + pooled_output = text_outputs[1] + text_features = self.blip.text_projection(pooled_output) + + return text_features + + @add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING) + def get_image_features( + self, + pixel_values: tf.Tensor | None = None, + return_dict: bool | None = None, + ) -> tf.Tensor: + r""" + Returns: + image_features (`tf.Tensor` of shape `(batch_size, output_dim`): The image embeddings obtained by applying + the projection layer to the pooled output of [`TFBlipVisionModel`]. + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, TFBlipModel + + >>> model = TFBlipModel.from_pretrained("Salesforce/blip-image-captioning-base") + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="tf") + + >>> image_features = model.get_image_features(**inputs) + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vision_outputs = self.blip.vision_model(pixel_values=pixel_values, return_dict=return_dict) + + pooled_output = vision_outputs[1] # pooled_output + image_features = self.blip.visual_projection(pooled_output) + + return image_features + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "blip", None) is not None: + with tf.name_scope(self.blip.name): + self.blip.build(None) + + +@add_start_docstrings( + """ + BLIP Model for image captioning. 
The model consists of a vision encoder and a text decoder. One can optionally pass + `input_ids` to the model, which serve as a text prompt, to make the text decoder continue the prompt. Otherwise, + the decoder starts generating text from the [BOS] (beginning-of-sequence) token. will start generating the caption + from the text input. If no text input is provided, the decoder will start with the [BOS] token only. + """, + BLIP_START_DOCSTRING, +) +class TFBlipForConditionalGeneration(TFBlipPreTrainedModel): + config_class = BlipConfig + _keys_to_ignore_on_load_missing = [r"text_decoder.cls.predictions.decoder.bias"] + main_input_name = "pixel_values" + + def __init__(self, config: BlipConfig, *args, **kwargs): + super().__init__(config, *args, **kwargs) + + self.vision_model = TFBlipVisionModel(config.vision_config, name="vision_model") + + self.text_decoder = TFBlipTextLMHeadModel(config.text_config, name="text_decoder") + + self.decoder_input_ids = config.text_config.bos_token_id + self.decoder_pad_token_id = config.text_config.pad_token_id + + def get_input_embeddings(self) -> keras.layers.Layer: + return self.vision_model.embeddings.patch_embedding + + @unpack_inputs + @add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=TFBlipForConditionalGenerationModelOutput, config_class=BlipConfig) + def call( + self, + pixel_values: tf.Tensor, + input_ids: tf.Tensor | None = None, + attention_mask: tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + labels: tf.Tensor | None = None, + return_dict: bool | None = None, + training: bool | None = None, + ) -> tuple | TFBlipForConditionalGenerationModelOutput: + r""" + Returns: + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, TFBlipForConditionalGeneration + + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base") + >>> model = TFBlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> text = "A picture of" + + >>> inputs = processor(images=image, text=text, return_tensors="tf") + + >>> outputs = model(**inputs) + ```""" + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + image_embeds = vision_outputs[0] + + outputs = self.text_decoder( + input_ids=input_ids, + attention_mask=attention_mask, + encoder_hidden_states=image_embeds, + labels=labels, + return_dict=False, + training=training, + ) + + if not return_dict: + outputs = (outputs[0], outputs[1], image_embeds, vision_outputs[0]) + vision_outputs[2:] + return tuple(output for output in outputs if output is not None) + + if labels is not None: + loss = outputs[0] + logits = outputs[1] + else: + loss = None + logits = outputs[0] + + if loss is not None and loss.shape.rank == 0: + loss = tf.reshape(loss, (1,)) + + return TFBlipForConditionalGenerationModelOutput( + loss=loss, + logits=logits, + image_embeds=image_embeds, + last_hidden_state=vision_outputs.last_hidden_state, + hidden_states=vision_outputs.hidden_states, + attentions=vision_outputs.attentions, + ) + + def 
generate( + self, + pixel_values: tf.Tensor, + input_ids: tf.Tensor | None = None, + attention_mask: tf.Tensor | None = None, + **generate_kwargs, + ) -> tf.Tensor: + r""" + Overrides *generate* function to be able to use the model as a conditional generator + + Parameters: + pixel_values (`tf.Tensor` of shape `(batch_size, num_channels, image_height, image_width)`: + Input image to be processed + input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + The sequence used as a prompt for the generation. + attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + + Examples: + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, TFBlipForConditionalGeneration + + >>> model = TFBlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base") + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="tf") + + >>> outputs = model.generate(**inputs) + >>> print(processor.decode(outputs[0], skip_special_tokens=True)) + two cats sleeping on a couch + ``` + """ + + batch_size = pixel_values.shape[0] + vision_outputs = self.vision_model(pixel_values=pixel_values) + + image_embeds = vision_outputs[0] + + image_attention_mask = tf.ones(shape_list(image_embeds)[:-1], dtype=tf.int32) + + if isinstance(input_ids, list): + input_ids = tf.convert_to_tensor(input_ids, dtype=tf.int32) + elif input_ids is None: + input_ids = tf.convert_to_tensor( + [[self.decoder_input_ids, self.config.text_config.eos_token_id]], dtype=tf.int32 + ) + + input_ids = tf.tile(input_ids, (batch_size, 1)) + + # PyTorch: input_ids[:, 0] = self.config.text_config.bos_token_id + input_ids = tf.concat( + [tf.ones((batch_size, 1), dtype=tf.int32) * self.config.text_config.bos_token_id, input_ids[:, 1:]], axis=1 + ) + attention_mask = attention_mask[:, :-1] if attention_mask is not None else None + + outputs = self.text_decoder.generate( + input_ids=input_ids[:, :-1], + eos_token_id=self.config.text_config.sep_token_id, + pad_token_id=self.config.text_config.pad_token_id, + attention_mask=attention_mask, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + **generate_kwargs, + ) + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "vision_model", None) is not None: + with tf.name_scope(self.vision_model.name): + self.vision_model.build(None) + if getattr(self, "text_decoder", None) is not None: + with tf.name_scope(self.text_decoder.name): + self.text_decoder.build(None) + + +@add_start_docstrings( + """ + BLIP Model for visual question answering. The model consists of a vision encoder, a text encoder as well as a text + decoder. The vision encoder will encode the input image, the text encoder will encode the input question together + with the encoding of the image, and the text decoder will output the answer to the question. 
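+
+    A rough sketch of the flow inside this class (simplified; variable names are illustrative and the real
+    `call`/`generate` methods below also build the required attention masks and special tokens):
+
+    ```python
+    image_embeds = model.vision_model(pixel_values=pixel_values)[0]
+    question_embeds = model.text_encoder(input_ids=input_ids, encoder_hidden_states=image_embeds)[0]
+    # bos_ids: a (batch_size, 1) tensor filled with the decoder start (BOS) token id
+    answer_ids = model.text_decoder.generate(input_ids=bos_ids, encoder_hidden_states=question_embeds)
+    ```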
+ """, + BLIP_START_DOCSTRING, +) +class TFBlipForQuestionAnswering(TFBlipPreTrainedModel): + config_class = BlipConfig + _keys_to_ignore_on_load_missing = [r"text_decoder.cls.predictions.decoder.bias"] + + def __init__(self, config: BlipConfig, *args, **kwargs): + super().__init__(config, *args, **kwargs) + + self.vision_model = TFBlipVisionModel(config.vision_config, name="vision_model") + + self.text_encoder = TFBlipTextModel(config.text_config, name="text_encoder", add_pooling_layer=False) + + self.text_decoder = TFBlipTextLMHeadModel(config.text_config, name="text_decoder") + + self.decoder_pad_token_id = config.text_config.pad_token_id + self.decoder_start_token_id = config.text_config.bos_token_id + + def get_input_embeddings(self) -> keras.layers.Layer: + return self.vision_model.embeddings.patch_embedding + + # Adapted from transformers.models.t5.modeling_tf_t5.TFT5PreTrainedModel._shift_right + def _shift_right(self, input_ids): + decoder_start_token_id = self.decoder_start_token_id + pad_token_id = self.decoder_pad_token_id + + if decoder_start_token_id is None or pad_token_id is None: + raise ValueError("decoder_start_token_id and pad_token_id must be defined!") + + start_tokens = tf.fill((shape_list(input_ids)[0], 1), decoder_start_token_id) + start_tokens = tf.cast(start_tokens, input_ids.dtype) # Ensure compatible dtypes for concatenation + shifted_input_ids = tf.concat([start_tokens, input_ids[:, :-1]], -1) + + # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids = tf.where( + shifted_input_ids == -100, + tf.cast(tf.fill(shape_list(shifted_input_ids), pad_token_id), shifted_input_ids.dtype), + shifted_input_ids, + ) + + # "Verify that `labels` has only positive values and -100" + tf.debugging.assert_greater_equal(shifted_input_ids, tf.constant(0, dtype=shifted_input_ids.dtype)) + + return shifted_input_ids + + @unpack_inputs + @add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=TFBlipTextVisionModelOutput, config_class=BlipVisionConfig) + def call( + self, + input_ids: tf.Tensor, + pixel_values: tf.Tensor | None = None, + decoder_input_ids: tf.Tensor | None = None, + decoder_attention_mask: tf.Tensor | None = None, + attention_mask: tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + labels: tf.Tensor | None = None, + return_dict: bool | None = None, + training: bool | None = None, + ) -> tuple | TFBlipTextVisionModelOutput: + r""" + Returns: + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, TFBlipForQuestionAnswering + + >>> model = TFBlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base") + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> # training + >>> text = "How many cats are in the picture?" + >>> label = "2" + >>> inputs = processor(images=image, text=text, return_tensors="tf") + >>> labels = processor(text=label, return_tensors="tf").input_ids + + >>> inputs["labels"] = labels + >>> outputs = model(**inputs) + >>> loss = outputs.loss + + >>> # inference + >>> text = "How many cats are in the picture?" 
+ >>> inputs = processor(images=image, text=text, return_tensors="tf") + >>> outputs = model.generate(**inputs) + >>> print(processor.decode(outputs[0], skip_special_tokens=True)) + 2 + ```""" + if labels is None and decoder_input_ids is None: + raise ValueError( + "Either `decoder_input_ids` or `labels` should be passed when calling" + " `TFBlipForQuestionAnswering`. if you are training the model make sure that `labels` is passed, if you" + " are using the model for inference make sure that `decoder_input_ids` is passed or call `generate`" + ) + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + image_embeds = vision_outputs[0] + image_attention_mask = tf.ones(shape_list(image_embeds)[:-1], dtype=tf.int64) + + question_embeds = self.text_encoder( + input_ids=input_ids, + attention_mask=attention_mask, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + return_dict=return_dict, + training=training, + ) + + question_embeds = question_embeds[0] if not return_dict else question_embeds.last_hidden_state + + if labels is not None and decoder_input_ids is None: + # labels are already shifted right, see: https://github.com/huggingface/transformers/pull/23153 + decoder_input_ids = labels + + answer_output = self.text_decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + encoder_hidden_states=question_embeds, + encoder_attention_mask=attention_mask, + labels=labels, + return_dict=return_dict, + training=training, + ) + + if labels is not None: + decoder_loss = tf.reduce_mean(answer_output.loss) if return_dict else tf.reduce_mean(answer_output[0]) + else: + decoder_loss = None + + if not return_dict: + outputs = (decoder_loss, image_embeds, vision_outputs[0]) + vision_outputs[2:] + return tuple(output for output in outputs if output is not None) + + return TFBlipTextVisionModelOutput( + loss=decoder_loss, + image_embeds=image_embeds, + last_hidden_state=vision_outputs.last_hidden_state, + hidden_states=vision_outputs.hidden_states, + attentions=vision_outputs.attentions, + ) + + def generate( + self, + input_ids: tf.Tensor, + pixel_values: tf.Tensor, + attention_mask: tf.Tensor | None = None, + **generate_kwargs, + ) -> tf.Tensor: + r""" + Overrides *generate* function to be able to use the model as a conditional generator + + Parameters: + input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`): + The sequence used as a prompt for the generation. + pixel_values (`tf.Tensor` of shape `(batch_size, num_channels, image_height, image_width)`: + Input image to be processed + attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`. `1` for + tokens that are NOT MASKED, `0` for MASKED tokens. 
+ generate_kwargs (dict, *optional*): + Additional arguments passed to the `generate` function of the decoder + + + Examples: + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, TFBlipForQuestionAnswering + + >>> model = TFBlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base") + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> text = "How many cats are in the picture?" + + >>> inputs = processor(images=image, text=text, return_tensors="tf") + + >>> outputs = model.generate(**inputs) + >>> print(processor.decode(outputs[0], skip_special_tokens=True)) + 2 + ``` + """ + vision_outputs = self.vision_model(pixel_values=pixel_values) + + image_embeds = vision_outputs[0] + + image_attention_mask = tf.ones(shape_list(image_embeds)[:-1], dtype=tf.int32) + + if isinstance(input_ids, list): + input_ids = tf.Tensor(input_ids) + + question_outputs = self.text_encoder( + input_ids=input_ids, + attention_mask=attention_mask, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + return_dict=False, + ) + + question_embeds = question_outputs[0] + + question_attention_mask = tf.ones(shape_list(question_embeds)[:-1], dtype=tf.int32) + + bos_ids = tf.fill( + (tf.shape(question_embeds)[0], 1), value=tf.cast(self.decoder_start_token_id, input_ids.dtype) + ) + + outputs = self.text_decoder.generate( + input_ids=bos_ids, + eos_token_id=self.config.text_config.sep_token_id, + pad_token_id=self.config.text_config.pad_token_id, + encoder_hidden_states=question_embeds, + encoder_attention_mask=question_attention_mask, + **generate_kwargs, + ) + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "vision_model", None) is not None: + with tf.name_scope(self.vision_model.name): + self.vision_model.build(None) + if getattr(self, "text_encoder", None) is not None: + with tf.name_scope(self.text_encoder.name): + self.text_encoder.build(None) + if getattr(self, "text_decoder", None) is not None: + with tf.name_scope(self.text_decoder.name): + self.text_decoder.build(None) + + +@add_start_docstrings( + """ + BLIP Model with a vision and text projector, and a classification head on top. The model is used in the context of + image-text retrieval. Given an image and a text, the model returns the probability of the text being relevant to + the image. 
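+
+    As a rough illustration (assuming `outputs = model(**inputs)` as in the example in `call` below, with the
+    default `use_itm_head=True`), the two image-text matching logits can be converted to a probability:
+
+    ```python
+    probs = tf.nn.softmax(outputs.itm_score, axis=-1)  # probs[:, 1] is the probability that text and image match
+    ```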
+ """, + BLIP_START_DOCSTRING, +) +class TFBlipForImageTextRetrieval(TFBlipPreTrainedModel): + config_class = BlipConfig + + def __init__(self, config: BlipConfig, *args, **kwargs): + super().__init__(config, *args, **kwargs) + + self.vision_model = TFBlipVisionModel(config.vision_config, name="vision_model") + + self.text_encoder = TFBlipTextModel(config.text_config, name="text_encoder", add_pooling_layer=False) + + # vision projection layer + self.vision_proj = keras.layers.Dense( + config.image_text_hidden_size, + kernel_initializer=get_initializer(config.initializer_range), + name="vision_proj", + ) + + # text projection layer + self.text_proj = keras.layers.Dense( + config.image_text_hidden_size, + kernel_initializer=get_initializer(config.initializer_range), + name="text_proj", + ) + + # image text matching head + self.itm_head = keras.layers.Dense( + 2, kernel_initializer=get_initializer(config.initializer_range), name="itm_head" + ) + + self.decoder_pad_token_id = ( + config.text_config.pad_token_id + if not hasattr(config, "decoder_pad_token_id") + else config.decoder_pad_token_id + ) + self.decoder_start_token_id = ( + config.text_config.bos_token_id + if not hasattr(config, "decoder_start_token_id") + else config.decoder_start_token_id + ) + self.config = config + + def get_input_embeddings(self) -> keras.layers.Layer: + return self.vision_model.embeddings.patch_embedding + + @unpack_inputs + @add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=TFBlipImageTextMatchingModelOutput, config_class=BlipVisionConfig) + def call( + self, + input_ids: tf.Tensor, + pixel_values: tf.Tensor | None = None, + use_itm_head: bool | None = True, + attention_mask: tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool | None = None, + ) -> tuple | TFBlipImageTextMatchingModelOutput: + r""" + Returns: + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, TFBlipForImageTextRetrieval + + >>> model = TFBlipForImageTextRetrieval.from_pretrained("Salesforce/blip-itm-base-coco") + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-itm-base-coco") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> text = "an image of a cat" + + >>> inputs = processor(images=image, text=text, return_tensors="tf") + >>> outputs = model(**inputs) + ``` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + image_embeds = vision_outputs[0] + image_atts = tf.ones(shape_list(image_embeds)[:-1], dtype=tf.int64) + + # Matt: In PyTorch, only one path (itm/non-itm) is taken. However, in TensorFlow this can result in + # some layers not being built! To avoid this, we always call both paths, then use an if statement to select + # which output to pass to the final output. The unnecessary nodes will be pruned from the final graph, but + # not before the layers have all been built correctly. 
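+        # As a consequence, both branches below always run; `use_itm_head` only selects which result is returned.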
+ itm_question_embeds = self.text_encoder( + input_ids=input_ids, + attention_mask=attention_mask, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_atts, + return_dict=return_dict, + training=training, + ) + itm_question_embeds = itm_question_embeds[0] if not return_dict else itm_question_embeds.last_hidden_state + + itm_output = self.itm_head(itm_question_embeds[:, 0, :]) + + no_itm_question_embeds = self.text_encoder( + input_ids=input_ids, + attention_mask=attention_mask, + return_dict=return_dict, + training=training, + ) + no_itm_question_embeds = ( + no_itm_question_embeds[0] if not return_dict else no_itm_question_embeds.last_hidden_state + ) + + image_feat, _ = tf.linalg.normalize(self.vision_proj(image_embeds[:, 0, :]), ord=2, axis=-1) + text_feat, _ = tf.linalg.normalize(self.text_proj(no_itm_question_embeds[:, 0, :]), ord=2, axis=-1) + + no_itm_output = tf.matmul(image_feat, text_feat, transpose_b=True) + + if use_itm_head: + output = itm_output + question_embeds = itm_question_embeds + else: + output = no_itm_output + question_embeds = no_itm_question_embeds + + if not return_dict: + outputs = (output, vision_outputs[0]) + vision_outputs[2:] + (question_embeds,) + return tuple(output for output in outputs if output is not None) + + return TFBlipImageTextMatchingModelOutput( + itm_score=output, + last_hidden_state=vision_outputs.last_hidden_state, + hidden_states=vision_outputs.hidden_states, + attentions=vision_outputs.attentions, + question_embeds=question_embeds, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "vision_model", None) is not None: + with tf.name_scope(self.vision_model.name): + self.vision_model.build(None) + if getattr(self, "text_encoder", None) is not None: + with tf.name_scope(self.text_encoder.name): + self.text_encoder.build(None) + if getattr(self, "vision_proj", None) is not None: + with tf.name_scope(self.vision_proj.name): + self.vision_proj.build([None, None, self.config.vision_config.hidden_size]) + if getattr(self, "text_proj", None) is not None: + with tf.name_scope(self.text_proj.name): + self.text_proj.build([None, None, self.config.text_config.hidden_size]) + if getattr(self, "itm_head", None) is not None: + with tf.name_scope(self.itm_head.name): + self.itm_head.build([None, None, self.config.text_config.hidden_size]) + + +__all__ = [ + "TFBlipModel", + "TFBlipPreTrainedModel", + "TFBlipForConditionalGeneration", + "TFBlipForQuestionAnswering", + "TFBlipVisionModel", + "TFBlipTextModel", + "TFBlipForImageTextRetrieval", +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/blip/modeling_tf_blip_text.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/blip/modeling_tf_blip_text.py new file mode 100644 index 0000000000000000000000000000000000000000..7dae1126e03b12b39b17b8fdc00b6050f9681fcd --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/blip/modeling_tf_blip_text.py @@ -0,0 +1,1122 @@ +# coding=utf-8 +# Copyright 2023 The Salesforce Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the BSD-3-clause license (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from __future__ import annotations + +import math + +import tensorflow as tf + +from ...modeling_tf_outputs import ( + TFBaseModelOutputWithPastAndCrossAttentions, + TFBaseModelOutputWithPoolingAndCrossAttentions, + TFCausalLMOutputWithCrossAttentions, +) +from ...modeling_tf_utils import ( + TFModelInputType, + TFPreTrainedModel, + get_initializer, + get_tf_activation, + keras, + keras_serializable, + shape_list, + unpack_inputs, +) +from ...tf_utils import check_embeddings_within_bounds, invert_attention_mask, stable_softmax +from ...utils import add_start_docstrings_to_model_forward, logging +from .configuration_blip import BlipTextConfig + + +logger = logging.get_logger(__name__) + +BLIP_TEXT_INPUTS_DOCSTRING = r""" + Args: + input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoProcessor`]. See [`BlipProcessor.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + position_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
+""" + + +# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L52 +class TFBlipTextEmbeddings(keras.layers.Layer): + """Construct the embeddings from word and position embeddings.""" + + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + self.word_embeddings = keras.layers.Embedding( + config.vocab_size, + config.hidden_size, + embeddings_initializer=get_initializer(config.initializer_range), + name="word_embeddings", + ) + self.position_embeddings = keras.layers.Embedding( + config.max_position_embeddings, + config.hidden_size, + embeddings_initializer=get_initializer(config.initializer_range), + name="position_embeddings", + ) + + # self.LayerNorm is not snake-cased to stick with PyTorch model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(config.hidden_dropout_prob, name="dropout") + + self.position_ids = tf.expand_dims(tf.range(config.max_position_embeddings), 0) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + + self.config = config + + def call(self, input_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0, training=None): + if input_ids is not None: + input_shape = tf.shape(input_ids) + else: + input_shape = tf.shape(inputs_embeds)[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] + + if inputs_embeds is None: + check_embeddings_within_bounds(input_ids, self.config.vocab_size) + inputs_embeds = self.word_embeddings(input_ids) + + embeddings = inputs_embeds + + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings, training=training) + return embeddings + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "word_embeddings", None) is not None: + with tf.name_scope(self.word_embeddings.name): + self.word_embeddings.build(None) + if getattr(self, "position_embeddings", None) is not None: + with tf.name_scope(self.position_embeddings.name): + self.position_embeddings.build(None) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) + if getattr(self, "dropout", None) is not None: + with tf.name_scope(self.dropout.name): + self.dropout.build(None) + + +# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L97 +class TFBlipTextSelfAttention(keras.layers.Layer): + def __init__(self, config, is_cross_attention, **kwargs): + super().__init__(**kwargs) + self.config = config + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention heads (%d)" + % (config.hidden_size, config.num_attention_heads) + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = keras.layers.Dense( + self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query" + ) + 
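+        # The query/key/value projections each map to num_attention_heads * attention_head_size features.
+        # For cross-attention, the key/value projections are applied to the encoder hidden states (see call()).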
self.key = keras.layers.Dense( + self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key" + ) + self.value = keras.layers.Dense( + self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" + ) + + self.dropout = keras.layers.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = keras.layers.Embedding( + 2 * config.max_position_embeddings - 1, self.attention_head_size + ) + self.is_cross_attention = is_cross_attention + + def transpose_for_scores(self, x): + new_x_shape = tf.concat( + [tf.shape(x)[:-1], tf.constant([self.num_attention_heads, self.attention_head_size], dtype=tf.int32)], + axis=0, + ) + x = tf.reshape(x, new_x_shape) + return tf.transpose(x, perm=(0, 2, 1, 3)) + + def call( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + training=None, + ): + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = tf.concat([past_key_value[0], key_layer], axis=2) + value_layer = tf.concat([past_key_value[1], value_layer], axis=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. 
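+        # query_layer and key_layer have shape [batch, num_heads, seq_len, head_size], so the raw scores
+        # computed below have shape [batch, num_heads, query_len, key_len].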
+        attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
+
+        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
+            seq_length = shape_list(hidden_states)[1]
+            position_ids_l = tf.expand_dims(tf.range(seq_length, dtype=tf.int64), 1)
+            position_ids_r = tf.expand_dims(tf.range(seq_length, dtype=tf.int64), 0)
+            distance = position_ids_l - position_ids_r
+            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
+            positional_embedding = tf.cast(positional_embedding, query_layer.dtype)  # fp16 compatibility
+
+            if self.position_embedding_type == "relative_key":
+                relative_position_scores = tf.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
+                attention_scores = attention_scores + relative_position_scores
+            elif self.position_embedding_type == "relative_key_query":
+                relative_position_scores_query = tf.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
+                relative_position_scores_key = tf.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
+                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key
+
+        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+        if attention_mask is not None:
+            # Apply the attention mask (precomputed for all layers in the TFBlipTextModel call() function)
+            attention_scores = attention_scores + tf.cast(attention_mask, attention_scores.dtype)
+
+        # Normalize the attention scores to probabilities.
+        attention_probs = stable_softmax(attention_scores, axis=-1)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+ attention_probs_dropped = self.dropout(attention_probs, training=training) + + # Mask heads if we want to + if head_mask is not None: + attention_probs_dropped = attention_probs_dropped * head_mask + + context_layer = attention_probs_dropped @ value_layer + + context_layer = tf.transpose(context_layer, perm=(0, 2, 1, 3)) + new_context_layer_shape = shape_list(context_layer)[:-2] + [self.all_head_size] + context_layer = tf.reshape(context_layer, new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + outputs = outputs + (past_key_value,) + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "query", None) is not None: + with tf.name_scope(self.query.name): + self.query.build([None, None, self.config.hidden_size]) + if self.is_cross_attention: + if getattr(self, "key", None) is not None: + with tf.name_scope(self.key.name): + self.key.build([None, None, self.config.encoder_hidden_size]) + if getattr(self, "value", None) is not None: + with tf.name_scope(self.value.name): + self.value.build([None, None, self.config.encoder_hidden_size]) + else: + if getattr(self, "key", None) is not None: + with tf.name_scope(self.key.name): + self.key.build([None, None, self.config.hidden_size]) + if getattr(self, "value", None) is not None: + with tf.name_scope(self.value.name): + self.value.build([None, None, self.config.hidden_size]) + + +class TFBlipTextSelfOutput(keras.layers.Layer): + def __init__(self, config: BlipTextConfig, **kwargs): + super().__init__(**kwargs) + + self.dense = keras.layers.Dense( + units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config + + def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool | None = None) -> tf.Tensor: + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.dropout(inputs=hidden_states, training=training) + hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) + + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) + + +# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#242 +class TFBlipTextAttention(keras.layers.Layer): + def __init__(self, config, is_cross_attention=False, **kwargs): + super().__init__(**kwargs) + self.self = TFBlipTextSelfAttention(config, is_cross_attention, name="self") + # "output" is a protected attribute on TF models + self.self_output = TFBlipTextSelfOutput(config, name="output") + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor | None = None, + head_mask: tf.Tensor | None = None, + encoder_hidden_states: tf.Tensor | None = None, + encoder_attention_mask: tf.Tensor | None = None, + past_key_value: tuple[tuple[tf.Tensor]] | None = None, + output_attentions: bool | None = False, + training: bool | None = None, + ): + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, 
+ encoder_attention_mask, + past_key_value, + output_attentions, + training=training, + ) + attention_output = self.self_output(self_outputs[0], hidden_states, training=training) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self", None) is not None: + with tf.name_scope(self.self.name): + self.self.build(None) + if getattr(self, "self_output", None) is not None: + with tf.name_scope(self.self_output.name): + self.self_output.build(None) + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->BlipText +class TFBlipTextIntermediate(keras.layers.Layer): + def __init__(self, config: BlipTextConfig, **kwargs): + super().__init__(**kwargs) + + self.dense = keras.layers.Dense( + units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = get_tf_activation(config.hidden_act) + else: + self.intermediate_act_fn = config.hidden_act + self.config = config + + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + + +class TFBlipTextOutput(keras.layers.Layer): + def __init__(self, config: BlipTextConfig, **kwargs): + super().__init__(**kwargs) + + self.dense = keras.layers.Dense( + units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config + + def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.dropout(inputs=hidden_states, training=training) + hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) + + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.intermediate_size]) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) + + +class TFBlipTextLayer(keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + self.config = config + self.attention = TFBlipTextAttention(config, name="attention") + if self.config.is_decoder: + self.crossattention = TFBlipTextAttention( + config, is_cross_attention=self.config.is_decoder, name="crossattention" + ) + self.intermediate = TFBlipTextIntermediate(config, name="intermediate") + self.self_output = TFBlipTextOutput(config, name="output") + + def call( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + training=None, + ): + # decoder uni-directional self-attention cached key/values 
tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + training=training, + ) + attention_output = self_attention_outputs[0] + + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + + if encoder_hidden_states is not None: + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + output_attentions=output_attentions, + training=training, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + intermediate_output = self.intermediate(attention_output) + layer_output = self.self_output(intermediate_output, attention_output, training=training) + outputs = (layer_output,) + outputs + + outputs = outputs + (present_key_value,) + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "attention", None) is not None: + with tf.name_scope(self.attention.name): + self.attention.build(None) + if getattr(self, "intermediate", None) is not None: + with tf.name_scope(self.intermediate.name): + self.intermediate.build(None) + if getattr(self, "self_output", None) is not None: + with tf.name_scope(self.self_output.name): + self.self_output.build(None) + if getattr(self, "crossattention", None) is not None: + with tf.name_scope(self.crossattention.name): + self.crossattention.build(None) + + +# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L386 +@keras_serializable +class TFBlipTextEncoder(keras.layers.Layer): + config_class = BlipTextConfig + + def __init__(self, config, name=None, **kwargs): + super().__init__(name=name, **kwargs) + self.config = config + self.layer = [TFBlipTextLayer(config, name=f"layer_._{i}") for i in range(config.num_hidden_layers)] + + @unpack_inputs + def call( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + training=None, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.is_decoder else None + + next_decoder_cache = () if use_cache else None + + for i in range(self.config.num_hidden_layers): + layer_module = self.layer[i] + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + training=training, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not 
return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return TFBaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layer", None) is not None: + for layer in self.layer: + with tf.name_scope(layer.name): + layer.build(None) + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->BlipText +class TFBlipTextPooler(keras.layers.Layer): + def __init__(self, config: BlipTextConfig, **kwargs): + super().__init__(**kwargs) + + self.dense = keras.layers.Dense( + units=config.hidden_size, + kernel_initializer=get_initializer(config.initializer_range), + activation="tanh", + name="dense", + ) + self.config = config + + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(inputs=first_token_tensor) + + return pooled_output + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertPredictionHeadTransform with Bert->BlipText +class TFBlipTextPredictionHeadTransform(keras.layers.Layer): + def __init__(self, config: BlipTextConfig, **kwargs): + super().__init__(**kwargs) + + self.dense = keras.layers.Dense( + units=config.hidden_size, + kernel_initializer=get_initializer(config.initializer_range), + name="dense", + ) + + if isinstance(config.hidden_act, str): + self.transform_act_fn = get_tf_activation(config.hidden_act) + else: + self.transform_act_fn = config.hidden_act + + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.config = config + + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(inputs=hidden_states) + + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) + + +class TFBlipTextLMPredictionHead(keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + self.transform = TFBlipTextPredictionHeadTransform(config, name="transform") + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. 
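+        # The Dense layer below is created without a bias; the per-token bias is created in build() and added
+        # explicitly in call().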
+ self.decoder = keras.layers.Dense( + config.vocab_size, + kernel_initializer=get_initializer(config.initializer_range), + name="decoder", + use_bias=False, + ) + self.config = config + + def build(self, input_shape=None): + self.bias = self.add_weight(name="bias", shape=(self.config.vocab_size,), initializer="zeros", trainable=True) + + if self.built: + return + self.built = True + if getattr(self, "transform", None) is not None: + with tf.name_scope(self.transform.name): + self.transform.build(None) + if getattr(self, "decoder", None) is not None: + with tf.name_scope(self.decoder.name): + self.decoder.build([None, None, self.config.hidden_size]) + + def call(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + self.bias + return hidden_states + + +class TFBlipTextOnlyMLMHead(keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + self.predictions = TFBlipTextLMPredictionHead(config, name="predictions") + + def call(self, sequence_output: tf.Tensor) -> tf.Tensor: + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "predictions", None) is not None: + with tf.name_scope(self.predictions.name): + self.predictions.build(None) + + +# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L548 +class TFBlipTextPreTrainedModel(TFPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = BlipTextConfig + base_model_prefix = "bert" + _keys_to_ignore_on_load_missing = [r"position_ids"] + + +# Adapted from https://github.com/salesforce/BLIP/blob/3a29b7410476bf5f2ba0955827390eb6ea1f4f9d/models/med.py#L571 +class TFBlipTextModel(TFBlipTextPreTrainedModel): + """ + The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of + cross-attention is added between the self-attention layers, following the architecture described in [Attention is + all you need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, + Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. argument and `is_decoder` set to `True`; an + `encoder_hidden_states` is then expected as an input to the forward pass. + """ + + def __init__(self, config, add_pooling_layer=True, name=None, **kwargs): + super().__init__(config, name=name, **kwargs) + self.config = config + + self.embeddings = TFBlipTextEmbeddings(config, name="embeddings") + self.encoder = TFBlipTextEncoder(config, name="encoder") + self.pooler = TFBlipTextPooler(config, name="pooler") if add_pooling_layer else None + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + @tf.function + def get_extended_attention_mask( + self, attention_mask: tf.Tensor, input_shape: tuple[int], is_decoder: bool + ) -> tf.Tensor: + """ + Makes broadcastable attention and causal masks so that future and masked tokens are ignored. + + Arguments: + attention_mask (`tf.Tensor`): + Mask with ones indicating tokens to attend to, zeros for tokens to ignore. + input_shape (`tuple[int]`): + The shape of the input to the model. + is_decoder (`bool`): + Whether the model is used as a decoder. 
+ + Returns: + `tf.Tensor` The extended attention mask, with the same dtype as `attention_mask.dtype`. + """ + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + if not isinstance(attention_mask, tf.Tensor): + attention_mask = tf.convert_to_tensor(attention_mask) # Catches NumPy inputs that haven't been cast yet + if attention_mask.shape.rank == 3: + extended_attention_mask = attention_mask[:, None, :, :] + elif attention_mask.shape.rank == 2: + # Provided a padding mask of dimensions [batch_size, seq_length] + # - if the model is a decoder, apply a causal mask in addition to the padding mask + # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] + if is_decoder: + batch_size, seq_length = input_shape + + seq_ids = tf.range(seq_length, dtype=attention_mask.dtype) + causal_mask = tf.broadcast_to(seq_ids, (batch_size, seq_length, seq_length)) <= seq_ids[None, :, None] + # in case past_key_values are used we need to add a prefix ones mask to the causal mask + + if shape_list(causal_mask)[1] < shape_list(attention_mask)[1]: + prefix_seq_len = tf.shape(attention_mask)[1] - tf.shape(causal_mask)[1] + causal_mask = tf.concat( + [ + tf.ones((batch_size, seq_length, prefix_seq_len), dtype=causal_mask.dtype), + causal_mask, + ], + axis=-1, + ) + extended_attention_mask = ( + tf.cast(causal_mask[:, None, :, :], attention_mask.dtype) * attention_mask[:, None, None, :] + ) + else: + extended_attention_mask = attention_mask[:, None, None, :] + else: + raise ValueError( + f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})" + ) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + extended_attention_mask = tf.cast(extended_attention_mask, self.dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + return extended_attention_mask + + @add_start_docstrings_to_model_forward(BLIP_TEXT_INPUTS_DOCSTRING) + @unpack_inputs + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: tf.Tensor | None = None, + position_ids: tf.Tensor | None = None, + head_mask: tf.Tensor | None = None, + inputs_embeds: tf.Tensor | None = None, + encoder_embeds: tf.Tensor | None = None, + encoder_hidden_states: tf.Tensor | None = None, + encoder_attention_mask: tf.Tensor | None = None, + past_key_values: tuple[tuple[tf.Tensor]] | None = None, + use_cache: bool | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + is_decoder: bool = False, + training: bool = False, + ) -> tuple[tf.Tensor] | TFBaseModelOutputWithPoolingAndCrossAttentions: + r""" + encoder_hidden_states (`tf.Tensor`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`tf.Tensor`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. 
Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (`tuple(tuple(tf.Tensor))`, *optional*): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = shape_list(input_ids) + batch_size, seq_length = input_shape + elif inputs_embeds is not None: + input_shape = shape_list(inputs_embeds)[:-1] + batch_size, seq_length = input_shape + elif encoder_embeds is not None: + input_shape = shape_list(encoder_embeds)[:-1] + batch_size, seq_length = input_shape + else: + raise ValueError("You have to specify either input_ids or inputs_embeds or encoder_embeds") + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + if attention_mask is None: + attention_mask = tf.ones((batch_size, seq_length + past_key_values_length)) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
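+        # For example, a 2D padding mask [[1, 1, 0]] becomes an additive mask of shape [batch_size, 1, 1, seq_len]
+        # (or [batch_size, 1, seq_len, seq_len] with the causal mask when is_decoder=True), holding 0.0 at
+        # positions to attend to and -10000.0 at masked positions.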
+ extended_attention_mask: tf.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, is_decoder) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if encoder_hidden_states is not None: + if isinstance(encoder_hidden_states, list): + encoder_batch_size, encoder_sequence_length, _ = shape_list(encoder_hidden_states[0]) + else: + encoder_batch_size, encoder_sequence_length, _ = shape_list(encoder_hidden_states) + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + + if isinstance(encoder_attention_mask, list): + encoder_extended_attention_mask = [invert_attention_mask(mask) for mask in encoder_attention_mask] + elif encoder_attention_mask is None: + encoder_attention_mask = tf.ones(encoder_hidden_shape) + encoder_extended_attention_mask = invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + if encoder_embeds is None: + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + else: + embedding_output = encoder_embeds + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return TFBaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embeddings", None) is not None: + with tf.name_scope(self.embeddings.name): + self.embeddings.build(None) + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "pooler", None) is not None: + with tf.name_scope(self.pooler.name): + self.pooler.build(None) + + +# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L811 +class TFBlipTextLMHeadModel(TFBlipTextPreTrainedModel): + _keys_to_ignore_on_load_unexpected = [r"pooler"] + _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] + + def __init__(self, config, **kwargs): + super().__init__(config, **kwargs) + + self.bert = TFBlipTextModel(config, add_pooling_layer=False, name="bert") + self.cls = 
TFBlipTextOnlyMLMHead(config, name="cls") + self.label_smoothing = config.label_smoothing + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward(BLIP_TEXT_INPUTS_DOCSTRING) + @unpack_inputs + def call( + self, + input_ids=None, + attention_mask=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + return_logits=False, + is_decoder=True, + training=None, + ): + r""" + encoder_hidden_states (`tf.Tensor`, *optional*): Sequence of + hidden-states at the output of the last layer of the encoder. Used in the cross-attention if the model is + configured as a decoder. + encoder_attention_mask (`tf.Tensor`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + labels (`tf.Tensor`, *optional*): + Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in + `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are + ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + past_key_values (`tuple(tuple(tf.Tensor))`, *optional*): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`).
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + is_decoder=is_decoder, + training=training, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + if return_logits: + return prediction_scores[:, :-1, :] + + lm_loss = None + if labels is not None: + # we are doing next-token prediction; shift prediction scores and input ids by one + shifted_prediction_scores = prediction_scores[:, :-1, :] + shifted_prediction_scores = tf.reshape(shifted_prediction_scores, (-1, self.config.vocab_size)) + labels = labels[:, 1:] + labels = tf.reshape(labels, (-1,)) + # Keras won't give us label smoothing for sparse CE, so we de-sparsify things here + # Use relu to clamp masked labels at 0 to avoid NaN (we will be zeroing those out later anyway) + one_hot_labels = tf.one_hot(tf.nn.relu(labels), depth=self.config.vocab_size, dtype=tf.float32) + loss_fct = keras.losses.CategoricalCrossentropy( + from_logits=True, label_smoothing=self.label_smoothing, reduction="none" + ) + masked_positions = tf.cast(tf.not_equal(labels, -100), dtype=tf.float32) + lm_loss = loss_fct(one_hot_labels, shifted_prediction_scores) + lm_loss *= masked_positions + lm_loss = tf.reduce_sum(lm_loss, axis=0) / tf.math.count_nonzero(masked_positions, dtype=tf.float32) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((lm_loss,) + output) if lm_loss is not None else output + + return TFCausalLMOutputWithCrossAttentions( + loss=lm_loss, + logits=prediction_scores, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs): + input_shape = input_ids.shape + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = input_ids.new_ones(input_shape) + + # cut decoder_input_ids if past_key_values is used + if past_key_values is not None: + input_ids = input_ids[:, -1:] + + return { + "input_ids": input_ids, + "attention_mask": attention_mask, + "past_key_values": past_key_values, + "encoder_hidden_states": model_kwargs.get("encoder_hidden_states"), + "encoder_attention_mask": model_kwargs.get("encoder_attention_mask"), + "is_decoder": True, + } + + def _reorder_cache(self, past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),) + return reordered_past + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "bert", None) is not None: + with tf.name_scope(self.bert.name): + self.bert.build(None) + if getattr(self, "cls", None) is not None: + with tf.name_scope(self.cls.name): + self.cls.build(None) + + +__all__ = ["TFBlipTextLMHeadModel", "TFBlipTextModel", "TFBlipTextPreTrainedModel"] diff --git 
a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/blip/processing_blip.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/blip/processing_blip.py new file mode 100644 index 0000000000000000000000000000000000000000..5cc4334a974cb8e47f1b94b180e1f5cf8f553513 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/blip/processing_blip.py @@ -0,0 +1,125 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Processor class for Blip. +""" + +from typing import Optional, Union + +from ...image_utils import ImageInput +from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack +from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput + + +class BlipProcessorKwargs(ProcessingKwargs, total=False): + _defaults = { + "text_kwargs": { + "add_special_tokens": True, + "padding": False, + "stride": 0, + "return_overflowing_tokens": False, + "return_special_tokens_mask": False, + "return_offsets_mapping": False, + "return_token_type_ids": False, + "return_length": False, + "verbose": True, + }, + "images_kwargs": {}, + } + + +class BlipProcessor(ProcessorMixin): + r""" + Constructs a BLIP processor which wraps a BERT tokenizer and BLIP image processor into a single processor. + + [`BlipProcessor`] offers all the functionalities of [`BlipImageProcessor`] and [`BertTokenizerFast`]. See the + docstring of [`~BlipProcessor.__call__`] and [`~BlipProcessor.decode`] for more information. + + Args: + image_processor (`BlipImageProcessor`): + An instance of [`BlipImageProcessor`]. The image processor is a required input. + tokenizer (`BertTokenizerFast`): + An instance of ['BertTokenizerFast`]. The tokenizer is a required input. + """ + + attributes = ["image_processor", "tokenizer"] + image_processor_class = ("BlipImageProcessor", "BlipImageProcessorFast") + tokenizer_class = ("BertTokenizer", "BertTokenizerFast") + + def __init__(self, image_processor, tokenizer, **kwargs): + tokenizer.return_token_type_ids = False + super().__init__(image_processor, tokenizer) + self.current_processor = self.image_processor + + def __call__( + self, + images: Optional[ImageInput] = None, + text: Optional[Union[str, list[str], TextInput, PreTokenizedInput]] = None, + audio=None, + videos=None, + **kwargs: Unpack[BlipProcessorKwargs], + ) -> BatchEncoding: + """ + This method uses [`BlipImageProcessor.__call__`] method to prepare image(s) for the model, and + [`BertTokenizerFast.__call__`] to prepare text for the model. + + Please refer to the docstring of the above two methods for more information. + Args: + images (`ImageInput`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. Both channels-first and channels-last formats are supported. + text (`TextInput`, `PreTokenizedInput`, `list[TextInput]`, `list[PreTokenizedInput]`): + The sequence or batch of sequences to be encoded. 
Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + """ + if images is None and text is None: + raise ValueError("You have to specify either images or text.") + + text_encoding = None + + # add pixel_values encoding. If we also have text_encoding, update image encoding and return it. + # else, return the text encoding. + output_kwargs = self._merge_kwargs( + BlipProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + if text is not None: + text_encoding = self.tokenizer(text, **output_kwargs["text_kwargs"]) + if images is not None: + encoding_image_processor = self.image_processor(images, **output_kwargs["images_kwargs"]) + + if text_encoding is not None: + encoding_image_processor.update(text_encoding) + return encoding_image_processor + + return text_encoding + + @property + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + tokenizer_input_names = [name for name in tokenizer_input_names if name != "token_type_ids"] + return tokenizer_input_names + image_processor_input_names + + +__all__ = ["BlipProcessor"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bloom/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bloom/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..72d1d6e6ca4724235ce46978e5c11e84583d4033 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bloom/__init__.py @@ -0,0 +1,29 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
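+# This package __init__ only wires up lazy imports: the concrete Bloom classes live in
+# configuration_bloom, modeling_bloom, modeling_flax_bloom and tokenization_bloom_fast,
+# and are loaded on first attribute access via _LazyModule (see below).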
+from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_bloom import * + from .modeling_bloom import * + from .modeling_flax_bloom import * + from .tokenization_bloom_fast import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bloom/configuration_bloom.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bloom/configuration_bloom.py new file mode 100644 index 0000000000000000000000000000000000000000..74748c11304111955f5a5ef038a13256d02d1837 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bloom/configuration_bloom.py @@ -0,0 +1,238 @@ +# coding=utf-8 +# Copyright 2022 the Big Science Workshop and HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Bloom configuration""" + +from collections import OrderedDict +from collections.abc import Mapping +from typing import TYPE_CHECKING, Any, Optional + +from packaging import version + + +if TYPE_CHECKING: + from ... import PreTrainedTokenizer, TensorType + +from ...configuration_utils import PretrainedConfig +from ...onnx import OnnxConfigWithPast, PatchingSpec +from ...utils import is_torch_available, logging + + +logger = logging.get_logger(__name__) + + +class BloomConfig(PretrainedConfig): + """ + This is the configuration class to store the configuration of a [`BloomModel`]. It is used to instantiate a Bloom + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to the Bloom architecture + [bigscience/bloom](https://huggingface.co/bigscience/bloom). + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 250880): + Vocabulary size of the Bloom model. Defines the maximum number of different tokens that can be represented + by the `inputs_ids` passed when calling [`BloomModel`]. Check [this + discussion](https://huggingface.co/bigscience/bloom/discussions/120#633d28389addb8530b406c2a) on how the + `vocab_size` has been defined. + hidden_size (`int`, *optional*, defaults to 64): + Dimensionality of the embeddings and hidden states. + n_layer (`int`, *optional*, defaults to 2): + Number of hidden layers in the Transformer encoder. + n_head (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer encoder. + layer_norm_epsilon (`float`, *optional*, defaults to 1e-5): + The epsilon to use in the layer normalization layers. 
+ initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + apply_residual_connection_post_layernorm (`bool`, *optional*, defaults to `False`): + If enabled, use the layer norm of the hidden states as the residual in the transformer blocks + hidden_dropout (`float`, *optional*, defaults to 0.1): + Dropout rate of the dropout function on the bias dropout. + attention_dropout (`float`, *optional*, defaults to 0.1): + Dropout rate applied to the attention probs + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). + pretraining_tp (`int`, *optional*, defaults to `1`): + Experimental feature. Tensor parallelism rank used during pretraining with Megatron. Please refer to [this + document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is + necessary to ensure exact reproducibility of the pretraining results. Please refer to [this + issue](https://github.com/pytorch/pytorch/issues/76232). Note also that this is enabled only when + `slow_but_exact=True`. + slow_but_exact (`bool`, *optional*, defaults to `False`): + Experimental feature. Whether to use slow but exact implementation of the attention mechanism. While + merging the TP rank tensors, due to slicing operations the results may be slightly different between the + model trained on Megatron and our model. Please refer to [this + issue](https://github.com/pytorch/pytorch/issues/76232). A solution to obtain more accurate results is to + enable this feature. Enabling this will hurt the computational time of the inference. Will be probably + resolved in the future once the main model has been fine-tuned with TP_rank=1. 
+ + Example: + + ```python + >>> from transformers import BloomConfig, BloomModel + + >>> # Initializing a Bloom configuration + >>> configuration = BloomConfig() + + >>> # Initializing a model (with random weights) from the configuration + >>> model = BloomModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "bloom" + keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = { + "num_hidden_layers": "n_layer", + "num_attention_heads": "n_head", + } + + def __init__( + self, + vocab_size=250880, + hidden_size=64, + n_layer=2, + n_head=8, + layer_norm_epsilon=1e-5, + initializer_range=0.02, + use_cache=True, + bos_token_id=1, + eos_token_id=2, + apply_residual_connection_post_layernorm=False, + hidden_dropout=0.0, + attention_dropout=0.0, + pretraining_tp=1, # TP rank used when training with megatron + slow_but_exact=False, + **kwargs, + ): + self.vocab_size = vocab_size + # Backward compatibility with n_embed kwarg + n_embed = kwargs.pop("n_embed", None) + self.hidden_size = hidden_size if n_embed is None else n_embed + self.n_layer = n_layer + self.n_head = n_head + self.layer_norm_epsilon = layer_norm_epsilon + self.initializer_range = initializer_range + self.use_cache = use_cache + self.pretraining_tp = pretraining_tp + self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm + self.hidden_dropout = hidden_dropout + self.attention_dropout = attention_dropout + + self.bos_token_id = bos_token_id + self.eos_token_id = eos_token_id + self.slow_but_exact = slow_but_exact + + super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + + +class BloomOnnxConfig(OnnxConfigWithPast): + torch_onnx_minimum_version = version.parse("1.12") + + def __init__( + self, + config: PretrainedConfig, + task: str = "default", + patching_specs: Optional[list[PatchingSpec]] = None, + use_past: bool = False, + ): + super().__init__(config, task=task, patching_specs=patching_specs, use_past=use_past) + if not getattr(self._config, "pad_token_id", None): + # TODO: how to do that better? + self._config.pad_token_id = 0 + + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + common_inputs = OrderedDict({"input_ids": {0: "batch", 1: "sequence"}}) + if self.use_past: + # BLOOM stores values on dynamic axis 2. 
For more details see: https://github.com/huggingface/transformers/pull/18344 + self.fill_with_past_key_values_(common_inputs, direction="inputs", inverted_values_shape=True) + common_inputs["attention_mask"] = {0: "batch", 1: "past_sequence + sequence"} + else: + common_inputs["attention_mask"] = {0: "batch", 1: "sequence"} + + return common_inputs + + @property + def num_layers(self) -> int: + return self._config.n_layer + + @property + def num_attention_heads(self) -> int: + return self._config.n_head + + @property + def atol_for_validation(self) -> float: + return 1e-3 + + def generate_dummy_inputs( + self, + tokenizer: "PreTrainedTokenizer", + batch_size: int = -1, + seq_length: int = -1, + is_pair: bool = False, + framework: Optional["TensorType"] = None, + ) -> Mapping[str, Any]: + common_inputs = super(OnnxConfigWithPast, self).generate_dummy_inputs( + tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework + ) + + # We need to order the input in the way they appears in the forward() + ordered_inputs = OrderedDict({"input_ids": common_inputs["input_ids"]}) + + # Need to add the past_keys + if self.use_past: + if not is_torch_available(): + raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.") + else: + import torch + + batch, seqlen = common_inputs["input_ids"].shape + # Not using the same length for past_key_values + past_key_values_length = seqlen + 2 + head_dim = self._config.hidden_size // self.num_attention_heads + past_key_shape = ( + batch * self.num_attention_heads, + head_dim, + past_key_values_length, + ) + past_value_shape = ( + batch * self.num_attention_heads, + past_key_values_length, + head_dim, + ) + ordered_inputs["past_key_values"] = [ + (torch.zeros(past_key_shape), torch.zeros(past_value_shape)) for _ in range(self.num_layers) + ] + + ordered_inputs["attention_mask"] = common_inputs["attention_mask"] + if self.use_past: + mask_dtype = ordered_inputs["attention_mask"].dtype + ordered_inputs["attention_mask"] = torch.cat( + [ordered_inputs["attention_mask"], torch.ones(batch, past_key_values_length, dtype=mask_dtype)], dim=1 + ) + + return ordered_inputs + + @property + def default_onnx_opset(self) -> int: + return 13 + + +__all__ = ["BloomConfig", "BloomOnnxConfig"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bloom/modeling_bloom.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bloom/modeling_bloom.py new file mode 100644 index 0000000000000000000000000000000000000000..6fde63e03b4dd33a8625a73a1a74d96780aebc86 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bloom/modeling_bloom.py @@ -0,0 +1,1252 @@ +# coding=utf-8 +# Copyright 2022 HuggingFace Inc. team and BigScience workshop. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
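+# Overview: this file defines the BLOOM building blocks (ALiBi position biases via
+# build_alibi_tensor, fused-QKV BloomAttention, BloomGelu/BloomMLP, BloomBlock) and the
+# task heads built on BloomModel (causal LM, sequence/token classification, question answering).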
+"""PyTorch BLOOM model.""" + +import math +import warnings +from typing import Optional, Union + +import torch +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, LayerNorm, MSELoss +from torch.nn import functional as F + +from ...cache_utils import Cache, DynamicCache, StaticCache +from ...generation import GenerationMixin +from ...modeling_attn_mask_utils import AttentionMaskConverter +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + QuestionAnsweringModelOutput, + SequenceClassifierOutputWithPast, + TokenClassifierOutput, +) +from ...modeling_utils import PreTrainedModel +from ...utils import ( + auto_docstring, + is_torch_flex_attn_available, + logging, +) +from .configuration_bloom import BloomConfig + + +if is_torch_flex_attn_available(): + from torch.nn.attention.flex_attention import BlockMask + + from ...integrations.flex_attention import make_flex_block_causal_mask + + +logger = logging.get_logger(__name__) + + +def build_alibi_tensor(attention_mask: torch.Tensor, num_heads: int, dtype: torch.dtype) -> torch.Tensor: + """ + Link to paper: https://huggingface.co/papers/2108.12409 Alibi tensor is not causal as the original paper mentions, it + relies on a translation invariance of softmax for quick implementation: with l being a tensor, and a fixed value + `softmax(l+a) = softmax(l)`. Based on + https://github.com/ofirpress/attention_with_linear_biases/blob/a35aaca144e0eb6b789dfcb46784c4b8e31b7983/fairseq/models/transformer.py#L742 + TODO @thomasw21 this doesn't work as nicely due to the masking strategy, and so masking varies slightly. + + Args: + Returns tensor shaped (batch_size * num_heads, 1, max_seq_len) + attention_mask (`torch.Tensor`): + Token-wise attention mask, this should be of shape (batch_size, max_seq_len). 
+ num_heads (`int`): + number of heads + dtype (`torch.dtype`, *optional*, default=`torch.bfloat16`): + dtype of the output tensor + """ + batch_size, seq_length = attention_mask.shape + closest_power_of_2 = 2 ** math.floor(math.log2(num_heads)) + base = torch.tensor( + 2 ** (-(2 ** -(math.log2(closest_power_of_2) - 3))), device=attention_mask.device, dtype=torch.float32 + ) + powers = torch.arange(1, 1 + closest_power_of_2, device=attention_mask.device, dtype=torch.int32) + slopes = torch.pow(base, powers) + + if closest_power_of_2 != num_heads: + extra_base = torch.tensor( + 2 ** (-(2 ** -(math.log2(2 * closest_power_of_2) - 3))), device=attention_mask.device, dtype=torch.float32 + ) + num_remaining_heads = min(closest_power_of_2, num_heads - closest_power_of_2) + extra_powers = torch.arange(1, 1 + 2 * num_remaining_heads, 2, device=attention_mask.device, dtype=torch.int32) + slopes = torch.cat([slopes, torch.pow(extra_base, extra_powers)], dim=0) + + # Note: alibi will added to the attention bias that will be applied to the query, key product of attention + # => therefore alibi will have to be of shape (batch_size, num_heads, query_length, key_length) + # => here we set (batch_size=1, num_heads=num_heads, query_length=1, key_length=max_length) + # => the query_length dimension will then be broadcasted correctly + # This is more or less identical to T5's relative position bias: + # https://github.com/huggingface/transformers/blob/f681437203baa7671de3174b0fa583c349d9d5e1/src/transformers/models/t5/modeling_t5.py#L527 + arange_tensor = ((attention_mask.cumsum(dim=-1) - 1) * attention_mask)[:, None, :] + alibi = slopes[..., None] * arange_tensor + return alibi.reshape(batch_size * num_heads, 1, seq_length).to(dtype) + + +def dropout_add(x: torch.Tensor, residual: torch.Tensor, prob: float, training: bool) -> torch.Tensor: + """ + Dropout add function + + Args: + x (`torch.tensor`): + input tensor + residual (`torch.tensor`): + residual tensor + prob (`float`): + dropout probability + training (`bool`): + training mode + """ + out = F.dropout(x, p=prob, training=training) + out = residual + out + return out + + +def bloom_gelu_forward(x: torch.Tensor) -> torch.Tensor: + """ + Custom bias GELU function. Adapted from Megatron-DeepSpeed code. Here we use a simple implementation (inference) to + make the model jitable. + + Args: + x (`torch.tensor`): + input hidden states + """ + return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))) + + +def bloom_gelu_back(g: torch.Tensor, x: torch.Tensor) -> torch.Tensor: + """ + gradient of tanh approximation of gelu gradient of actual gelu is: 0.5 * (1. 
+ torch.erf(x * 0.70710678)) + + 0.3989423 * x * torch.exp(-0.5 * x * x) + + Args: + g (`torch.tensor`): + gradient output tensor + x (`torch.tensor`): + input tensor + """ + x = x[0] # x is a tuple of 1 element, needs to unpack it first + tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)) + # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243 + ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out) + return ff * g + + +class GeLUFunction(torch.autograd.Function): + @staticmethod + def forward(ctx, input: torch.Tensor) -> torch.Tensor: + ctx.save_for_backward(input) + return bloom_gelu_forward(input) + + @staticmethod + def backward(ctx, grad_output: torch.Tensor) -> torch.Tensor: + input = ctx.saved_tensors + tmp = bloom_gelu_back(grad_output, input) + return tmp + + +class BloomGelu(nn.Module): + """ + BloomBiasGelu wrapper function that make use of the simple function on inference mode to make the model + torchscriptable and use the autograd function in training mode to get the accurate results of the gradients Partly + copied from Megatron-DeepSpeed code and adapted for our needs + + See here why autograd functions are not torchscriptable: https://github.com/pytorch/pytorch/issues/22329 + """ + + def __init__(self): + super().__init__() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if self.training: + return GeLUFunction.apply(x) + else: + return bloom_gelu_forward(x) + + +class BloomAttention(nn.Module): + def __init__(self, config: BloomConfig, layer_idx: Optional[int] = None): + super().__init__() + + self.pretraining_tp = config.pretraining_tp + self.slow_but_exact = config.slow_but_exact + + self.hidden_size = config.hidden_size + self.num_heads = config.n_head + self.head_dim = self.hidden_size // self.num_heads + self.split_size = self.hidden_size + self.hidden_dropout = config.hidden_dropout + + if self.head_dim * self.num_heads != self.hidden_size: + raise ValueError( + f"`hidden_size` must be divisible by num_heads (got `hidden_size`: {self.hidden_size} and `num_heads`:" + f" {self.num_heads})." + ) + + # Layer-wise attention scaling + self.inv_norm_factor = 1.0 / math.sqrt(self.head_dim) + self.beta = 1.0 + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " + "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." 
+ ) + + self.query_key_value = nn.Linear(self.hidden_size, 3 * self.hidden_size, bias=True) + self.dense = nn.Linear(self.hidden_size, self.hidden_size) + self.attention_dropout = nn.Dropout(config.attention_dropout) + + def _reshape(self, fused_qkv: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Split the last dimension into (num_heads, head_dim) and reshapes to (bs, heads, len, dim) shape + without making any copies, results share same memory storage as `fused_qkv` + + Args: + fused_qkv (`torch.tensor`): [batch_size, seq_length, num_heads * 3 * head_dim] + + Returns: + query: [batch_size, num_heads, seq_length, head_dim] + key: [batch_size, num_heads, seq_length, head_dim] + value: [batch_size, num_heads, seq_length, head_dim] + """ + batch_size, seq_length, three_times_hidden_size = fused_qkv.shape + fused_qkv = fused_qkv.view(batch_size, seq_length, self.num_heads, 3, self.head_dim) + query_layer = fused_qkv[..., 0, :].transpose(1, 2) + key_layer = fused_qkv[..., 1, :].transpose(1, 2) + value_layer = fused_qkv[..., 2, :].transpose(1, 2) + return query_layer, key_layer, value_layer + + def _merge_heads(self, x: torch.Tensor) -> torch.Tensor: + """ + Merge heads together over the last dimension + + Args: + x (`torch.tensor`): [batch_size * num_heads, seq_length, head_dim] + + Returns: + torch.tensor: [batch_size, seq_length, num_heads * head_dim] + """ + # What we want to achieve is: + # batch_size * num_heads, seq_length, head_dim -> batch_size, seq_length, num_heads * head_dim + batch_size_and_num_heads, seq_length, _ = x.shape + batch_size = batch_size_and_num_heads // self.num_heads + + # First view to decompose the batch size + # batch_size * num_heads, seq_length, head_dim -> batch_size, num_heads, seq_length, head_dim + x = x.view(batch_size, self.num_heads, seq_length, self.head_dim) + + # batch_size, num_heads, seq_length, head_dim -> batch_size, seq_length, num_heads, head_dim + x = x.permute(0, 2, 1, 3) + + # batch_size, seq_length, num_heads, head_dim -> batch_size, seq_length, num_heads * head_dim + return x.reshape(batch_size, seq_length, self.num_heads * self.head_dim) + + def forward( + self, + hidden_states: torch.Tensor, + residual: torch.Tensor, + alibi: torch.Tensor, + attention_mask: torch.Tensor, + layer_past: Optional[Cache] = None, + head_mask: Optional[torch.Tensor] = None, + use_cache: bool = False, + output_attentions: bool = False, + cache_position: Optional[torch.LongTensor] = None, + ): + batch_size, q_length, _ = hidden_states.shape + fused_qkv = self.query_key_value(hidden_states) # [batch_size, seq_length, 3 x hidden_size] + # 3 x [batch_size, num_heads, seq_length, head_dim] + query_layer, key_layer, value_layer = self._reshape(fused_qkv) + + if layer_past is not None: + cache_kwargs = {"cache_position": cache_position} + key_layer, value_layer = layer_past.update(key_layer, value_layer, self.layer_idx, cache_kwargs) + + # reshape qkv for further computations + query_layer = query_layer.reshape(batch_size * self.num_heads, -1, self.head_dim) + key_layer = key_layer.reshape(batch_size * self.num_heads, -1, self.head_dim).transpose(-1, -2) + value_layer = value_layer.reshape(batch_size * self.num_heads, -1, self.head_dim) + + # [batch_size * num_heads, q_length, kv_length] + attention_scores = alibi.baddbmm( + batch1=query_layer, + batch2=key_layer, + beta=self.beta, + alpha=self.inv_norm_factor, + ) + + # change view to [batch_size, num_heads, q_length, kv_length] + attn_weights = attention_scores.view(batch_size, 
self.num_heads, q_length, -1) + if attention_mask is not None: # no matter the length, we just slice it + causal_mask = attention_mask[:, :, :, : key_layer.shape[-1]] + attn_weights = attn_weights + causal_mask + + # cast attention scores to fp32, compute scaled softmax and cast back to initial dtype + attention_probs = F.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_layer.dtype) + + # [batch_size, num_heads, q_length, kv_length] + attention_probs = self.attention_dropout(attention_probs) + + if head_mask is not None: + attention_probs = attention_probs * head_mask + + # change view [batch_size x num_heads, q_length, kv_length] + attention_probs_reshaped = attention_probs.view(batch_size * self.num_heads, q_length, -1) + + # matmul: [batch_size * num_heads, q_length, head_dim] + context_layer = torch.bmm(attention_probs_reshaped, value_layer) + + # change view [batch_size, q_length, num_heads * head_dim] + context_layer = self._merge_heads(context_layer) + + # aggregate results across tp ranks. See here: https://github.com/pytorch/pytorch/issues/76232 + if self.pretraining_tp > 1 and self.slow_but_exact: + slices = self.hidden_size / self.pretraining_tp + output_tensor = torch.zeros_like(context_layer) + for i in range(self.pretraining_tp): + output_tensor = output_tensor + F.linear( + context_layer[:, :, int(i * slices) : int((i + 1) * slices)], + self.dense.weight[:, int(i * slices) : int((i + 1) * slices)], + ) + else: + output_tensor = self.dense(context_layer) + + output_tensor = dropout_add(output_tensor, residual, self.hidden_dropout, self.training) + return output_tensor, attention_probs + + +class BloomMLP(nn.Module): + def __init__(self, config: BloomConfig): + super().__init__() + hidden_size = config.hidden_size + + self.pretraining_tp = config.pretraining_tp + self.slow_but_exact = config.slow_but_exact + self.dense_h_to_4h = nn.Linear(hidden_size, 4 * hidden_size) + self.gelu_impl = BloomGelu() + self.dense_4h_to_h = nn.Linear(4 * hidden_size, hidden_size) + self.hidden_dropout = config.hidden_dropout + + def forward(self, hidden_states: torch.Tensor, residual: torch.Tensor) -> torch.Tensor: + hidden_states = self.gelu_impl(self.dense_h_to_4h(hidden_states)) + + if self.pretraining_tp > 1 and self.slow_but_exact: + intermediate_output = torch.zeros_like(residual) + slices = self.dense_4h_to_h.weight.shape[-1] / self.pretraining_tp + for i in range(self.pretraining_tp): + intermediate_output = intermediate_output + F.linear( + hidden_states[:, :, int(i * slices) : int((i + 1) * slices)], + self.dense_4h_to_h.weight[:, int(i * slices) : int((i + 1) * slices)], + ) + else: + intermediate_output = self.dense_4h_to_h(hidden_states) + + output = dropout_add(intermediate_output, residual, self.hidden_dropout, self.training) + + return output + + +class BloomBlock(GradientCheckpointingLayer): + def __init__(self, config: BloomConfig, layer_idx: Optional[int] = None): + super().__init__() + hidden_size = config.hidden_size + + self.input_layernorm = LayerNorm(hidden_size, eps=config.layer_norm_epsilon) + self.num_heads = config.n_head + self.self_attention = BloomAttention(config, layer_idx) + self.post_attention_layernorm = LayerNorm(hidden_size, eps=config.layer_norm_epsilon) + + self.mlp = BloomMLP(config) + + self.apply_residual_connection_post_layernorm = config.apply_residual_connection_post_layernorm + self.hidden_dropout = config.hidden_dropout + + def forward( + self, + hidden_states: torch.Tensor, + alibi: torch.Tensor, + attention_mask: torch.Tensor, + 
layer_past: Optional[Cache] = None, + head_mask: Optional[torch.Tensor] = None, + use_cache: bool = False, + output_attentions: bool = False, + cache_position: Optional[torch.LongTensor] = None, + ): + # hidden_states: [batch_size, seq_length, hidden_size] + + # Layer norm at the beginning of the transformer layer. + layernorm_output = self.input_layernorm(hidden_states) + + # Layer norm post the self attention. + if self.apply_residual_connection_post_layernorm: + residual = layernorm_output + else: + residual = hidden_states + + # Self attention. + attention_output, attn_weights = self.self_attention( + layernorm_output, + residual, + layer_past=layer_past, + attention_mask=attention_mask, + alibi=alibi, + head_mask=head_mask, + use_cache=use_cache, + output_attentions=output_attentions, + cache_position=cache_position, + ) + + layernorm_output = self.post_attention_layernorm(attention_output) + + # Get residual + if self.apply_residual_connection_post_layernorm: + residual = layernorm_output + else: + residual = attention_output + + # MLP. + output = self.mlp(layernorm_output, residual) + + return output, attn_weights # hidden_states, attentions + + +@auto_docstring +class BloomPreTrainedModel(PreTrainedModel): + config: BloomConfig + base_model_prefix = "transformer" + supports_gradient_checkpointing = True + _no_split_modules = ["BloomBlock"] + _skip_keys_device_placement = "past_key_values" + + _can_compile_fullgraph = True + + def __init__(self, *inputs, **kwargs): + super().__init__(*inputs, **kwargs) + + def _init_weights(self, module: nn.Module): + """Initialize the weights.""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +@auto_docstring +class BloomModel(BloomPreTrainedModel): + def __init__(self, config: BloomConfig): + super().__init__(config) + + self.embed_dim = config.hidden_size + self.num_heads = config.n_head + + # Embedding + LN Embedding + self.word_embeddings = nn.Embedding(config.vocab_size, self.embed_dim) + self.word_embeddings_layernorm = LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon) + + # Transformer blocks + self.h = nn.ModuleList([BloomBlock(config, layer_idx=i) for i in range(config.num_hidden_layers)]) + + # Final Layer Norm + self.ln_f = LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon) + + self.gradient_checkpointing = False + + # Initialize weights and apply final processing + self.post_init() + + def build_alibi_tensor(self, attention_mask: torch.Tensor, num_heads: int, dtype: torch.dtype) -> torch.Tensor: + return build_alibi_tensor(attention_mask, num_heads, dtype) + + def get_input_embeddings(self): + return self.word_embeddings + + def set_input_embeddings(self, new_embeddings: torch.Tensor): + self.word_embeddings = new_embeddings + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Union[Cache, tuple[tuple[torch.Tensor, torch.Tensor], ...]]] = None, + attention_mask: Optional[torch.Tensor] = None, + head_mask: 
Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **deprecated_arguments, + ) -> Union[tuple[torch.Tensor, ...], BaseModelOutputWithPastAndCrossAttentions]: + r""" + input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`): + `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values.get_seq_length()` + (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary. + + If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as + `input_ids`. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + """ + if deprecated_arguments.pop("position_ids", False) is not False: + # `position_ids` could have been `torch.Tensor` or `None` so defaulting pop to `False` allows to detect if users were passing explicitly `None` + warnings.warn( + "`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. You can safely ignore" + " passing `position_ids`.", + FutureWarning, + ) + if len(deprecated_arguments) > 0: + raise ValueError(f"Got unexpected arguments: {deprecated_arguments}") + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if self.gradient_checkpointing and self.training and use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + + if use_cache and past_key_values is None: + past_key_values = DynamicCache(config=self.config) + + batch_size, seq_length, _ = inputs_embeds.shape + past_length = past_key_values.get_seq_length() if past_key_values is not None else 0 + seq_length_with_past = seq_length + past_length + if cache_position is None: + cache_position = torch.arange(past_length, past_length + seq_length, device=inputs_embeds.device) + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape batch_size x num_heads x N x N + # head_mask has shape n_layer x batch x num_heads x N x N + head_mask = self.get_head_mask(head_mask, self.config.n_layer) + hidden_states = self.word_embeddings_layernorm(inputs_embeds) + + all_self_attentions = () if output_attentions else None + all_hidden_states = () if output_hidden_states else None + + # Compute alibi tensor: check build_alibi_tensor documentation + if attention_mask is None: + attention_mask = torch.ones((batch_size, seq_length_with_past), device=hidden_states.device) + else: + attention_mask = attention_mask.to(hidden_states.device) + + alibi = self.build_alibi_tensor(attention_mask, self.num_heads, dtype=hidden_states.dtype) + causal_mask = self._update_causal_mask( + attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions + ) + + for i, block in enumerate(self.h): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + outputs = block( + hidden_states, + layer_past=past_key_values, + attention_mask=causal_mask, + head_mask=head_mask[i], + use_cache=use_cache, + output_attentions=output_attentions, + alibi=alibi, + cache_position=cache_position, + ) + + hidden_states = outputs[0] + if output_attentions: + all_self_attentions = all_self_attentions + (outputs[1],) + + # Add last hidden state + hidden_states = self.ln_f(hidden_states) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v for v in [hidden_states, past_key_values, all_hidden_states, all_self_attentions] if v is not None + ) + + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=past_key_values, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + # Copied from transformers.models.gptj.modeling_gptj.GPTJModel._update_causal_mask + def _update_causal_mask( + self, + attention_mask: Union[torch.Tensor, "BlockMask"], + input_tensor: torch.Tensor, + cache_position: torch.Tensor, + past_key_values: Cache, + output_attentions: bool = False, + ): + if self.config._attn_implementation == "flash_attention_2": + if attention_mask is not None and (attention_mask == 0.0).any(): + return attention_mask + return None + if self.config._attn_implementation == "flex_attention": + if isinstance(attention_mask, torch.Tensor): + attention_mask = make_flex_block_causal_mask(attention_mask) + return attention_mask + + # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in + # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail + # to infer the attention mask. 
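+ # First work out how many tokens are already in the cache and whether the cache has a
+ # fixed, compile-friendly shape; this decides both whether an explicit mask is needed at
+ # all (SDPA fast path) and how long the key/value axis of the 4D causal mask must be.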
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + using_compilable_cache = past_key_values.is_compileable if past_key_values is not None else False + + # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward + if self.config._attn_implementation == "sdpa" and not using_compilable_cache and not output_attentions: + if AttentionMaskConverter._ignore_causal_mask_sdpa( + attention_mask, + inputs_embeds=input_tensor, + past_key_values_length=past_seen_tokens, + is_training=self.training, + ): + return None + + dtype = input_tensor.dtype + sequence_length = input_tensor.shape[1] + if using_compilable_cache: + target_length = past_key_values.get_max_cache_shape() + else: + target_length = ( + attention_mask.shape[-1] + if isinstance(attention_mask, torch.Tensor) + else past_seen_tokens + sequence_length + 1 + ) + + # In case the provided `attention` mask is 2D, we generate a causal mask here (4D). + causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position( + attention_mask, + sequence_length=sequence_length, + target_length=target_length, + dtype=dtype, + cache_position=cache_position, + batch_size=input_tensor.shape[0], + ) + + if ( + self.config._attn_implementation == "sdpa" + and attention_mask is not None + and attention_mask.device.type in ["cuda", "xpu", "npu"] + and not output_attentions + ): + # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when + # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. + # Details: https://github.com/pytorch/pytorch/issues/110213 + min_dtype = torch.finfo(dtype).min + causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) + + return causal_mask + + @staticmethod + # Copied from transformers.models.gptj.modeling_gptj.GPTJModel._prepare_4d_causal_attention_mask_with_cache_position + def _prepare_4d_causal_attention_mask_with_cache_position( + attention_mask: torch.Tensor, + sequence_length: int, + target_length: int, + dtype: torch.dtype, + cache_position: torch.Tensor, + batch_size: int, + **kwargs, + ): + """ + Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape + `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. + + Args: + attention_mask (`torch.Tensor`): + A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape + `(batch_size, 1, query_length, key_value_length)`. + sequence_length (`int`): + The sequence length being processed. + target_length (`int`): + The target length: when generating with static cache, the mask should be as long as the static cache, + to account for the 0 padding, the part of the cache that is not filled yet. + dtype (`torch.dtype`): + The dtype to use for the 4D attention mask. + cache_position (`torch.Tensor`): + Indices depicting the position of the input sequence tokens in the sequence. + batch_size (`torch.Tensor`): + Batch size. + """ + if attention_mask is not None and attention_mask.dim() == 4: + # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. 
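+ # ("Inverted" means the additive convention: 0.0 where attention is allowed and a very
+ # large negative value, torch.finfo(dtype).min, where it is masked.)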
+ causal_mask = attention_mask + else: + min_dtype = torch.finfo(dtype).min + causal_mask = torch.full( + (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device + ) + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to( + causal_mask.device + ) + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + + return causal_mask + + +@auto_docstring( + custom_intro=""" + The Bloom Model transformer with a language modeling head on top (linear layer with weights tied to the input + embeddings). + """ +) +class BloomForCausalLM(BloomPreTrainedModel, GenerationMixin): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config: BloomConfig): + super().__init__(config) + self.transformer = BloomModel(config) + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def set_output_embeddings(self, new_embeddings: torch.Tensor): + self.lm_head = new_embeddings + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + attention_mask=None, + inputs_embeds=None, + cache_position=None, + use_cache=True, + **kwargs, + ): + # Overwritten because of the fixed-shape attention mask creation + + # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens + # Exception 1: when passing input_embeds, input_ids may be missing entries + # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here + # Exception 3: with synced GPUs cache_position may go out of bounds, but we only want dummy token in that case. + # (we can't check exception 3 while compiling) + # Exception 4: If input_embeds are passed then slice it through `cache_position`, to keep only the unprocessed tokens and + # generate the first token for each sequence. Later use the generated Input ids for continuation. + if past_key_values is not None: + if inputs_embeds is not None and input_ids.shape[1] == 0: # Exception 4 + inputs_embeds = inputs_embeds[:, -cache_position.shape[0] :] + elif ( + inputs_embeds is not None # Exception 1 + or cache_position[-1] >= input_ids.shape[1] # Exception 3 + ): + input_ids = input_ids[:, -cache_position.shape[0] :] + elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2) + input_ids = input_ids[:, cache_position] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and len(cache_position) == inputs_embeds.shape[1]: + model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None} + else: + # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s `mode="reduce-overhead`, as otherwise the + # input `position_ids` would have various stride during the decoding. 
Here, simply using `.contiguous()` is not sufficient as in + # the batch size = 1 case, `position_ids` is already contiguous but with varying stride which retriggers a capture. + model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None} + + # This part differs from other models because BLOOM needs a 2D mask to construct alibi tensor + # The only difference is the usage of 2D instead of 4D mask, but the shape will be static + if isinstance(past_key_values, StaticCache) and attention_mask is not None: + target_length = past_key_values.get_max_cache_shape() + batch_size, seq_length = attention_mask.shape + diff = target_length - seq_length + + new_attn_mask = torch.zeros(batch_size, diff, device=attention_mask.device, dtype=attention_mask.dtype) + attention_mask = torch.cat( + [attention_mask, new_attn_mask], + dim=-1, + ) + + model_inputs.update( + { + "cache_position": cache_position, + "past_key_values": past_key_values, + "use_cache": use_cache, + "attention_mask": attention_mask, + } + ) + + # Forward ALL kwargs that are uninitialized (e.g. `use_cache`). + for key, value in kwargs.items(): + if key not in model_inputs: + model_inputs[key] = value + + return model_inputs + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Union[Cache, tuple[tuple[torch.Tensor, torch.Tensor], ...]]] = None, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **deprecated_arguments, + ) -> Union[tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: + r""" + input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`): + `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values.get_seq_length()` + (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary. + + If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as + `input_ids`. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set + `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` + are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` + """ + # Bloom has deprecated kwargs, so we need to pop num_items_in_batch explicitly + num_items_in_batch = deprecated_arguments.pop("num_items_in_batch", None) + if deprecated_arguments.pop("position_ids", False) is not False: + # `position_ids` could have been `torch.Tensor` or `None` so defaulting pop to `False` allows to detect if users were passing explicitly `None` + warnings.warn( + "`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. 
You can safely ignore" + " passing `position_ids`.", + FutureWarning, + ) + if len(deprecated_arguments) > 0: + raise ValueError(f"Got unexpected arguments: {deprecated_arguments}") + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + ) + hidden_states = transformer_outputs[0] + + lm_logits = self.lm_head(hidden_states) + + loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(lm_logits.device) + # Flatten the tokens + loss = self.loss_function( + lm_logits, + labels, + vocab_size=self.config.vocab_size, + num_items_in_batch=num_items_in_batch, + ) + + if not return_dict: + output = (lm_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=loss, + logits=lm_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + +@auto_docstring( + custom_intro=""" + The Bloom Model transformer with a sequence classification head on top (linear layer). + + [`BloomForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-1) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). + """ +) +class BloomForSequenceClassification(BloomPreTrainedModel): + def __init__(self, config: BloomConfig): + super().__init__(config) + self.num_labels = config.num_labels + self.transformer = BloomModel(config) + self.score = nn.Linear(config.hidden_size, config.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Union[Cache, tuple[tuple[torch.Tensor, torch.Tensor], ...]]] = None, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **deprecated_arguments, + ) -> Union[tuple[torch.Tensor], SequenceClassifierOutputWithPast]: + r""" + input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`): + `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values.get_seq_length()` + (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary. + + If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as + `input_ids`. 
+ + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + if deprecated_arguments.pop("position_ids", False) is not False: + # `position_ids` could have been `torch.Tensor` or `None` so defaulting pop to `False` allows to detect if users were passing explicitly `None` + warnings.warn( + "`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. You can safely ignore" + " passing `position_ids`.", + FutureWarning, + ) + if len(deprecated_arguments) > 0: + raise ValueError(f"Got unexpected arguments: {deprecated_arguments}") + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + last_non_pad_token = -1 + elif input_ids is not None: + # To handle both left- and right- padding, we take the rightmost token that is not equal to pad_token_id + non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32) + token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32) + last_non_pad_token = (token_indices * non_pad_mask).argmax(-1) + else: + last_non_pad_token = -1 + logger.warning_once( + f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. 
Results may be " + "unexpected if using padding tokens in conjunction with `inputs_embeds.`" + ) + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token] + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + +@auto_docstring +class BloomForTokenClassification(BloomPreTrainedModel): + def __init__(self, config: BloomConfig): + super().__init__(config) + self.num_labels = config.num_labels + + self.transformer = BloomModel(config) + if hasattr(config, "classifier_dropout") and config.classifier_dropout is not None: + classifier_dropout = config.classifier_dropout + elif hasattr(config, "hidden_dropout") and config.hidden_dropout is not None: + classifier_dropout = config.hidden_dropout + else: + classifier_dropout = 0.1 + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Union[Cache, tuple[tuple[torch.Tensor, torch.Tensor], ...]]] = None, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **deprecated_arguments, + ) -> Union[tuple[torch.Tensor], TokenClassifierOutput]: + r""" + input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`): + `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values.get_seq_length()` + (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary. + + If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as + `input_ids`. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. 
Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + if deprecated_arguments.pop("position_ids", False) is not False: + # `position_ids` could have been `torch.Tensor` or `None` so defaulting pop to `False` allows to detect if users were passing explicitly `None` + warnings.warn( + "`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. You can safely ignore" + " passing `position_ids`.", + FutureWarning, + ) + if len(deprecated_arguments) > 0: + raise ValueError(f"Got unexpected arguments: {deprecated_arguments}") + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = transformer_outputs[0] + hidden_states = self.dropout(hidden_states) + logits = self.classifier(hidden_states) + + loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(logits.device) + batch_size, seq_length = labels.shape + loss_fct = CrossEntropyLoss() + loss = loss_fct( + logits.view(batch_size * seq_length, self.num_labels), labels.view(batch_size * seq_length) + ) + + if not return_dict: + output = (logits,) + transformer_outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + +@auto_docstring +class BloomForQuestionAnswering(BloomPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.transformer = BloomModel(config) + self.qa_outputs = nn.Linear(config.hidden_size, 2) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + start_positions: Optional[torch.LongTensor] = None, + end_positions: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, QuestionAnsweringModelOutput]: + r""" + input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`): + `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values.get_seq_length()` + (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary. + + If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as + `input_ids`. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. 
+ + [What are input IDs?](../glossary#input-ids) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +__all__ = [ + "BloomForCausalLM", + "BloomModel", + "BloomPreTrainedModel", + "BloomForSequenceClassification", + "BloomForTokenClassification", + "BloomForQuestionAnswering", +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bloom/modeling_flax_bloom.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bloom/modeling_flax_bloom.py new file mode 100644 index 0000000000000000000000000000000000000000..c7bb1cc9c9a5b098f9c03d6313818e744e61951e --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bloom/modeling_flax_bloom.py @@ -0,0 +1,737 @@ +# coding=utf-8 +# Copyright 2023 HuggingFace Inc. Team and Bigscience Workshop. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
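As a side note on `BloomForQuestionAnswering` above: the span loss clamps start/end positions that fall outside the model inputs to `ignored_index` and then excludes them through `ignore_index`, averaging the start and end cross-entropy terms. Below is a minimal, self-contained sketch of that pattern; all shapes and target values are invented for illustration and are not taken from the diff.

```python
import torch
from torch.nn import CrossEntropyLoss

# Invented example: batch of 2, sequence length of 8.
start_logits = torch.randn(2, 8)
end_logits = torch.randn(2, 8)
start_positions = torch.tensor([3, 12])  # 12 lies outside the 8-token input on purpose
end_positions = torch.tensor([5, 15])

# Out-of-range positions are clamped to `ignored_index` (== seq_len) and then
# skipped by CrossEntropyLoss via `ignore_index`, mirroring the head above.
ignored_index = start_logits.size(1)
start_positions = start_positions.clamp(0, ignored_index)
end_positions = end_positions.clamp(0, ignored_index)

loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
total_loss = (loss_fct(start_logits, start_positions) + loss_fct(end_logits, end_positions)) / 2
print(total_loss)
```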
+"""Flax BLOOM model.""" + +import math +from functools import partial +from typing import Optional + +import flax.linen as nn +import jax +import jax.numpy as jnp +from flax.core.frozen_dict import FrozenDict, freeze, unfreeze +from flax.linen import combine_masks, dot_product_attention_weights, make_causal_mask +from flax.linen.activation import tanh +from flax.traverse_util import flatten_dict, unflatten_dict +from jax import lax + +from ...modeling_flax_outputs import ( + FlaxBaseModelOutput, + FlaxBaseModelOutputWithPastAndCrossAttentions, + FlaxCausalLMOutput, +) +from ...modeling_flax_utils import FlaxPreTrainedModel, append_call_sample_docstring +from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging +from .configuration_bloom import BloomConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "bigscience/bloom" +_CONFIG_FOR_DOC = "BloomConfig" + + +BLOOM_START_DOCSTRING = r""" + + This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a Flax Linen + [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a + regular Flax Module and refer to the Flax documentation for all matter related to general usage and behavior. + + Finally, this model supports inherent JAX features such as: + + - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit) + - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation) + - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap) + - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap) + + Parameters: + config ([`BloomConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights. + dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`): + The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and + `jax.numpy.bfloat16` (on TPUs). + + This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If + specified all the computation will be performed with the given `dtype`. + + **Note that this only specifies the dtype of the computation and does not influence the dtype of model + parameters.** + + If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and + [`~FlaxPreTrainedModel.to_bf16`]. +""" + +BLOOM_INPUTS_DOCSTRING = r""" + Args: + input_ids (`numpy.ndarray` of shape `(batch_size, input_ids_length)`): + `input_ids_length` = `sequence_length`. Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`BloomTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. 
+
+            [What are attention masks?](../glossary#attention-mask)
+        past_key_values (`dict[str, np.ndarray]`, *optional*, returned by `init_cache` or when passing previous `past_key_values`):
+            Dictionary of pre-computed hidden-states (key and values in the attention blocks) that can be used for fast
+            auto-regressive decoding. Pre-computed key and value hidden-states are of shape *[batch_size, max_length]*.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+def build_alibi_tensor(attention_mask: jnp.ndarray, num_heads: int, dtype: Optional[jnp.dtype] = jnp.float32):
+    """
+    Flax implementation of the BLOOM Alibi tensor. BLOOM Alibi tensor is not causal as the original paper mentions, it
+    relies on a translation invariance of softmax for quick implementation: with l being a tensor, and a fixed value
+    `softmax(l+a) = softmax(l)`. Based on
+    https://github.com/ofirpress/attention_with_linear_biases/blob/a35aaca144e0eb6b789dfcb46784c4b8e31b7983/fairseq/models/transformer.py#L742
+    Link to paper: https://huggingface.co/papers/2108.12409
+
+    Args:
+        attention_mask (`jnp.ndarray`):
+            Token-wise attention mask, this should be of shape `(batch_size, max_seq_len)`.
+        num_heads (`int`):
+            Number of attention heads.
+        dtype (`jnp.dtype`, *optional*, defaults to `jnp.float32`):
+            The data type (dtype) of the output tensor.
+
+    Returns: Alibi tensor of shape `(batch_size * num_heads, 1, max_seq_len)`.
+    """
+    batch_size, seq_length = attention_mask.shape
+    closest_power_of_2 = 2 ** math.floor(math.log2(num_heads))
+    base = jnp.array(2 ** (-(2 ** -(math.log2(closest_power_of_2) - 3))), dtype=jnp.float32)
+    powers = jnp.arange(1, 1 + closest_power_of_2, dtype=jnp.float32)
+    slopes = jax.lax.pow(base, powers)
+
+    if closest_power_of_2 != num_heads:
+        extra_base = jnp.array(2 ** (-(2 ** -(math.log2(2 * closest_power_of_2) - 3))), dtype=jnp.float32)
+        num_remaining_heads = min(closest_power_of_2, num_heads - closest_power_of_2)
+        extra_powers = jnp.arange(1, 1 + 2 * num_remaining_heads, 2, dtype=jnp.float32)
+        slopes = jnp.concatenate([slopes, jax.lax.pow(extra_base, extra_powers)], axis=0)
+
+    # Note: the Alibi tensor will be added to the attention bias that will be applied to the query, key product of attention
+    # therefore, Alibi will have to be of shape (batch_size, num_heads, query_length, key_length)
+    # => here we set (batch_size=1, num_heads=num_heads, query_length=1, key_length=max_length)
+    # so that the query_length dimension will then be broadcast correctly.
+ # This is more or less identical to T5's relative position bias: + # https://github.com/huggingface/transformers/blob/f681437203baa7671de3174b0fa583c349d9d5e1/src/transformers/models/t5/modeling_t5.py#L527 + arange_tensor = ((attention_mask.cumsum(axis=-1) - 1) * attention_mask)[:, None, :] + alibi = slopes[..., None] * arange_tensor + alibi = jnp.expand_dims(alibi, axis=2) + return jnp.asarray(alibi, dtype) + + +class FlaxBloomAttention(nn.Module): + config: BloomConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.hidden_size = self.config.hidden_size + self.num_heads = self.config.n_head + self.head_dim = self.hidden_size // self.num_heads + self.attention_softmax_in_fp32 = self.dtype is not jnp.float32 + + if self.head_dim * self.num_heads != self.hidden_size: + raise ValueError( + f"`hidden_size` must be divisible by `num_heads` (got `hidden_size`: {self.hidden_size} and " + f"`num_heads`: {self.num_heads})." + ) + + dense = partial( + nn.Dense, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + ) + + self.query_key_value = dense(self.hidden_size * 3) + self.dense = dense(self.hidden_size) + self.resid_dropout = nn.Dropout(rate=self.config.hidden_dropout) + + def _split_heads(self, hidden_states): + return hidden_states.reshape(hidden_states.shape[:-1] + (self.num_heads, self.head_dim * 3)) + + def _merge_heads(self, hidden_states): + return hidden_states.reshape(hidden_states.shape[:2] + (self.hidden_size,)) + + @nn.compact + # Copied from transformers.models.gptj.modeling_flax_gptj.FlaxGPTJAttention._concatenate_to_cache + def _concatenate_to_cache(self, key, value, query, attention_mask): + """ + This function takes projected key, value states from a single input token and concatenates the states to cached + states from previous steps. This function is slightly adapted from the official Flax repository: + https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252 + """ + # detect if we're initializing by absence of existing cache data. + is_initialized = self.has_variable("cache", "cached_key") + cached_key = self.variable("cache", "cached_key", jnp.zeros, key.shape, key.dtype) + cached_value = self.variable("cache", "cached_value", jnp.zeros, value.shape, value.dtype) + cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32)) + + if is_initialized: + *batch_dims, max_length, num_heads, depth_per_head = cached_key.value.shape + # update key, value caches with our new 1d spatial slices + cur_index = cache_index.value + indices = (0,) * len(batch_dims) + (cur_index, 0, 0) + key = lax.dynamic_update_slice(cached_key.value, key, indices) + value = lax.dynamic_update_slice(cached_value.value, value, indices) + cached_key.value = key + cached_value.value = value + num_updated_cache_vectors = query.shape[1] + cache_index.value = cache_index.value + num_updated_cache_vectors + # causal mask for cached decoder self-attention: our single query position should only attend to those key + # positions that have already been generated and cached, not the remaining zero elements. 
+ pad_mask = jnp.broadcast_to( + jnp.arange(max_length) < cur_index + num_updated_cache_vectors, + tuple(batch_dims) + (1, num_updated_cache_vectors, max_length), + ) + attention_mask = combine_masks(pad_mask, attention_mask) + return key, value, attention_mask + + def __call__( + self, + hidden_states, + residual, + alibi, + attention_mask=None, + deterministic: bool = True, + init_cache: bool = False, + output_attentions: bool = False, + ): + batch_size, seq_length = hidden_states.shape[:2] + + # proj q, k, v + fused_qkv = self.query_key_value(hidden_states) + fused_qkv = self._split_heads(fused_qkv) + query, key, value = jnp.split(fused_qkv, 3, axis=-1) + + causal_attention_mask = make_causal_mask(attention_mask, dtype="bool") + + # for fast decoding causal attention mask should be shifted + causal_attention_mask_shift = ( + self.variables["cache"]["cache_index"] if self.has_variable("cache", "cached_key") else 0 + ) + + # fast decoding for generate requires special attention_mask + if self.has_variable("cache", "cached_key"): + max_decoder_length = self.variables["cache"]["cached_key"].shape[1] + causal_attention_mask = jax.lax.dynamic_slice( + causal_attention_mask, + (0, 0, causal_attention_mask_shift, 0), + (1, 1, seq_length, max_decoder_length), + ) + + # broadcast causal attention mask & attention mask to fit for merge + causal_attention_mask = jnp.broadcast_to( + causal_attention_mask, (batch_size,) + causal_attention_mask.shape[1:] + ) + attention_mask = jnp.broadcast_to(jnp.expand_dims(attention_mask, axis=(-3, -2)), causal_attention_mask.shape) + attention_mask = combine_masks(attention_mask, causal_attention_mask) + + dropout_rng = None + if not deterministic and self.config.attention_dropout > 0.0: + dropout_rng = self.make_rng("dropout") + + # During fast autoregressive decoding, we feed one position at a time, + # and cache the keys and values step by step. 
+ if self.has_variable("cache", "cached_key") or init_cache: + key, value, attention_mask = self._concatenate_to_cache(key, value, query, attention_mask) + + # transform boolean mask into float mask + mask_value = jnp.finfo(self.dtype).min + attention_bias = lax.select( + attention_mask > 0, + jnp.full(attention_mask.shape, 0.0).astype(self.dtype), + jnp.full(attention_mask.shape, mask_value).astype(self.dtype), + ) + + attention_bias = attention_bias + alibi + + # Cast in fp32 if the original dtype is different from fp32 + attention_dtype = jnp.float32 if self.attention_softmax_in_fp32 else self.dtype + + attn_weights = dot_product_attention_weights( + query, + key, + bias=attention_bias, + dropout_rng=dropout_rng, + dropout_rate=self.config.attention_dropout, + deterministic=deterministic, + dtype=attention_dtype, + ) + + # Cast back in the original dtype if the native dtype is not fp32 + if self.attention_softmax_in_fp32: + attn_weights = attn_weights.astype(self.dtype) + + attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value) + attn_output = self._merge_heads(attn_output) + attn_output = self.dense(attn_output) + attn_output = self.resid_dropout(attn_output, deterministic=deterministic) + + attn_output = attn_output + residual + + outputs = (attn_output, attn_weights) if output_attentions else (attn_output,) + return outputs + + +class BloomGELU(nn.Module): + def setup(self): + self.dtype = jnp.float32 + + def __call__(self, x): + return x * 0.5 * (1.0 + tanh(0.79788456 * x * (1 + 0.044715 * x * x))) + + +class FlaxBloomMLP(nn.Module): + config: BloomConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + hidden_size = self.config.hidden_size + + kernel_init = jax.nn.initializers.normal(self.config.initializer_range) + + self.dense_h_to_4h = nn.Dense(4 * hidden_size, dtype=self.dtype, kernel_init=kernel_init) + self.dense_4h_to_h = nn.Dense(hidden_size, dtype=self.dtype, kernel_init=kernel_init) + self.hidden_dropout = nn.Dropout(self.config.hidden_dropout) + self.act = BloomGELU() + + def __call__(self, hidden_states, residual, deterministic: bool = True): + hidden_states = self.dense_h_to_4h(hidden_states) + hidden_states = self.act(hidden_states) + + intermediate_output = self.dense_4h_to_h(hidden_states) + + intermediate_output = intermediate_output + residual + hidden_states = self.hidden_dropout(intermediate_output, deterministic=deterministic) + + return hidden_states + + +class FlaxBloomBlock(nn.Module): + config: BloomConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.input_layernorm = nn.LayerNorm(epsilon=self.config.layer_norm_epsilon, dtype=self.dtype) + + self.self_attention = FlaxBloomAttention(self.config, dtype=self.dtype) + self.post_attention_layernorm = nn.LayerNorm(epsilon=self.config.layer_norm_epsilon, dtype=self.dtype) + + self.mlp = FlaxBloomMLP(self.config, dtype=self.dtype) + + self.apply_residual_connection_post_layernorm = self.config.apply_residual_connection_post_layernorm + self.hidden_dropout = self.config.hidden_dropout + + def __call__( + self, + hidden_states, + alibi, + attention_mask=None, + deterministic: bool = True, + init_cache: bool = False, + output_attentions: bool = False, + ): + layernorm_output = self.input_layernorm(hidden_states) + + # layer norm before saving residual if config calls for it + if self.apply_residual_connection_post_layernorm: + residual = layernorm_output + else: + residual = hidden_states + + # self-attention + attn_outputs = self.self_attention( + layernorm_output, + 
residual=residual, + alibi=alibi, + attention_mask=attention_mask, + deterministic=deterministic, + init_cache=init_cache, + output_attentions=output_attentions, + ) + + attention_output = attn_outputs[0] + + outputs = attn_outputs[1:] + + post_layernorm = self.post_attention_layernorm(attention_output) + + # set residual based on config + if self.apply_residual_connection_post_layernorm: + residual = post_layernorm + else: + residual = attention_output + + output = self.mlp(post_layernorm, residual, deterministic=deterministic) + + outputs = (output,) + outputs + + return outputs + + +class FlaxBloomPreTrainedModel(FlaxPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = BloomConfig + base_model_prefix = "transformer" + module_class: nn.Module = None + + def __init__( + self, + config: BloomConfig, + input_shape: tuple = (1, 1), + seed: int = 0, + dtype: jnp.dtype = jnp.float32, + _do_init: bool = True, + **kwargs, + ): + module = self.module_class(config=config, dtype=dtype, **kwargs) + super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) + + def init_weights(self, rng: jax.random.PRNGKey, input_shape: tuple, params: FrozenDict = None) -> FrozenDict: + # init input tensors + input_ids = jnp.zeros(input_shape, dtype="i4") + attention_mask = jnp.ones_like(input_ids) + params_rng, dropout_rng = jax.random.split(rng) + rngs = {"params": params_rng, "dropout": dropout_rng} + + random_params = self.module.init(rngs, input_ids, attention_mask, return_dict=False)["params"] + + if params is not None: + random_params = flatten_dict(unfreeze(random_params)) + params = flatten_dict(unfreeze(params)) + for missing_key in self._missing_keys: + params[missing_key] = random_params[missing_key] + self._missing_keys = set() + return freeze(unflatten_dict(params)) + else: + return random_params + + def init_cache(self, batch_size, max_length): + r""" + Args: + batch_size (`int`): + batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache. + max_length (`int`): + maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized + cache. 
+ """ + # init input variables to retrieve cache + input_ids = jnp.ones((batch_size, max_length), dtype="i4") + attention_mask = jnp.ones_like(input_ids) + + init_variables = self.module.init( + jax.random.PRNGKey(0), input_ids, attention_mask, return_dict=False, init_cache=True + ) + return unfreeze(init_variables["cache"]) + + @add_start_docstrings_to_model_forward(BLOOM_INPUTS_DOCSTRING) + def __call__( + self, + input_ids, + attention_mask=None, + past_key_values: Optional[dict] = None, + params: Optional[dict] = None, + dropout_rng: jax.random.PRNGKey = None, + train: bool = False, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + batch_size, sequence_length = input_ids.shape + + if attention_mask is None: + attention_mask = jnp.ones((batch_size, sequence_length)) + + # Handle any PRNG if needed + rngs = {} + if dropout_rng is not None: + rngs["dropout"] = dropout_rng + + inputs = {"params": params or self.params} + + # If past_key_values are passed then cache is already initialized a private flag init_cache has to be passed + # down to ensure cache is used. It has to be made sure that cache is marked as mutable so that it can be + # changed by FlaxBloomAttention module + if past_key_values: + inputs["cache"] = past_key_values + mutable = ["cache"] + else: + mutable = False + + outputs = self.module.apply( + inputs, + jnp.array(input_ids, dtype="i4"), + jnp.array(attention_mask, dtype="i4"), + not train, + False, + output_attentions, + output_hidden_states, + return_dict, + rngs=rngs, + mutable=mutable, + ) + + # add updated cache to model output + if past_key_values is not None and return_dict: + outputs, past_key_values = outputs + outputs["past_key_values"] = unfreeze(past_key_values["cache"]) + return outputs + elif past_key_values is not None and not return_dict: + outputs, past_key_values = outputs + outputs = outputs[:1] + (unfreeze(past_key_values["cache"]),) + outputs[1:] + + return outputs + + +class FlaxBloomBlockCollection(nn.Module): + config: BloomConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.layers = [ + FlaxBloomBlock(self.config, name=str(layer_number), dtype=self.dtype) + for layer_number in range(self.config.num_hidden_layers) + ] + + def __call__( + self, + hidden_states, + alibi, + attention_mask=None, + deterministic: bool = True, + init_cache: bool = False, + output_attentions: bool = False, + output_hidden_states: bool = False, + ): + all_attentions = () if output_attentions else None + all_hidden_states = () if output_hidden_states else None + + for layer_number in range(self.config.num_hidden_layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + layer_outputs = self.layers[layer_number]( + hidden_states, + alibi=alibi, + attention_mask=attention_mask, + deterministic=deterministic, + init_cache=init_cache, + output_attentions=output_attentions, + ) + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions += (layer_outputs[1],) + + # this contains possible `None` values - `FlaxBloomModule` will filter them out + outputs = (hidden_states, all_hidden_states, all_attentions) + + return 
outputs + + +class FlaxBloomModule(nn.Module): + config: BloomConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.embed_dim = self.config.hidden_size + + # word embeddings (no positional embedding layer) + self.word_embeddings = nn.Embed( + self.config.vocab_size, + self.embed_dim, + embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range), + dtype=self.dtype, + ) + + # post-embedding layernorm + self.word_embeddings_layernorm = nn.LayerNorm(epsilon=self.config.layer_norm_epsilon, dtype=self.dtype) + + # transformer layers + self.h = FlaxBloomBlockCollection(self.config, dtype=self.dtype) + + # final layernorm + self.ln_f = nn.LayerNorm(epsilon=self.config.layer_norm_epsilon, dtype=self.dtype) + + def __call__( + self, + input_ids=None, + attention_mask=None, + deterministic=True, + init_cache: bool = False, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + inputs_embeds = self.word_embeddings(input_ids) + # do post-embedding layernorm + hidden_states = self.word_embeddings_layernorm(inputs_embeds) + + # build alibi depending on `attention_mask` + alibi = build_alibi_tensor(attention_mask, self.config.n_head, dtype=hidden_states.dtype) + + outputs = self.h( + hidden_states, + alibi=alibi, + attention_mask=attention_mask, + deterministic=deterministic, + init_cache=init_cache, + output_hidden_states=output_hidden_states, + output_attentions=output_attentions, + ) + + hidden_states = outputs[0] + hidden_states = self.ln_f(hidden_states) + + if output_hidden_states: + all_hidden_states = outputs[1] + (hidden_states,) + outputs = (hidden_states, all_hidden_states) + outputs[2:] + else: + outputs = (hidden_states,) + outputs[1:] + + if not return_dict: + return tuple(v for v in [outputs[0], outputs[-1]] if v is not None) + + return FlaxBaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + hidden_states=outputs[1], + attentions=outputs[-1], + ) + + +@add_start_docstrings( + "The bare Bloom Model transformer outputting raw hidden-states without any specific head on top.", + BLOOM_START_DOCSTRING, +) +# Copied from transformers.models.gpt_neo.modeling_flax_gpt_neo.FlaxGPTNeoModel with GPTNeo->Bloom +class FlaxBloomModel(FlaxBloomPreTrainedModel): + module_class = FlaxBloomModule + + +append_call_sample_docstring(FlaxBloomModel, _CHECKPOINT_FOR_DOC, FlaxBaseModelOutput, _CONFIG_FOR_DOC) + + +class FlaxBloomForCausalLMModule(nn.Module): + config: BloomConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.transformer = FlaxBloomModule(self.config, dtype=self.dtype) + self.lm_head = nn.Dense( + self.config.vocab_size, + use_bias=False, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range), + ) + + def __call__( + self, + input_ids, + attention_mask, + deterministic: bool = True, + init_cache: bool = False, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + deterministic=deterministic, + init_cache=init_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + + if self.config.tie_word_embeddings: + shared_kernel = self.transformer.variables["params"]["word_embeddings"]["embedding"].T + lm_logits = self.lm_head.apply({"params": {"kernel": shared_kernel}}, hidden_states) + else: + lm_logits = 
self.lm_head(hidden_states) + + if not return_dict: + return (lm_logits,) + outputs[1:] + + return FlaxCausalLMOutput(logits=lm_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions) + + +@add_start_docstrings( + """ + The Bloom Model transformer with a language modeling head on top (linear layer with weights tied to the input + embeddings). + """, + BLOOM_START_DOCSTRING, +) +class FlaxBloomForCausalLM(FlaxBloomPreTrainedModel): + module_class = FlaxBloomForCausalLMModule + + def prepare_inputs_for_generation(self, input_ids, max_length, attention_mask: Optional[jax.Array] = None): + # initializing the cache + batch_size, seq_length = input_ids.shape + + past_key_values = self.init_cache(batch_size, max_length) + # Note that usually one would have to put 0's in the attention_mask for + # x > input_ids.shape[-1] and x < cache_length. But since Bloom uses a causal mask, + # those positions are masked anyway. Thus, we can create a single static attention_mask here, + # which is more efficient for compilation + extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4") + if attention_mask is not None: + extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, attention_mask, (0, 0)) + + return { + "past_key_values": past_key_values, + "attention_mask": extended_attention_mask, + } + + def update_inputs_for_generation(self, model_outputs, model_kwargs): + model_kwargs["past_key_values"] = model_outputs.past_key_values + return model_kwargs + + +append_call_sample_docstring(FlaxBloomForCausalLM, _CHECKPOINT_FOR_DOC, FlaxCausalLMOutput, _CONFIG_FOR_DOC) + + +__all__ = ["FlaxBloomForCausalLM", "FlaxBloomModel", "FlaxBloomPreTrainedModel"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bloom/tokenization_bloom_fast.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bloom/tokenization_bloom_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..b7a9f7449a4e9a5c2306ba4853ff5c9d99c00a20 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bloom/tokenization_bloom_fast.py @@ -0,0 +1,152 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for Bloom.""" + +import pickle +from typing import Optional + +from ...tokenization_utils_base import BatchEncoding +from ...tokenization_utils_fast import PreTrainedTokenizerFast +from ...utils import logging + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"tokenizer_file": "tokenizer.json"} + + +class BloomTokenizerFast(PreTrainedTokenizerFast): + """ + Construct a "fast" Bloom tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level + Byte-Pair-Encoding. 
+
+    This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
+    be encoded differently whether it is at the beginning of the sentence (without space) or not:
+
+    ```python
+    >>> from transformers import BloomTokenizerFast
+
+    >>> tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom")
+    >>> tokenizer("Hello world")["input_ids"]
+    [59414, 8876]
+
+    >>> tokenizer(" Hello world")["input_ids"]
+    [86153, 8876]
+    ```
+
+    You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer, but since
+    the model was not pretrained this way, it might yield a decrease in performance.
+
+    <Tip>
+
+    When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`.
+
+    </Tip>
+
+    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (`str`):
+            Path to the vocabulary file.
+        merges_file (`str`):
+            Path to the merges file.
+        errors (`str`, *optional*, defaults to `"replace"`):
+            Paradigm to follow when decoding bytes to UTF-8. See
+            [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
+        unk_token (`str`, *optional*, defaults to `<|endoftext|>`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        bos_token (`str`, *optional*, defaults to `<|endoftext|>`):
+            The beginning of sequence token.
+        eos_token (`str`, *optional*, defaults to `<|endoftext|>`):
+            The end of sequence token.
+        add_prefix_space (`bool`, *optional*, defaults to `False`):
+            Whether or not to add an initial space to the input. This allows to treat the leading word just as any
+            other word. (The Bloom tokenizer detects the beginning of words by the preceding space.)
+        trim_offsets (`bool`, *optional*, defaults to `True`):
+            Whether or not the post-processing step should trim offsets to avoid including whitespaces.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask"]
+    slow_tokenizer_class = None
+    # No `max_model_input_sizes` as BLOOM uses ALiBi positional embeddings
+
+    def __init__(
+        self,
+        vocab_file=None,
+        merges_file=None,
+        tokenizer_file=None,
+        unk_token="<unk>",
+        bos_token="<s>",
+        eos_token="</s>",
+        pad_token="<pad>",
+        add_prefix_space=False,
+        clean_up_tokenization_spaces=False,
+        **kwargs,
+    ):
+        super().__init__(
+            vocab_file=vocab_file,
+            merges_file=merges_file,
+            tokenizer_file=tokenizer_file,
+            unk_token=unk_token,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            pad_token=pad_token,
+            add_prefix_space=add_prefix_space,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            **kwargs,
+        )
+        # TODO @ArthurZucker this can only work one way for now, to update later-on. Tests should also properly
+        # check this as they were green before.
+ pre_tok_state = pickle.dumps(self.backend_tokenizer.pre_tokenizer) + decoder_state = pickle.dumps(self.backend_tokenizer.decoder) + + if add_prefix_space: + pre_tok_state = pre_tok_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true') + decoder_state = decoder_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true') + self.backend_tokenizer.pre_tokenizer = pickle.loads(pre_tok_state) + self.backend_tokenizer.decoder = pickle.loads(decoder_state) + + self.add_prefix_space = add_prefix_space + + def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding: + is_split_into_words = kwargs.get("is_split_into_words", False) + if not (self.add_prefix_space or not is_split_into_words): + raise Exception( + f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True to use it with" + " pretokenized inputs." + ) + + return super()._batch_encode_plus(*args, **kwargs) + + def _encode_plus(self, *args, **kwargs) -> BatchEncoding: + is_split_into_words = kwargs.get("is_split_into_words", False) + + if not (self.add_prefix_space or not is_split_into_words): + raise Exception( + f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True to use it with" + " pretokenized inputs." + ) + + return super()._encode_plus(*args, **kwargs) + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + files = self._tokenizer.model.save(save_directory, name=filename_prefix) + return tuple(files) + + +__all__ = ["BloomTokenizerFast"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bros/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bros/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2178cfd03a40593bc9acb97111204aab0423860c --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bros/__init__.py @@ -0,0 +1,28 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
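A readability note on the `add_prefix_space` guard in `BloomTokenizerFast._batch_encode_plus` / `_encode_plus` above: the condition `not (self.add_prefix_space or not is_split_into_words)` is a double-negated way of saying "pretokenized input without a prefix space". The stand-alone sketch below checks that equivalence; the helper name is hypothetical and not part of the library.

```python
# Hypothetical helper, logically equivalent to the guard used by the tokenizer above.
def needs_prefix_space_error(add_prefix_space: bool, is_split_into_words: bool) -> bool:
    original_form = not (add_prefix_space or not is_split_into_words)
    plainer_form = is_split_into_words and not add_prefix_space
    assert original_form == plainer_form  # the two conditions agree for every input
    return plainer_form


# Only pretokenized input without add_prefix_space trips the guard.
for add_prefix_space in (False, True):
    for is_split_into_words in (False, True):
        print(add_prefix_space, is_split_into_words, needs_prefix_space_error(add_prefix_space, is_split_into_words))
```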
+from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_bros import * + from .modeling_bros import * + from .processing_bros import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bros/configuration_bros.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bros/configuration_bros.py new file mode 100644 index 0000000000000000000000000000000000000000..84c9989f309fb782b47aed37f9728dd7a26ba6b8 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bros/configuration_bros.py @@ -0,0 +1,138 @@ +# coding=utf-8 +# Copyright 2023-present NAVER Corp, The Microsoft Research Asia LayoutLM Team Authors and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Bros model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class BrosConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`BrosModel`] or a [`TFBrosModel`]. It is used to + instantiate a Bros model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the Bros + [jinho8345/bros-base-uncased](https://huggingface.co/jinho8345/bros-base-uncased) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the Bros model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`BrosModel`] or [`TFBrosModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. 
+ attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling [`BrosModel`] or [`TFBrosModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + pad_token_id (`int`, *optional*, defaults to 0): + The index of the padding token in the token vocabulary. + dim_bbox (`int`, *optional*, defaults to 8): + The dimension of the bounding box coordinates. (x0, y1, x1, y0, x1, y1, x0, y1) + bbox_scale (`float`, *optional*, defaults to 100.0): + The scale factor of the bounding box coordinates. + n_relations (`int`, *optional*, defaults to 1): + The number of relations for SpadeEE(entity extraction), SpadeEL(entity linking) head. + classifier_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the classifier head. + + + Examples: + + ```python + >>> from transformers import BrosConfig, BrosModel + + >>> # Initializing a BROS jinho8345/bros-base-uncased style configuration + >>> configuration = BrosConfig() + + >>> # Initializing a model from the jinho8345/bros-base-uncased style configuration + >>> model = BrosModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "bros" + + def __init__( + self, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + dim_bbox=8, + bbox_scale=100.0, + n_relations=1, + classifier_dropout_prob=0.1, + **kwargs, + ): + super().__init__( + vocab_size=vocab_size, + hidden_size=hidden_size, + num_hidden_layers=num_hidden_layers, + num_attention_heads=num_attention_heads, + intermediate_size=intermediate_size, + hidden_act=hidden_act, + hidden_dropout_prob=hidden_dropout_prob, + attention_probs_dropout_prob=attention_probs_dropout_prob, + max_position_embeddings=max_position_embeddings, + type_vocab_size=type_vocab_size, + initializer_range=initializer_range, + layer_norm_eps=layer_norm_eps, + pad_token_id=pad_token_id, + **kwargs, + ) + + self.dim_bbox = dim_bbox + self.bbox_scale = bbox_scale + self.n_relations = n_relations + self.dim_bbox_sinusoid_emb_2d = self.hidden_size // 4 + self.dim_bbox_sinusoid_emb_1d = self.dim_bbox_sinusoid_emb_2d // self.dim_bbox + self.dim_bbox_projection = self.hidden_size // self.num_attention_heads + self.classifier_dropout_prob = classifier_dropout_prob + + +__all__ = ["BrosConfig"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bros/modeling_bros.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bros/modeling_bros.py new file mode 100644 index 0000000000000000000000000000000000000000..d01a4c5a1c6da7644e6b3433d1657f5fe4f93849 --- /dev/null +++ 
b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bros/modeling_bros.py @@ -0,0 +1,1136 @@ +# coding=utf-8 +# Copyright 2023-present NAVER Corp, The Microsoft Research Asia LayoutLM Team Authors and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch Bros model.""" + +import math +from dataclasses import dataclass +from typing import Optional, Union + +import torch +from torch import nn +from torch.nn import CrossEntropyLoss + +from ...activations import ACT2FN +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import ( + BaseModelOutputWithCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + TokenClassifierOutput, +) +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import ModelOutput, auto_docstring, can_return_tuple, logging +from .configuration_bros import BrosConfig + + +logger = logging.get_logger(__name__) + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for outputs of token classification models. + """ +) +class BrosSpadeOutput(ModelOutput): + r""" + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Classification loss. + initial_token_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`): + Classification scores for entity initial tokens (before SoftMax). + subsequent_token_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, sequence_length+1)`): + Classification scores for entity sequence tokens (before SoftMax). 
+ """ + + loss: Optional[torch.FloatTensor] = None + initial_token_logits: Optional[torch.FloatTensor] = None + subsequent_token_logits: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + + +class BrosPositionalEmbedding1D(nn.Module): + # Reference: https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/mem_transformer.py#L15 + + def __init__(self, config): + super().__init__() + + self.dim_bbox_sinusoid_emb_1d = config.dim_bbox_sinusoid_emb_1d + + inv_freq = 1 / ( + 10000 ** (torch.arange(0.0, self.dim_bbox_sinusoid_emb_1d, 2.0) / self.dim_bbox_sinusoid_emb_1d) + ) + self.register_buffer("inv_freq", inv_freq) + + def forward(self, pos_seq: torch.Tensor) -> torch.Tensor: + seq_size = pos_seq.size() + b1, b2, b3 = seq_size + sinusoid_inp = pos_seq.view(b1, b2, b3, 1) * self.inv_freq.view(1, 1, 1, self.dim_bbox_sinusoid_emb_1d // 2) + pos_emb = torch.cat([sinusoid_inp.sin(), sinusoid_inp.cos()], dim=-1) + return pos_emb + + +class BrosPositionalEmbedding2D(nn.Module): + def __init__(self, config): + super().__init__() + + self.dim_bbox = config.dim_bbox + self.x_pos_emb = BrosPositionalEmbedding1D(config) + self.y_pos_emb = BrosPositionalEmbedding1D(config) + + def forward(self, bbox: torch.Tensor) -> torch.Tensor: + stack = [] + for i in range(self.dim_bbox): + if i % 2 == 0: + stack.append(self.x_pos_emb(bbox[..., i])) + else: + stack.append(self.y_pos_emb(bbox[..., i])) + bbox_pos_emb = torch.cat(stack, dim=-1) + return bbox_pos_emb + + +class BrosBboxEmbeddings(nn.Module): + def __init__(self, config): + super().__init__() + self.bbox_sinusoid_emb = BrosPositionalEmbedding2D(config) + self.bbox_projection = nn.Linear(config.dim_bbox_sinusoid_emb_2d, config.dim_bbox_projection, bias=False) + + def forward(self, bbox: torch.Tensor): + bbox_t = bbox.transpose(0, 1) + bbox_pos = bbox_t[None, :, :, :] - bbox_t[:, None, :, :] + bbox_pos_emb = self.bbox_sinusoid_emb(bbox_pos) + bbox_pos_emb = self.bbox_projection(bbox_pos_emb) + + return bbox_pos_emb + + +class BrosTextEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "token_type_ids", + torch.zeros( + self.position_ids.size(), + dtype=torch.long, + device=self.position_ids.device, + ), + persistent=False, + ) + + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + if input_ids is not None: + 
input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, :seq_length] + + if token_type_ids is None: + if hasattr(self, "token_type_ids"): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class BrosSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + + self.is_decoder = config.is_decoder + + def forward( + self, + hidden_states: torch.Tensor, + bbox_pos_emb: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[torch.Tensor] = False, + ) -> tuple[torch.Tensor]: + hidden_shape = (hidden_states.shape[0], -1, self.num_attention_heads, self.attention_head_size) + query_layer = self.query(hidden_states).view(hidden_shape).transpose(1, 2) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention: + key_layer = self.key(encoder_hidden_states).view(hidden_shape).transpose(1, 2) + value_layer = self.value(encoder_hidden_states).view(hidden_shape).transpose(1, 2) + attention_mask = encoder_attention_mask + else: + key_layer = self.key(hidden_states).view(hidden_shape).transpose(1, 2) + value_layer = self.value(hidden_states).view(hidden_shape).transpose(1, 2) + + # Take the dot product between "query" and "key" to get the raw attention scores. 
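+        # query_layer and key_layer are (batch_size, num_heads, seq_len, head_size), so the matmul below
+        # yields raw scores of shape (batch_size, num_heads, seq_len, seq_len) before the bbox and scaling terms.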
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + # bbox positional encoding + batch_size, n_head, seq_length, d_head = query_layer.shape + bbox_pos_emb = bbox_pos_emb.view(seq_length, seq_length, batch_size, d_head) + bbox_pos_emb = bbox_pos_emb.permute([2, 0, 1, 3]) + bbox_pos_scores = torch.einsum("bnid,bijd->bnij", (query_layer, bbox_pos_emb)) + + attention_scores = attention_scores + bbox_pos_scores + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BrosModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
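+        # Note that nn.Dropout rescales the surviving weights by 1 / (1 - p) during training, so rows of
+        # attention_probs no longer sum to exactly 1 after this step.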
+ attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + if self.is_decoder: + outputs = outputs + (None,) + return outputs + + +# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->Bros +class BrosSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BrosAttention(nn.Module): + def __init__(self, config): + super().__init__() + self.self = BrosSelfAttention(config) + self.output = BrosSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, + self.self.num_attention_heads, + self.self.attention_head_size, + self.pruned_heads, + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + bbox_pos_emb: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> tuple[torch.Tensor]: + self_outputs = self.self( + hidden_states=hidden_states, + bbox_pos_emb=bbox_pos_emb, + attention_mask=attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->Bros +class BrosIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = 
self.intermediate_act_fn(hidden_states) + return hidden_states + + +class BrosOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BrosLayer(GradientCheckpointingLayer): + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = BrosAttention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise Exception(f"{self} should be used as a decoder model if cross attention is added") + self.crossattention = BrosAttention(config) + self.intermediate = BrosIntermediate(config) + self.output = BrosOutput(config) + + def forward( + self, + hidden_states: torch.Tensor, + bbox_pos_emb: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + ) -> tuple[torch.Tensor]: + self_attention_outputs = self.attention( + hidden_states, + bbox_pos_emb=bbox_pos_emb, + attention_mask=attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + else: + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + if self.is_decoder and encoder_hidden_states is not None: + if hasattr(self, "crossattention"): + raise Exception( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`" + ) + + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask=attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output, + ) + outputs = (layer_output,) + outputs + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (None,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class BrosEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList([BrosLayer(config) for _ in range(config.num_hidden_layers)]) + + @can_return_tuple + def forward( + self, + hidden_states: 
torch.Tensor, + bbox_pos_emb: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + return_dict: Optional[bool] = True, + ) -> Union[tuple[torch.Tensor], BaseModelOutputWithCrossAttentions]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + + layer_outputs = layer_module( + hidden_states=hidden_states, + bbox_pos_emb=bbox_pos_emb, + attention_mask=attention_mask, + head_mask=layer_head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + return BaseModelOutputWithCrossAttentions( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +# Copied from transformers.models.bert.modeling_bert.BertPooler with Bert->Bros +class BrosPooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. 
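+        # For BERT-style inputs this first token is the [CLS] token; a dense layer followed by tanh turns
+        # its hidden state into the pooled sequence representation.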
+ first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class BrosRelationExtractor(nn.Module): + def __init__(self, config): + super().__init__() + self.n_relations = config.n_relations + self.backbone_hidden_size = config.hidden_size + self.head_hidden_size = config.hidden_size + self.classifier_dropout_prob = config.classifier_dropout_prob + + self.drop = nn.Dropout(self.classifier_dropout_prob) + self.query = nn.Linear(self.backbone_hidden_size, self.n_relations * self.head_hidden_size) + + self.key = nn.Linear(self.backbone_hidden_size, self.n_relations * self.head_hidden_size) + + self.dummy_node = nn.Parameter(torch.zeros(1, self.backbone_hidden_size)) + + def forward(self, query_layer: torch.Tensor, key_layer: torch.Tensor): + query_layer = self.query(self.drop(query_layer)) + + dummy_vec = self.dummy_node.unsqueeze(0).repeat(1, key_layer.size(1), 1) + key_layer = torch.cat([key_layer, dummy_vec], axis=0) + key_layer = self.key(self.drop(key_layer)) + + query_layer = query_layer.view( + query_layer.size(0), query_layer.size(1), self.n_relations, self.head_hidden_size + ) + key_layer = key_layer.view(key_layer.size(0), key_layer.size(1), self.n_relations, self.head_hidden_size) + + relation_score = torch.matmul( + query_layer.permute(2, 1, 0, 3), key_layer.permute(2, 1, 3, 0) + ) # equivalent to torch.einsum("ibnd,jbnd->nbij", (query_layer, key_layer)) + + return relation_score + + +@auto_docstring +class BrosPreTrainedModel(PreTrainedModel): + config: BrosConfig + base_model_prefix = "bros" + + def _init_weights(self, module: nn.Module): + """Initialize the weights""" + std = self.config.initializer_range + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, BrosRelationExtractor): + nn.init.normal_(module.dummy_node, std=std) + + +@auto_docstring +class BrosModel(BrosPreTrainedModel): + def __init__(self, config, add_pooling_layer=True): + r""" + add_pooling_layer (bool, *optional*, defaults to `True`): + Whether to add a pooling layer + """ + super().__init__(config) + self.config = config + + self.embeddings = BrosTextEmbeddings(config) + self.bbox_embeddings = BrosBboxEmbeddings(config) + self.encoder = BrosEncoder(config) + + self.pooler = BrosPooler(config) if add_pooling_layer else None + + self.init_weights() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + bbox: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + r""" + bbox ('torch.FloatTensor' of shape '(batch_size, num_boxes, 4)'): + Bounding box coordinates for each token in the input sequence. Each bounding box is a list of four values + (x1, y1, x2, y2), where (x1, y1) is the top left corner, and (x2, y2) is the bottom right corner of the + bounding box. + + Examples: + + ```python + >>> import torch + >>> from transformers import BrosProcessor, BrosModel + + >>> processor = BrosProcessor.from_pretrained("jinho8345/bros-base-uncased") + + >>> model = BrosModel.from_pretrained("jinho8345/bros-base-uncased") + + >>> encoding = processor("Hello, my dog is cute", add_special_tokens=False, return_tensors="pt") + >>> bbox = torch.tensor([[[0, 0, 1, 1]]]).repeat(1, encoding["input_ids"].shape[-1], 1) + >>> encoding["bbox"] = bbox + + >>> outputs = model(**encoding) + >>> last_hidden_states = outputs.last_hidden_state + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if bbox is None: + raise ValueError("You have to specify bbox") + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + if attention_mask is None: + attention_mask = torch.ones(input_shape, device=device) + + if token_type_ids is None: + if hasattr(self.embeddings, "token_type_ids"): + buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
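+        # get_extended_attention_mask (inherited from the PreTrainedModel utilities) converts the 0/1 mask into
+        # an additive mask broadcastable to (batch_size, num_heads, from_seq_length, to_seq_length): 0.0 where
+        # attention is allowed and a large negative value where it is masked, so it can be added to the scores.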
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + ) + + # if bbox has 2 points (4 float tensors) per token, convert it to 4 points (8 float tensors) per token + if bbox.shape[-1] == 4: + bbox = bbox[:, :, [0, 1, 2, 1, 2, 3, 0, 3]] + scaled_bbox = bbox * self.config.bbox_scale + bbox_position_embeddings = self.bbox_embeddings(scaled_bbox) + + encoder_outputs = self.encoder( + embedding_output, + bbox_pos_emb=bbox_position_embeddings, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +@auto_docstring +class BrosForTokenClassification(BrosPreTrainedModel): + _keys_to_ignore_on_load_unexpected = [r"pooler"] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.bros = BrosModel(config) + classifier_dropout = ( + config.classifier_dropout if hasattr(config, "classifier_dropout") else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + self.init_weights() + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + bbox: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + bbox_first_token_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], TokenClassifierOutput]: + r""" + bbox 
('torch.FloatTensor' of shape '(batch_size, num_boxes, 4)'): + Bounding box coordinates for each token in the input sequence. Each bounding box is a list of four values + (x1, y1, x2, y2), where (x1, y1) is the top left corner, and (x2, y2) is the bottom right corner of the + bounding box. + bbox_first_token_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to indicate the first token of each bounding box. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + Examples: + + ```python + >>> import torch + >>> from transformers import BrosProcessor, BrosForTokenClassification + + >>> processor = BrosProcessor.from_pretrained("jinho8345/bros-base-uncased") + + >>> model = BrosForTokenClassification.from_pretrained("jinho8345/bros-base-uncased") + + >>> encoding = processor("Hello, my dog is cute", add_special_tokens=False, return_tensors="pt") + >>> bbox = torch.tensor([[[0, 0, 1, 1]]]).repeat(1, encoding["input_ids"].shape[-1], 1) + >>> encoding["bbox"] = bbox + + >>> outputs = model(**encoding) + ```""" + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bros( + input_ids, + bbox=bbox, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + if bbox_first_token_mask is not None: + bbox_first_token_mask = bbox_first_token_mask.view(-1) + loss = loss_fct( + logits.view(-1, self.num_labels)[bbox_first_token_mask], labels.view(-1)[bbox_first_token_mask] + ) + else: + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@auto_docstring( + custom_intro=""" + Bros Model with a token classification head on top (initial_token_layers and subsequent_token_layer on top of the + hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. The initial_token_classifier is used to + predict the first token of each entity, and the subsequent_token_classifier is used to predict the subsequent + tokens within an entity. Compared to BrosForTokenClassification, this model is more robust to serialization errors + since it predicts next token from one token. 
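+    For every token, the subsequent token classifier scores which position in the sequence (or an extra
+    "no subsequent token" slot) continues the same entity, which is why its logits have size sequence_length + 1.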
+ """ +) +class BrosSpadeEEForTokenClassification(BrosPreTrainedModel): + _keys_to_ignore_on_load_unexpected = [r"pooler"] + + def __init__(self, config): + super().__init__(config) + self.config = config + self.num_labels = config.num_labels + self.n_relations = config.n_relations + self.backbone_hidden_size = config.hidden_size + + self.bros = BrosModel(config) + classifier_dropout = ( + config.classifier_dropout if hasattr(config, "classifier_dropout") else config.hidden_dropout_prob + ) + + # Initial token classification for Entity Extraction (NER) + self.initial_token_classifier = nn.Sequential( + nn.Dropout(classifier_dropout), + nn.Linear(config.hidden_size, config.hidden_size), + nn.Dropout(classifier_dropout), + nn.Linear(config.hidden_size, config.num_labels), + ) + + # Subsequent token classification for Entity Extraction (NER) + self.subsequent_token_classifier = BrosRelationExtractor(config) + + self.init_weights() + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + bbox: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + bbox_first_token_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + initial_token_labels: Optional[torch.Tensor] = None, + subsequent_token_labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], BrosSpadeOutput]: + r""" + bbox ('torch.FloatTensor' of shape '(batch_size, num_boxes, 4)'): + Bounding box coordinates for each token in the input sequence. Each bounding box is a list of four values + (x1, y1, x2, y2), where (x1, y1) is the top left corner, and (x2, y2) is the bottom right corner of the + bounding box. + bbox_first_token_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to indicate the first token of each bounding box. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + initial_token_labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for the initial token classification. + subsequent_token_labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for the subsequent token classification. 
+ + Examples: + + ```python + >>> import torch + >>> from transformers import BrosProcessor, BrosSpadeEEForTokenClassification + + >>> processor = BrosProcessor.from_pretrained("jinho8345/bros-base-uncased") + + >>> model = BrosSpadeEEForTokenClassification.from_pretrained("jinho8345/bros-base-uncased") + + >>> encoding = processor("Hello, my dog is cute", add_special_tokens=False, return_tensors="pt") + >>> bbox = torch.tensor([[[0, 0, 1, 1]]]).repeat(1, encoding["input_ids"].shape[-1], 1) + >>> encoding["bbox"] = bbox + + >>> outputs = model(**encoding) + ```""" + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bros( + input_ids=input_ids, + bbox=bbox, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + ) + + last_hidden_states = outputs[0] + last_hidden_states = last_hidden_states.transpose(0, 1).contiguous() + initial_token_logits = self.initial_token_classifier(last_hidden_states).transpose(0, 1).contiguous() + subsequent_token_logits = self.subsequent_token_classifier(last_hidden_states, last_hidden_states).squeeze(0) + + # make subsequent token (sequence token classification) mask + inv_attention_mask = 1 - attention_mask + batch_size, max_seq_length = inv_attention_mask.shape + device = inv_attention_mask.device + invalid_token_mask = torch.cat([inv_attention_mask, torch.zeros([batch_size, 1]).to(device)], axis=1).bool() + subsequent_token_logits = subsequent_token_logits.masked_fill( + invalid_token_mask[:, None, :], torch.finfo(subsequent_token_logits.dtype).min + ) + self_token_mask = torch.eye(max_seq_length, max_seq_length + 1).to(device=device, dtype=torch.bool) + subsequent_token_logits = subsequent_token_logits.masked_fill( + self_token_mask[None, :, :], torch.finfo(subsequent_token_logits.dtype).min + ) + subsequent_token_mask = attention_mask.view(-1).bool() + + loss = None + if initial_token_labels is not None and subsequent_token_labels is not None: + loss_fct = CrossEntropyLoss() + + # get initial token loss + initial_token_labels = initial_token_labels.view(-1) + if bbox_first_token_mask is not None: + bbox_first_token_mask = bbox_first_token_mask.view(-1) + initial_token_loss = loss_fct( + initial_token_logits.view(-1, self.num_labels)[bbox_first_token_mask], + initial_token_labels[bbox_first_token_mask], + ) + else: + initial_token_loss = loss_fct(initial_token_logits.view(-1, self.num_labels), initial_token_labels) + + subsequent_token_labels = subsequent_token_labels.view(-1) + subsequent_token_loss = loss_fct( + subsequent_token_logits.view(-1, max_seq_length + 1)[subsequent_token_mask], + subsequent_token_labels[subsequent_token_mask], + ) + + loss = initial_token_loss + subsequent_token_loss + + return BrosSpadeOutput( + loss=loss, + initial_token_logits=initial_token_logits, + subsequent_token_logits=subsequent_token_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@auto_docstring( + custom_intro=""" + Bros Model with a token classification head on top (a entity_linker layer on top of the hidden-states output) e.g. + for Entity-Linking. The entity_linker is used to predict intra-entity links (one entity to another entity). 
+ """ +) +class BrosSpadeELForTokenClassification(BrosPreTrainedModel): + _keys_to_ignore_on_load_unexpected = [r"pooler"] + + def __init__(self, config): + super().__init__(config) + self.config = config + self.num_labels = config.num_labels + self.n_relations = config.n_relations + self.backbone_hidden_size = config.hidden_size + + self.bros = BrosModel(config) + (config.classifier_dropout if hasattr(config, "classifier_dropout") else config.hidden_dropout_prob) + + self.entity_linker = BrosRelationExtractor(config) + + self.init_weights() + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + bbox: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + bbox_first_token_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], TokenClassifierOutput]: + r""" + bbox ('torch.FloatTensor' of shape '(batch_size, num_boxes, 4)'): + Bounding box coordinates for each token in the input sequence. Each bounding box is a list of four values + (x1, y1, x2, y2), where (x1, y1) is the top left corner, and (x2, y2) is the bottom right corner of the + bounding box. + bbox_first_token_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to indicate the first token of each bounding box. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. 
+ + Examples: + + ```python + >>> import torch + >>> from transformers import BrosProcessor, BrosSpadeELForTokenClassification + + >>> processor = BrosProcessor.from_pretrained("jinho8345/bros-base-uncased") + + >>> model = BrosSpadeELForTokenClassification.from_pretrained("jinho8345/bros-base-uncased") + + >>> encoding = processor("Hello, my dog is cute", add_special_tokens=False, return_tensors="pt") + >>> bbox = torch.tensor([[[0, 0, 1, 1]]]).repeat(1, encoding["input_ids"].shape[-1], 1) + >>> encoding["bbox"] = bbox + + >>> outputs = model(**encoding) + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bros( + input_ids=input_ids, + bbox=bbox, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + ) + + last_hidden_states = outputs[0] + last_hidden_states = last_hidden_states.transpose(0, 1).contiguous() + + logits = self.entity_linker(last_hidden_states, last_hidden_states).squeeze(0) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + + batch_size, max_seq_length = attention_mask.shape + device = attention_mask.device + + self_token_mask = torch.eye(max_seq_length, max_seq_length + 1).to(device=device, dtype=torch.bool) + + mask = bbox_first_token_mask.view(-1) + bbox_first_token_mask = torch.cat( + [ + ~bbox_first_token_mask, + torch.zeros([batch_size, 1], dtype=torch.bool, device=device), + ], + axis=1, + ) + logits = logits.masked_fill(bbox_first_token_mask[:, None, :], torch.finfo(logits.dtype).min) + logits = logits.masked_fill(self_token_mask[None, :, :], torch.finfo(logits.dtype).min) + + loss = loss_fct(logits.view(-1, max_seq_length + 1)[mask], labels.view(-1)[mask]) + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +__all__ = [ + "BrosPreTrainedModel", + "BrosModel", + "BrosForTokenClassification", + "BrosSpadeEEForTokenClassification", + "BrosSpadeELForTokenClassification", +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bros/processing_bros.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bros/processing_bros.py new file mode 100644 index 0000000000000000000000000000000000000000..8de0a1c49b0d0adcb3650bb8fba55582d4fa9588 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/bros/processing_bros.py @@ -0,0 +1,60 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Processor class for Bros. 
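+It wraps a BERT tokenizer (slow or fast) and exposes its tokenization interface for preparing BROS model inputs.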
+""" + +from ...processing_utils import ProcessingKwargs, ProcessorMixin + + +class BrosProcessorKwargs(ProcessingKwargs, total=False): + _defaults = { + "text_kwargs": { + "add_special_tokens": True, + "padding": False, + "stride": 0, + "return_overflowing_tokens": False, + "return_special_tokens_mask": False, + "return_offsets_mapping": False, + "return_length": False, + "verbose": True, + }, + } + + +class BrosProcessor(ProcessorMixin): + r""" + Constructs a Bros processor which wraps a BERT tokenizer. + + [`BrosProcessor`] offers all the functionalities of [`BertTokenizerFast`]. See the docstring of + [`~BrosProcessor.__call__`] and [`~BrosProcessor.decode`] for more information. + + Args: + tokenizer (`BertTokenizerFast`, *optional*): + An instance of ['BertTokenizerFast`]. The tokenizer is a required input. + """ + + attributes = ["tokenizer"] + tokenizer_class = ("BertTokenizer", "BertTokenizerFast") + valid_processor_kwargs = BrosProcessorKwargs + + def __init__(self, tokenizer=None, **kwargs): + if tokenizer is None: + raise ValueError("You need to specify a `tokenizer`.") + + super().__init__(tokenizer) + + +__all__ = ["BrosProcessor"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/camembert/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/camembert/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9d90f64de97f78ccaf1592c3c5cad40a9b2d5dcb --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/camembert/__init__.py @@ -0,0 +1,30 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_camembert import * + from .modeling_camembert import * + from .modeling_tf_camembert import * + from .tokenization_camembert import * + from .tokenization_camembert_fast import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/camembert/configuration_camembert.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/camembert/configuration_camembert.py new file mode 100644 index 0000000000000000000000000000000000000000..3979e54874439aa41feb6abe3815df5c7a997419 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/camembert/configuration_camembert.py @@ -0,0 +1,155 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""CamemBERT configuration""" + +from collections import OrderedDict +from collections.abc import Mapping + +from ...configuration_utils import PretrainedConfig +from ...onnx import OnnxConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class CamembertConfig(PretrainedConfig): + """ + This is the configuration class to store the configuration of a [`CamembertModel`] or a [`TFCamembertModel`]. It is + used to instantiate a Camembert model according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar configuration to that of the Camembert + [almanach/camembert-base](https://huggingface.co/almanach/camembert-base) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`CamembertModel`] or [`TFCamembertModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling [`CamembertModel`] or [`TFCamembertModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For + positional embeddings use `"absolute"`. 
For more information on `"relative_key"`, please refer to + [Self-Attention with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155). + For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models + with Better Relative Position Embeddings (Huang et al.)](https://huggingface.co/papers/2009.13658). + is_decoder (`bool`, *optional*, defaults to `False`): + Whether the model is used as a decoder or not. If `False`, the model is used as an encoder. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + classifier_dropout (`float`, *optional*): + The dropout ratio for the classification head. + + Example: + + ```python + >>> from transformers import CamembertConfig, CamembertModel + + >>> # Initializing a Camembert almanach/camembert-base style configuration + >>> configuration = CamembertConfig() + + >>> # Initializing a model (with random weights) from the almanach/camembert-base style configuration + >>> model = CamembertModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "camembert" + + def __init__( + self, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + position_embedding_type="absolute", + use_cache=True, + classifier_dropout=None, + **kwargs, + ): + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.position_embedding_type = position_embedding_type + self.use_cache = use_cache + self.classifier_dropout = classifier_dropout + + +class CamembertOnnxConfig(OnnxConfig): + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + if self.task == "multiple-choice": + dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"} + else: + dynamic_axis = {0: "batch", 1: "sequence"} + return OrderedDict( + [ + ("input_ids", dynamic_axis), + ("attention_mask", dynamic_axis), + ] + ) + + +__all__ = ["CamembertConfig", "CamembertOnnxConfig"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/camembert/modeling_camembert.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/camembert/modeling_camembert.py new file mode 100644 index 0000000000000000000000000000000000000000..3a07402f739a603ad3b8a50a197b7784bfc1d2a7 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/camembert/modeling_camembert.py @@ -0,0 +1,1576 @@ +# coding=utf-8 +# Copyright 2019 Inria, Facebook AI Research and the HuggingFace Inc. team. 
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch CamemBERT model.""" + +import math +from typing import Optional, Union + +import torch +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ...activations import ACT2FN, gelu +from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache +from ...generation import GenerationMixin +from ...modeling_attn_mask_utils import _prepare_4d_attention_mask_for_sdpa, _prepare_4d_causal_attention_mask_for_sdpa +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + MaskedLMOutput, + MultipleChoiceModelOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import auto_docstring, logging +from ...utils.deprecation import deprecate_kwarg +from .configuration_camembert import CamembertConfig + + +logger = logging.get_logger(__name__) + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaEmbeddings with Roberta->Camembert +class CamembertEmbeddings(nn.Module): + """ + Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. 
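+    Real tokens get position ids starting at padding_idx + 1, while padded positions keep padding_idx,
+    which is also the padding_idx passed to the position embedding table.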
+ """ + + # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.__init__ + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) + self.register_buffer( + "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False + ) + + # End copy + self.padding_idx = config.pad_token_id + self.position_embeddings = nn.Embedding( + config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx + ) + + def forward( + self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0 + ): + if position_ids is None: + if input_ids is not None: + # Create the position ids from the input token ids. Any padded tokens remain padded. + position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length) + else: + position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds) + + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs + # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves + # issue #5664 + if token_type_ids is None: + if hasattr(self, "token_type_ids"): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + def create_position_ids_from_inputs_embeds(self, inputs_embeds): + """ + We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids. 
+ + Args: + inputs_embeds: torch.Tensor + + Returns: torch.Tensor + """ + input_shape = inputs_embeds.size()[:-1] + sequence_length = input_shape[1] + + position_ids = torch.arange( + self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device + ) + return position_ids.unsqueeze(0).expand(input_shape) + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaSelfAttention with Roberta->Camembert +class CamembertSelfAttention(nn.Module): + def __init__(self, config, position_embedding_type=None, layer_idx=None): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = position_embedding_type or getattr( + config, "position_embedding_type", "absolute" + ) + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + + self.is_decoder = config.is_decoder + self.layer_idx = layer_idx + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + cache_position: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor]: + batch_size, seq_length, _ = hidden_states.shape + query_layer = self.query(hidden_states) + query_layer = query_layer.view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose( + 1, 2 + ) + + is_updated = False + is_cross_attention = encoder_hidden_states is not None + if past_key_values is not None: + if isinstance(past_key_values, EncoderDecoderCache): + is_updated = past_key_values.is_updated.get(self.layer_idx) + if is_cross_attention: + # after the first generated id, we can subsequently re-use all key/value_layer from cache + curr_past_key_value = past_key_values.cross_attention_cache + else: + curr_past_key_value = past_key_values.self_attention_cache + else: + curr_past_key_value = past_key_values + + current_states = encoder_hidden_states if is_cross_attention else hidden_states + if is_cross_attention and past_key_values is not None and is_updated: + # reuse k,v, cross_attentions + key_layer = curr_past_key_value.layers[self.layer_idx].keys + value_layer = curr_past_key_value.layers[self.layer_idx].values + else: + key_layer = self.key(current_states) + key_layer = key_layer.view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose( + 1, 2 + ) + value_layer = self.value(current_states) + value_layer = 
value_layer.view( + batch_size, -1, self.num_attention_heads, self.attention_head_size + ).transpose(1, 2) + + if past_key_values is not None: + # save all key/value_layer to cache to be re-used for fast auto-regressive generation + cache_position = cache_position if not is_cross_attention else None + key_layer, value_layer = curr_past_key_value.update( + key_layer, value_layer, self.layer_idx, {"cache_position": cache_position} + ) + # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls + if is_cross_attention and isinstance(past_key_values, EncoderDecoderCache): + past_key_values.is_updated[self.layer_idx] = True + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + query_length, key_length = query_layer.shape[2], key_layer.shape[2] + if past_key_values is not None: + position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view( + -1, 1 + ) + else: + position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in CamembertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
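+ # Shape sketch (illustrative): with batch b, heads h, query length q and key length k, attention_probs
+ # is (b, h, q, k) and value_layer is (b, h, k, head_dim), so the matmul below yields a context tensor
+ # of shape (b, h, q, head_dim).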
+ attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + return context_layer, attention_probs + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaSdpaSelfAttention with Roberta->Camembert +class CamembertSdpaSelfAttention(CamembertSelfAttention): + def __init__(self, config, position_embedding_type=None, layer_idx=None): + super().__init__(config, position_embedding_type=position_embedding_type, layer_idx=layer_idx) + self.dropout_prob = config.attention_probs_dropout_prob + + # Adapted from CamembertSelfAttention + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + cache_position: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor]: + if self.position_embedding_type != "absolute" or output_attentions or head_mask is not None: + # TODO: Improve this warning with e.g. `model.config._attn_implementation = "manual"` once implemented. + logger.warning_once( + "CamembertSdpaSelfAttention is used but `torch.nn.functional.scaled_dot_product_attention` does not support " + "non-absolute `position_embedding_type` or `output_attentions=True` or `head_mask`. Falling back to " + "the manual attention implementation, but specifying the manual implementation will be required from " + "Transformers version v5.0.0 onwards. This warning can be removed using the argument " + '`attn_implementation="eager"` when loading the model.' 
+ ) + return super().forward( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + past_key_values, + output_attentions, + cache_position, + ) + + bsz, tgt_len, _ = hidden_states.size() + + query_layer = ( + self.query(hidden_states).view(bsz, -1, self.num_attention_heads, self.attention_head_size).transpose(1, 2) + ) + + is_updated = False + is_cross_attention = encoder_hidden_states is not None + current_states = encoder_hidden_states if is_cross_attention else hidden_states + if past_key_values is not None: + if isinstance(past_key_values, EncoderDecoderCache): + is_updated = past_key_values.is_updated.get(self.layer_idx) + if is_cross_attention: + # after the first generated id, we can subsequently re-use all key/value_states from cache + curr_past_key_value = past_key_values.cross_attention_cache + else: + curr_past_key_value = past_key_values.self_attention_cache + else: + curr_past_key_value = past_key_values + + current_states = encoder_hidden_states if is_cross_attention else hidden_states + if is_cross_attention and past_key_values is not None and is_updated: + # reuse k,v, cross_attentions + key_layer = curr_past_key_value.layers[self.layer_idx].keys + value_layer = curr_past_key_value.layers[self.layer_idx].values + else: + key_layer = ( + self.key(current_states) + .view(bsz, -1, self.num_attention_heads, self.attention_head_size) + .transpose(1, 2) + ) + value_layer = ( + self.value(current_states) + .view(bsz, -1, self.num_attention_heads, self.attention_head_size) + .transpose(1, 2) + ) + + if past_key_values is not None: + # save all key/value_layer to cache to be re-used for fast auto-regressive generation + cache_position = cache_position if not is_cross_attention else None + key_layer, value_layer = curr_past_key_value.update( + key_layer, value_layer, self.layer_idx, {"cache_position": cache_position} + ) + # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls + if is_cross_attention and isinstance(past_key_values, EncoderDecoderCache): + past_key_values.is_updated[self.layer_idx] = True + + # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment + # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. + # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create + # a causal mask in case tgt_len == 1. 
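+ # Illustrative note: with a populated cache only one new token is processed (tgt_len == 1) and a single
+ # query attending to all past keys needs no causal mask, so `is_causal` is only True on the initial
+ # multi-token pass of a decoder when no explicit attention_mask is supplied.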
+ is_causal = self.is_decoder and not is_cross_attention and attention_mask is None and tgt_len > 1 + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_layer, + key_layer, + value_layer, + attn_mask=attention_mask, + dropout_p=self.dropout_prob if self.training else 0.0, + is_causal=is_causal, + ) + + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, tgt_len, self.all_head_size) + + return attn_output, None + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaSelfOutput with Roberta->Camembert +class CamembertSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +CAMEMBERT_SELF_ATTENTION_CLASSES = { + "eager": CamembertSelfAttention, + "sdpa": CamembertSdpaSelfAttention, +} + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaAttention with Roberta->Camembert,ROBERTA->CAMEMBERT +class CamembertAttention(nn.Module): + def __init__(self, config, position_embedding_type=None, layer_idx=None): + super().__init__() + self.self = CAMEMBERT_SELF_ATTENTION_CLASSES[config._attn_implementation]( + config, + position_embedding_type=position_embedding_type, + layer_idx=layer_idx, + ) + self.output = CamembertSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + cache_position: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor]: + self_outputs = self.self( + hidden_states, + attention_mask=attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + past_key_values=past_key_values, + output_attentions=output_attentions, + cache_position=cache_position, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->Roberta->Camembert +class CamembertIntermediate(nn.Module): + 
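# Illustrative note: this is the first half of the position-wise feed-forward block, projecting
+ # hidden_size -> intermediate_size and applying the configured activation; CamembertOutput below
+ # projects back to hidden_size and adds dropout, the residual connection and LayerNorm. +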
def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->Roberta->Camembert +class CamembertOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaLayer with Roberta->Camembert +class CamembertLayer(GradientCheckpointingLayer): + def __init__(self, config, layer_idx=None): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = CamembertAttention(config, layer_idx=layer_idx) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise ValueError(f"{self} should be used as a decoder model if cross attention is added") + self.crossattention = CamembertAttention(config, position_embedding_type="absolute", layer_idx=layer_idx) + self.intermediate = CamembertIntermediate(config) + self.output = CamembertOutput(config) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + cache_position: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor]: + self_attention_outputs = self.attention( + hidden_states, + attention_mask=attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + past_key_values=past_key_values, + cache_position=cache_position, + ) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers" + " by setting `config.add_cross_attention=True`" + ) + + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask=encoder_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + past_key_values=past_key_values, + output_attentions=output_attentions, + cache_position=cache_position, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:] # add cross attentions if we output attention 
weights + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + outputs = (layer_output,) + outputs + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaEncoder with Roberta->Camembert +class CamembertEncoder(nn.Module): + def __init__(self, config, layer_idx=None): + super().__init__() + self.config = config + self.layer = nn.ModuleList([CamembertLayer(config, layer_idx=i) for i in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Cache] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + return_dict: Optional[bool] = True, + cache_position: Optional[torch.Tensor] = None, + ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + if use_cache and self.config.is_decoder and past_key_values is None: + past_key_values = EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config)) + + if use_cache and self.config.is_decoder and isinstance(past_key_values, tuple): + logger.warning_once( + "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. " + "You should pass an instance of `EncoderDecoderCache` instead, e.g. " + "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`." 
+ ) + past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values) + + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, # as a positional argument for gradient checkpointing + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + output_attentions=output_attentions, + cache_position=cache_position, + ) + + hidden_states = layer_outputs[0] + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + past_key_values, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=past_key_values, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +# Copied from transformers.models.bert.modeling_bert.BertPooler +class CamembertPooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +@auto_docstring +class CamembertPreTrainedModel(PreTrainedModel): + config: CamembertConfig + base_model_prefix = "roberta" + supports_gradient_checkpointing = True + _supports_sdpa = True + + # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights with BertLMPredictionHead->CamembertLMHead + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, CamembertLMHead): + module.bias.data.zero_() + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaClassificationHead with Roberta->Camembert +class CamembertClassificationHead(nn.Module): + """Head for sentence-level classification tasks.""" + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.out_proj = 
nn.Linear(config.hidden_size, config.num_labels) + + def forward(self, features, **kwargs): + x = features[:, 0, :] # take token (equiv. to [CLS]) + x = self.dropout(x) + x = self.dense(x) + x = torch.tanh(x) + x = self.dropout(x) + x = self.out_proj(x) + return x + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaLMHead with Roberta->Camembert +class CamembertLMHead(nn.Module): + """Camembert Head for masked language modeling.""" + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + self.decoder = nn.Linear(config.hidden_size, config.vocab_size) + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + self.decoder.bias = self.bias + + def forward(self, features, **kwargs): + x = self.dense(features) + x = gelu(x) + x = self.layer_norm(x) + + # project back to size of vocabulary with bias + x = self.decoder(x) + + return x + + def _tie_weights(self): + # To tie those two weights if they get disconnected (on TPU or when the bias is resized) + # For accelerate compatibility and to not break backward compatibility + if self.decoder.bias.device.type == "meta": + self.decoder.bias = self.bias + else: + self.bias = self.decoder.bias + + +@auto_docstring +class CamembertModel(CamembertPreTrainedModel): + """ + + The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of + cross-attention is added between the self-attention layers, following the architecture described in *Attention is + all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz + Kaiser and Illia Polosukhin. + + To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration set to + `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and + `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass. + + .. _*Attention is all you need*: https://huggingface.co/papers/1706.03762 + + """ + + _no_split_modules = [] + + # Copied from transformers.models.roberta.modeling_roberta.RobertaModel.__init__ with Roberta->Camembert + def __init__(self, config, add_pooling_layer=True): + r""" + add_pooling_layer (bool, *optional*, defaults to `True`): + Whether to add a pooling layer + """ + super().__init__(config) + self.config = config + + self.embeddings = CamembertEmbeddings(config) + self.encoder = CamembertEncoder(config) + + self.pooler = CamembertPooler(config) if add_pooling_layer else None + + self.attn_implementation = config._attn_implementation + self.position_embedding_type = config.position_embedding_type + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. 
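For example (an illustrative note), passing `{1: [0, 2]}` prunes heads 0 and 2 of layer 1.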
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @auto_docstring + # Copied from transformers.models.roberta.modeling_roberta.RobertaModel.forward + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.Tensor] = None, + ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + past_key_values_length = 0 + if past_key_values is not None: + past_key_values_length = ( + past_key_values[0][0].shape[-2] + if not isinstance(past_key_values, Cache) + else past_key_values.get_seq_length() + ) + + if token_type_ids is None: + if hasattr(self.embeddings, "token_type_ids"): + buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + + if attention_mask is None: + attention_mask = torch.ones((batch_size, seq_length + past_key_values_length), device=device) + + use_sdpa_attention_masks = ( + self.attn_implementation == "sdpa" + and self.position_embedding_type == "absolute" + and head_mask is None + and not output_attentions + ) + + # Expand the attention mask + if use_sdpa_attention_masks and attention_mask.dim() == 2: + # Expand the attention mask for SDPA. 
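+ # Illustrative note: zeros in the 2D padding mask become large negative additive biases in the 4D mask,
+ # so softmax assigns those positions ~0 probability; an all-ones mask may come back as None so SDPA can
+ # stay on its fused kernels.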
+ # [bsz, seq_len] -> [bsz, 1, seq_len, seq_len] + if self.config.is_decoder: + extended_attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, + input_shape, + embedding_output, + past_key_values_length, + ) + else: + extended_attention_mask = _prepare_4d_attention_mask_for_sdpa( + attention_mask, embedding_output.dtype, tgt_len=seq_length + ) + else: + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + + if use_sdpa_attention_masks and encoder_attention_mask.dim() == 2: + # Expand the attention mask for SDPA. + # [bsz, seq_len] -> [bsz, 1, seq_len, seq_len] + encoder_extended_attention_mask = _prepare_4d_attention_mask_for_sdpa( + encoder_attention_mask, embedding_output.dtype, tgt_len=seq_length + ) + else: + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +@auto_docstring +# Copied from transformers.models.roberta.modeling_roberta.RobertaForMaskedLM with Roberta->Camembert, ROBERTA->CAMEMBERT +class CamembertForMaskedLM(CamembertPreTrainedModel): + _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"] + + def __init__(self, config): + super().__init__(config) + + if config.is_decoder: + logger.warning( + "If you want to use `CamembertForMaskedLM` make sure `config.is_decoder=False` for " + "bi-directional self-attention." 
+ ) + + self.roberta = CamembertModel(config, add_pooling_layer=False) + self.lm_head = CamembertLMHead(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.lm_head.decoder + + def set_output_embeddings(self, new_embeddings): + self.lm_head.decoder = new_embeddings + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], MaskedLMOutput]: + r""" + token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value + >= 2. All the value in this tensor should be always < type_vocab_size. + + [What are token type IDs?](../glossary#token-type-ids) + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = outputs[0] + prediction_scores = self.lm_head(sequence_output) + + masked_lm_loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(prediction_scores.device) + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@auto_docstring( + custom_intro=""" + CamemBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the + pooled output) e.g. for GLUE tasks. 
+ """ +) +# Copied from transformers.models.roberta.modeling_roberta.RobertaForSequenceClassification with Roberta->Camembert, ROBERTA->CAMEMBERT +class CamembertForSequenceClassification(CamembertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + + self.roberta = CamembertModel(config, add_pooling_layer=False) + self.classifier = CamembertClassificationHead(config) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], SequenceClassifierOutput]: + r""" + token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value + >= 2. All the value in this tensor should be always < type_vocab_size. + + [What are token type IDs?](../glossary#token-type-ids) + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = outputs[0] + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@auto_docstring +# Copied from transformers.models.roberta.modeling_roberta.RobertaForMultipleChoice with Roberta->Camembert, ROBERTA->CAMEMBERT +class CamembertForMultipleChoice(CamembertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.roberta = CamembertModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, 1) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], MultipleChoiceModelOutput]: + r""" + input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value + >= 2. 
All the value in this tensor should be always < type_vocab_size. + + [What are token type IDs?](../glossary#token-type-ids) + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., + num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See + `input_ids` above) + position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] + + flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None + flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None + flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + flat_inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) + if inputs_embeds is not None + else None + ) + + outputs = self.roberta( + flat_input_ids, + position_ids=flat_position_ids, + token_type_ids=flat_token_type_ids, + attention_mask=flat_attention_mask, + head_mask=head_mask, + inputs_embeds=flat_inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.view(-1, num_choices) + + loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(reshaped_logits.device) + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + + if not return_dict: + output = (reshaped_logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@auto_docstring +# Copied from transformers.models.roberta.modeling_roberta.RobertaForTokenClassification with Roberta->Camembert, ROBERTA->CAMEMBERT +class CamembertForTokenClassification(CamembertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.roberta = CamembertModel(config, add_pooling_layer=False) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + 
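+ # Illustrative note: forward() below applies dropout and a shared linear layer to every token
+ # representation, so `logits` has shape (batch_size, sequence_length, num_labels); token-level labels of
+ # shape (batch_size, sequence_length) are scored with CrossEntropyLoss, which ignores positions labelled -100.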
+ @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], TokenClassifierOutput]: + r""" + token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value + >= 2. All the value in this tensor should be always < type_vocab_size. + + [What are token type IDs?](../glossary#token-type-ids) + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(logits.device) + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@auto_docstring +# Copied from transformers.models.roberta.modeling_roberta.RobertaForQuestionAnswering with Roberta->Camembert, ROBERTA->CAMEMBERT +class CamembertForQuestionAnswering(CamembertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.roberta = CamembertModel(config, add_pooling_layer=False) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + start_positions: Optional[torch.LongTensor] = None, + end_positions: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], QuestionAnsweringModelOutput]: + r""" + token_type_ids (`torch.LongTensor` 
of shape `(batch_size, sequence_length)`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value + >= 2. All the value in this tensor should be always < type_vocab_size. + + [What are token type IDs?](../glossary#token-type-ids) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@auto_docstring( + custom_intro=""" + CamemBERT Model with a `language modeling` head on top for CLM fine-tuning. 
+ """ +) +# Copied from transformers.models.roberta.modeling_roberta.RobertaForCausalLM with Roberta->Camembert, ROBERTA->CAMEMBERT, FacebookAI/roberta-base->almanach/camembert-base +class CamembertForCausalLM(CamembertPreTrainedModel, GenerationMixin): + _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"] + + def __init__(self, config): + super().__init__(config) + + if not config.is_decoder: + logger.warning("If you want to use `CamembertLMHeadModel` as a standalone, add `is_decoder=True.`") + + self.roberta = CamembertModel(config, add_pooling_layer=False) + self.lm_head = CamembertLMHead(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.lm_head.decoder + + def set_output_embeddings(self, new_embeddings): + self.lm_head.decoder = new_embeddings + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ) -> Union[tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: + r""" + token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value + >= 2. All the value in this tensor should be always < type_vocab_size. + + [What are token type IDs?](../glossary#token-type-ids) + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the left-to-right language modeling loss (next word prediction). 
Indices should be in + `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are + ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + + Example: + + ```python + >>> from transformers import AutoTokenizer, CamembertForCausalLM, AutoConfig + >>> import torch + + >>> tokenizer = AutoTokenizer.from_pretrained("almanach/camembert-base") + >>> config = AutoConfig.from_pretrained("almanach/camembert-base") + >>> config.is_decoder = True + >>> model = CamembertForCausalLM.from_pretrained("almanach/camembert-base", config=config) + + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> prediction_logits = outputs.logits + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.lm_head(sequence_output) + + lm_loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(prediction_scores.device) + lm_loss = self.loss_function( + prediction_scores, + labels, + vocab_size=self.config.vocab_size, + **kwargs, + ) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((lm_loss,) + output) if lm_loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=lm_loss, + logits=prediction_scores, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + +# Copied from transformers.models.roberta.modeling_roberta.create_position_ids_from_input_ids +def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0): + """ + Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols + are ignored. This is modified from fairseq's `utils.make_positions`. + + Args: + x: torch.Tensor x: + + Returns: torch.Tensor + """ + # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA. 
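+ # Worked example (illustrative): with padding_idx=1 and input_ids=[[12, 25, 1, 1]], mask = [[1, 1, 0, 0]],
+ # cumsum(mask) * mask = [[1, 2, 0, 0]], and the returned positions are [[2, 3, 1, 1]]: real tokens count up
+ # from padding_idx + 1 while padding tokens stay at padding_idx.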
+ mask = input_ids.ne(padding_idx).int() + incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask + return incremental_indices.long() + padding_idx + + +__all__ = [ + "CamembertForCausalLM", + "CamembertForMaskedLM", + "CamembertForMultipleChoice", + "CamembertForQuestionAnswering", + "CamembertForSequenceClassification", + "CamembertForTokenClassification", + "CamembertModel", + "CamembertPreTrainedModel", +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/camembert/modeling_tf_camembert.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/camembert/modeling_tf_camembert.py new file mode 100644 index 0000000000000000000000000000000000000000..0869902aa96245ead1bb22f2e818517288a35534 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/camembert/modeling_tf_camembert.py @@ -0,0 +1,1800 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""TF 2.0 CamemBERT model.""" + +from __future__ import annotations + +import math +import warnings + +import numpy as np +import tensorflow as tf + +from ...activations_tf import get_tf_activation +from ...modeling_tf_outputs import ( + TFBaseModelOutputWithPastAndCrossAttentions, + TFBaseModelOutputWithPoolingAndCrossAttentions, + TFCausalLMOutputWithCrossAttentions, + TFMaskedLMOutput, + TFMultipleChoiceModelOutput, + TFQuestionAnsweringModelOutput, + TFSequenceClassifierOutput, + TFTokenClassifierOutput, +) +from ...modeling_tf_utils import ( + TFCausalLanguageModelingLoss, + TFMaskedLanguageModelingLoss, + TFModelInputType, + TFMultipleChoiceLoss, + TFPreTrainedModel, + TFQuestionAnsweringLoss, + TFSequenceClassificationLoss, + TFTokenClassificationLoss, + get_initializer, + keras, + keras_serializable, + unpack_inputs, +) +from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax +from ...utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, +) +from .configuration_camembert import CamembertConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "almanach/camembert-base" +_CONFIG_FOR_DOC = "CamembertConfig" + + +CAMEMBERT_START_DOCSTRING = r""" + + This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and + behavior. 
+ + + + TensorFlow models and layers in `transformers` accept two formats as input: + + - having all inputs as keyword arguments (like PyTorch models), or + - having all inputs as a list, tuple or dict in the first positional argument. + + The reason the second format is supported is that Keras methods prefer this format when passing inputs to models + and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just + pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second + format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with + the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first + positional argument: + + - a single Tensor with `input_ids` only and nothing else: `model(input_ids)` + - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: + `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])` + - a dictionary with one or several input Tensors associated to the input names given in the docstring: + `model({"input_ids": input_ids, "token_type_ids": token_type_ids})` + + Note that when creating models and layers with + [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry + about any of this, as you can just pass inputs like you would to any other Python function! + + + + Parameters: + config ([`CamembertConfig`]): Model configuration class with all the parameters of the + model. Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +CAMEMBERT_INPUTS_DOCSTRING = r""" + Args: + input_ids (`Numpy array` or `tf.Tensor` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and + [`PreTrainedTokenizer.encode`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + token_type_ids (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + head_mask (`Numpy array` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. 
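To make the three accepted input formats concrete, a hedged usage sketch (it assumes the `almanach/camembert-base` checkpoint is reachable with TF-compatible weights; the French sentence is arbitrary):

```python
from transformers import AutoTokenizer, TFCamembertModel

tokenizer = AutoTokenizer.from_pretrained("almanach/camembert-base")
model = TFCamembertModel.from_pretrained("almanach/camembert-base")
enc = tokenizer("J'aime le camembert", return_tensors="tf")

out_kwargs = model(input_ids=enc["input_ids"], attention_mask=enc["attention_mask"])        # keyword arguments
out_list = model([enc["input_ids"], enc["attention_mask"]])                                 # list, in docstring order
out_dict = model({"input_ids": enc["input_ids"], "attention_mask": enc["attention_mask"]})  # dict keyed by input name
```

All three calls are equivalent and return the same kind of output object.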
+ + inputs_embeds (`tf.Tensor` of shape `({0}, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the + config will be used instead. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. This argument can be used only in eager mode, in graph mode the value in the config will be + used instead. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in + eager mode, in graph mode the value will always be set to True. + training (`bool`, *optional*, defaults to `False`): + Whether or not to use the model in training mode (some modules like dropout modules have different + behaviors between training and evaluation). +""" + + +# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaEmbeddings +class TFCamembertEmbeddings(keras.layers.Layer): + """ + Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. + """ + + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + + self.padding_idx = 1 + self.config = config + self.hidden_size = config.hidden_size + self.max_position_embeddings = config.max_position_embeddings + self.initializer_range = config.initializer_range + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) + + def build(self, input_shape=None): + with tf.name_scope("word_embeddings"): + self.weight = self.add_weight( + name="weight", + shape=[self.config.vocab_size, self.hidden_size], + initializer=get_initializer(self.initializer_range), + ) + + with tf.name_scope("token_type_embeddings"): + self.token_type_embeddings = self.add_weight( + name="embeddings", + shape=[self.config.type_vocab_size, self.hidden_size], + initializer=get_initializer(self.initializer_range), + ) + + with tf.name_scope("position_embeddings"): + self.position_embeddings = self.add_weight( + name="embeddings", + shape=[self.max_position_embeddings, self.hidden_size], + initializer=get_initializer(self.initializer_range), + ) + + if self.built: + return + self.built = True + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) + + def create_position_ids_from_input_ids(self, input_ids, past_key_values_length=0): + """ + Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding + symbols are ignored. This is modified from fairseq's `utils.make_positions`. 
+ + Args: + input_ids: tf.Tensor + Returns: tf.Tensor + """ + mask = tf.cast(tf.math.not_equal(input_ids, self.padding_idx), dtype=input_ids.dtype) + incremental_indices = (tf.math.cumsum(mask, axis=1) + past_key_values_length) * mask + + return incremental_indices + self.padding_idx + + def call( + self, + input_ids=None, + position_ids=None, + token_type_ids=None, + inputs_embeds=None, + past_key_values_length=0, + training=False, + ): + """ + Applies embedding based on inputs tensor. + + Returns: + final_embeddings (`tf.Tensor`): output embedding tensor. + """ + assert not (input_ids is None and inputs_embeds is None) + + if input_ids is not None: + check_embeddings_within_bounds(input_ids, self.config.vocab_size) + inputs_embeds = tf.gather(params=self.weight, indices=input_ids) + + input_shape = shape_list(inputs_embeds)[:-1] + + if token_type_ids is None: + token_type_ids = tf.fill(dims=input_shape, value=0) + + if position_ids is None: + if input_ids is not None: + # Create the position ids from the input token ids. Any padded tokens remain padded. + position_ids = self.create_position_ids_from_input_ids( + input_ids=input_ids, past_key_values_length=past_key_values_length + ) + else: + position_ids = tf.expand_dims( + tf.range(start=self.padding_idx + 1, limit=input_shape[-1] + self.padding_idx + 1), axis=0 + ) + + position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids) + token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids) + final_embeddings = inputs_embeds + position_embeds + token_type_embeds + final_embeddings = self.LayerNorm(inputs=final_embeddings) + final_embeddings = self.dropout(inputs=final_embeddings, training=training) + + return final_embeddings + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->Camembert +class TFCamembertPooler(keras.layers.Layer): + def __init__(self, config: CamembertConfig, **kwargs): + super().__init__(**kwargs) + + self.dense = keras.layers.Dense( + units=config.hidden_size, + kernel_initializer=get_initializer(config.initializer_range), + activation="tanh", + name="dense", + ) + self.config = config + + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. 
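A small shape sketch of this pooling step (toy sizes, illustrative only; the real layer uses the config's `hidden_size` and its own trained kernel):

```python
import tensorflow as tf

hidden_states = tf.random.normal((2, 6, 768))         # (batch_size, seq_len, hidden_size)
first_token_tensor = hidden_states[:, 0]              # (2, 768): hidden state of the first (<s>) token
dense = tf.keras.layers.Dense(768, activation="tanh")
pooled_output = dense(first_token_tensor)             # (2, 768)
```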
+ first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(inputs=first_token_tensor) + + return pooled_output + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->Camembert +class TFCamembertSelfAttention(keras.layers.Layer): + def __init__(self, config: CamembertConfig, **kwargs): + super().__init__(**kwargs) + + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number " + f"of attention heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.sqrt_att_head_size = math.sqrt(self.attention_head_size) + + self.query = keras.layers.Dense( + units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query" + ) + self.key = keras.layers.Dense( + units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key" + ) + self.value = keras.layers.Dense( + units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" + ) + self.dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob) + + self.is_decoder = config.is_decoder + self.config = config + + def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor: + # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] + tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size)) + + # Transpose the tensor from [batch_size, seq_length, num_attention_heads, attention_head_size] to [batch_size, num_attention_heads, seq_length, attention_head_size] + return tf.transpose(tensor, perm=[0, 2, 1, 3]) + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + encoder_hidden_states: tf.Tensor, + encoder_attention_mask: tf.Tensor, + past_key_value: tuple[tf.Tensor], + output_attentions: bool, + training: bool = False, + ) -> tuple[tf.Tensor]: + batch_size = shape_list(hidden_states)[0] + mixed_query_layer = self.query(inputs=hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. 
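The two reshaping comments in `transpose_for_scores` correspond to the following standalone shape sketch (toy sizes, illustrative only):

```python
import tensorflow as tf

batch_size, seq_len, num_heads, head_size = 2, 5, 12, 64
tensor = tf.random.normal((batch_size, seq_len, num_heads * head_size))   # (2, 5, 768)
tensor = tf.reshape(tensor, (batch_size, -1, num_heads, head_size))       # (2, 5, 12, 64)
tensor = tf.transpose(tensor, perm=[0, 2, 1, 3])                          # (2, 12, 5, 64)
```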
+ is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores(self.key(inputs=encoder_hidden_states), batch_size) + value_layer = self.transpose_for_scores(self.value(inputs=encoder_hidden_states), batch_size) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(inputs=hidden_states), batch_size) + value_layer = self.transpose_for_scores(self.value(inputs=hidden_states), batch_size) + key_layer = tf.concat([past_key_value[0], key_layer], axis=2) + value_layer = tf.concat([past_key_value[1], value_layer], axis=2) + else: + key_layer = self.transpose_for_scores(self.key(inputs=hidden_states), batch_size) + value_layer = self.transpose_for_scores(self.value(inputs=hidden_states), batch_size) + + query_layer = self.transpose_for_scores(mixed_query_layer, batch_size) + + if self.is_decoder: + # if cross_attention save Tuple(tf.Tensor, tf.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(tf.Tensor, tf.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + # (batch size, num_heads, seq_len_q, seq_len_k) + attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) + dk = tf.cast(self.sqrt_att_head_size, dtype=attention_scores.dtype) + attention_scores = tf.divide(attention_scores, dk) + + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in TFCamembertModel call() function) + attention_scores = tf.add(attention_scores, attention_mask) + + # Normalize the attention scores to probabilities. + attention_probs = stable_softmax(logits=attention_scores, axis=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
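The scoring path above is standard scaled dot-product attention; a self-contained numeric sketch with toy shapes (illustrative only, and using `tf.nn.softmax` where the layer uses `stable_softmax`):

```python
import tensorflow as tf

q = tf.random.normal((2, 12, 5, 64))    # (batch, num_heads, seq_len_q, head_size)
k = tf.random.normal((2, 12, 5, 64))
v = tf.random.normal((2, 12, 5, 64))

scores = tf.matmul(q, k, transpose_b=True) / tf.sqrt(64.0)   # (2, 12, 5, 5)
additive_mask = tf.zeros((2, 1, 1, 5))                        # 0 where attended, -10000.0 where masked
probs = tf.nn.softmax(scores + additive_mask, axis=-1)
context = tf.matmul(probs, v)                                 # (2, 12, 5, 64)
```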
+ attention_probs = self.dropout(inputs=attention_probs, training=training) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = tf.multiply(attention_probs, head_mask) + + attention_output = tf.matmul(attention_probs, value_layer) + attention_output = tf.transpose(attention_output, perm=[0, 2, 1, 3]) + + # (batch_size, seq_len_q, all_head_size) + attention_output = tf.reshape(tensor=attention_output, shape=(batch_size, -1, self.all_head_size)) + outputs = (attention_output, attention_probs) if output_attentions else (attention_output,) + + if self.is_decoder: + outputs = outputs + (past_key_value,) + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "query", None) is not None: + with tf.name_scope(self.query.name): + self.query.build([None, None, self.config.hidden_size]) + if getattr(self, "key", None) is not None: + with tf.name_scope(self.key.name): + self.key.build([None, None, self.config.hidden_size]) + if getattr(self, "value", None) is not None: + with tf.name_scope(self.value.name): + self.value.build([None, None, self.config.hidden_size]) + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->Camembert +class TFCamembertSelfOutput(keras.layers.Layer): + def __init__(self, config: CamembertConfig, **kwargs): + super().__init__(**kwargs) + + self.dense = keras.layers.Dense( + units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config + + def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.dropout(inputs=hidden_states, training=training) + hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) + + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->Camembert +class TFCamembertAttention(keras.layers.Layer): + def __init__(self, config: CamembertConfig, **kwargs): + super().__init__(**kwargs) + + self.self_attention = TFCamembertSelfAttention(config, name="self") + self.dense_output = TFCamembertSelfOutput(config, name="output") + + def prune_heads(self, heads): + raise NotImplementedError + + def call( + self, + input_tensor: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + encoder_hidden_states: tf.Tensor, + encoder_attention_mask: tf.Tensor, + past_key_value: tuple[tf.Tensor], + output_attentions: bool, + training: bool = False, + ) -> tuple[tf.Tensor]: + self_outputs = self.self_attention( + hidden_states=input_tensor, + attention_mask=attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_value=past_key_value, + output_attentions=output_attentions, + training=training, + ) + attention_output = self.dense_output( + 
hidden_states=self_outputs[0], input_tensor=input_tensor, training=training + ) + # add attentions (possibly with past_key_value) if we output them + outputs = (attention_output,) + self_outputs[1:] + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attention", None) is not None: + with tf.name_scope(self.self_attention.name): + self.self_attention.build(None) + if getattr(self, "dense_output", None) is not None: + with tf.name_scope(self.dense_output.name): + self.dense_output.build(None) + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->Camembert +class TFCamembertIntermediate(keras.layers.Layer): + def __init__(self, config: CamembertConfig, **kwargs): + super().__init__(**kwargs) + + self.dense = keras.layers.Dense( + units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = get_tf_activation(config.hidden_act) + else: + self.intermediate_act_fn = config.hidden_act + self.config = config + + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->Camembert +class TFCamembertOutput(keras.layers.Layer): + def __init__(self, config: CamembertConfig, **kwargs): + super().__init__(**kwargs) + + self.dense = keras.layers.Dense( + units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config + + def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.dropout(inputs=hidden_states, training=training) + hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) + + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.intermediate_size]) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->Camembert +class TFCamembertLayer(keras.layers.Layer): + def __init__(self, config: CamembertConfig, **kwargs): + super().__init__(**kwargs) + + self.attention = TFCamembertAttention(config, name="attention") + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise ValueError(f"{self} should be used as a decoder model if cross attention is added") + self.crossattention = TFCamembertAttention(config, name="crossattention") + self.intermediate = TFCamembertIntermediate(config, name="intermediate") + 
self.bert_output = TFCamembertOutput(config, name="output") + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + encoder_hidden_states: tf.Tensor | None, + encoder_attention_mask: tf.Tensor | None, + past_key_value: tuple[tf.Tensor] | None, + output_attentions: bool, + training: bool = False, + ) -> tuple[tf.Tensor]: + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + input_tensor=hidden_states, + attention_mask=attention_mask, + head_mask=head_mask, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=self_attn_past_key_value, + output_attentions=output_attentions, + training=training, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers" + " by setting `config.add_cross_attention=True`" + ) + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + cross_attention_outputs = self.crossattention( + input_tensor=attention_output, + attention_mask=attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_value=cross_attn_past_key_value, + output_attentions=output_attentions, + training=training, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + intermediate_output = self.intermediate(hidden_states=attention_output) + layer_output = self.bert_output( + hidden_states=intermediate_output, input_tensor=attention_output, training=training + ) + outputs = (layer_output,) + outputs # add attentions if we output them + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value,) + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "attention", None) is not None: + with tf.name_scope(self.attention.name): + self.attention.build(None) + if getattr(self, "intermediate", None) is not None: + with tf.name_scope(self.intermediate.name): + self.intermediate.build(None) + if getattr(self, "bert_output", None) is not None: + with tf.name_scope(self.bert_output.name): + self.bert_output.build(None) + if getattr(self, "crossattention", None) is not None: + with tf.name_scope(self.crossattention.name): + self.crossattention.build(None) + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->Camembert +class TFCamembertEncoder(keras.layers.Layer): + def __init__(self, 
config: CamembertConfig, **kwargs): + super().__init__(**kwargs) + self.config = config + self.layer = [TFCamembertLayer(config, name=f"layer_._{i}") for i in range(config.num_hidden_layers)] + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + encoder_hidden_states: tf.Tensor | None, + encoder_attention_mask: tf.Tensor | None, + past_key_values: tuple[tuple[tf.Tensor]] | None, + use_cache: bool | None, + output_attentions: bool, + output_hidden_states: bool, + return_dict: bool, + training: bool = False, + ) -> TFBaseModelOutputWithPastAndCrossAttentions | tuple[tf.Tensor]: + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + past_key_value = past_key_values[i] if past_key_values is not None else None + + layer_outputs = layer_module( + hidden_states=hidden_states, + attention_mask=attention_mask, + head_mask=head_mask[i], + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_value=past_key_value, + output_attentions=output_attentions, + training=training, + ) + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + if self.config.add_cross_attention and encoder_hidden_states is not None: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + # Add last layer + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v for v in [hidden_states, all_hidden_states, all_attentions, all_cross_attentions] if v is not None + ) + + return TFBaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_attentions, + cross_attentions=all_cross_attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layer", None) is not None: + for layer in self.layer: + with tf.name_scope(layer.name): + layer.build(None) + + +@keras_serializable +# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaMainLayer with Roberta->Camembert +class TFCamembertMainLayer(keras.layers.Layer): + config_class = CamembertConfig + + def __init__(self, config, add_pooling_layer=True, **kwargs): + super().__init__(**kwargs) + + self.config = config + self.is_decoder = config.is_decoder + + self.num_hidden_layers = config.num_hidden_layers + self.initializer_range = config.initializer_range + self.output_attentions = config.output_attentions + self.output_hidden_states = config.output_hidden_states + self.return_dict = config.use_return_dict + self.encoder = TFCamembertEncoder(config, name="encoder") + self.pooler = TFCamembertPooler(config, name="pooler") if add_pooling_layer else None + # The embeddings must be the last declaration in order to follow the weights order + self.embeddings = TFCamembertEmbeddings(config, name="embeddings") + + # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.get_input_embeddings + def get_input_embeddings(self) -> keras.layers.Layer: + return 
self.embeddings + + # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.set_input_embeddings + def set_input_embeddings(self, value: tf.Variable): + self.embeddings.weight = value + self.embeddings.vocab_size = shape_list(value)[0] + + # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer._prune_heads + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + raise NotImplementedError + + @unpack_inputs + # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.call + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + encoder_hidden_states: np.ndarray | tf.Tensor | None = None, + encoder_attention_mask: np.ndarray | tf.Tensor | None = None, + past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None, + use_cache: bool | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool = False, + ) -> TFBaseModelOutputWithPoolingAndCrossAttentions | tuple[tf.Tensor]: + if not self.config.is_decoder: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = shape_list(input_ids) + elif inputs_embeds is not None: + input_shape = shape_list(inputs_embeds)[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + batch_size, seq_length = input_shape + + if past_key_values is None: + past_key_values_length = 0 + past_key_values = [None] * len(self.encoder.layer) + else: + past_key_values_length = shape_list(past_key_values[0][0])[-2] + + if attention_mask is None: + attention_mask = tf.fill(dims=(batch_size, seq_length + past_key_values_length), value=1) + + if token_type_ids is None: + token_type_ids = tf.fill(dims=input_shape, value=0) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + training=training, + ) + + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. 
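A compact sketch of the broadcast described in the comment above (illustrative values only): the encoder case turns a 2D padding mask into a 4D additive mask, and the decoder case additionally applies a lower-triangular causal mask.

```python
import tensorflow as tf

attention_mask = tf.constant([[1.0, 1.0, 1.0, 0.0]])     # (batch_size=1, seq_len=4), last position is padding
extended = tf.reshape(attention_mask, (1, 1, 1, 4))      # broadcastable over (batch, heads, from_seq, to_seq)
extended = (1.0 - extended) * -10000.0                    # 0 for kept positions, -10000 for padding

seq_ids = tf.range(4)
causal = tf.cast(seq_ids[None, None, :] <= seq_ids[None, :, None], tf.float32)   # (1, 4, 4) lower-triangular
decoder_mask = causal * attention_mask[:, None, :]        # causal mask combined with the padding mask
```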
+ attention_mask_shape = shape_list(attention_mask) + + mask_seq_length = seq_length + past_key_values_length + # Copied from `modeling_tf_t5.py` + # Provided a padding mask of dimensions [batch_size, mask_seq_length] + # - if the model is a decoder, apply a causal mask in addition to the padding mask + # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, mask_seq_length, mask_seq_length] + if self.is_decoder: + seq_ids = tf.range(mask_seq_length) + causal_mask = tf.less_equal( + tf.tile(seq_ids[None, None, :], (batch_size, mask_seq_length, 1)), + seq_ids[None, :, None], + ) + causal_mask = tf.cast(causal_mask, dtype=attention_mask.dtype) + extended_attention_mask = causal_mask * attention_mask[:, None, :] + attention_mask_shape = shape_list(extended_attention_mask) + extended_attention_mask = tf.reshape( + extended_attention_mask, (attention_mask_shape[0], 1, attention_mask_shape[1], attention_mask_shape[2]) + ) + if past_key_values[0] is not None: + # attention_mask needs to be sliced to the shape `[batch_size, 1, from_seq_length - cached_seq_length, to_seq_length] + extended_attention_mask = extended_attention_mask[:, :, -seq_length:, :] + else: + extended_attention_mask = tf.reshape( + attention_mask, (attention_mask_shape[0], 1, 1, attention_mask_shape[1]) + ) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + extended_attention_mask = tf.cast(extended_attention_mask, dtype=embedding_output.dtype) + one_cst = tf.constant(1.0, dtype=embedding_output.dtype) + ten_thousand_cst = tf.constant(-10000.0, dtype=embedding_output.dtype) + extended_attention_mask = tf.multiply(tf.subtract(one_cst, extended_attention_mask), ten_thousand_cst) + + # Copied from `modeling_tf_t5.py` with -1e9 -> -10000 + if self.is_decoder and encoder_attention_mask is not None: + # If a 2D ou 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, mask_seq_length, mask_seq_length] + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + encoder_attention_mask = tf.cast(encoder_attention_mask, dtype=extended_attention_mask.dtype) + num_dims_encoder_attention_mask = len(shape_list(encoder_attention_mask)) + if num_dims_encoder_attention_mask == 3: + encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :] + if num_dims_encoder_attention_mask == 2: + encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] + + # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition + # Cf. 
https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow/transformer/transformer_layers.py#L270 + # encoder_extended_attention_mask = tf.math.equal(encoder_extended_attention_mask, + # tf.transpose(encoder_extended_attention_mask, perm=(-1, -2))) + + encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -10000.0 + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + if head_mask is not None: + raise NotImplementedError + else: + head_mask = [None] * self.config.num_hidden_layers + + encoder_outputs = self.encoder( + hidden_states=embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(hidden_states=sequence_output) if self.pooler is not None else None + + if not return_dict: + return ( + sequence_output, + pooled_output, + ) + encoder_outputs[1:] + + return TFBaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "pooler", None) is not None: + with tf.name_scope(self.pooler.name): + self.pooler.build(None) + if getattr(self, "embeddings", None) is not None: + with tf.name_scope(self.embeddings.name): + self.embeddings.build(None) + + +class TFCamembertPreTrainedModel(TFPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = CamembertConfig + base_model_prefix = "roberta" + + +@add_start_docstrings( + "The bare CamemBERT Model transformer outputting raw hidden-states without any specific head on top.", + CAMEMBERT_START_DOCSTRING, +) +# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaModel with Roberta->Camembert, ROBERTA->CAMEMBERT +class TFCamembertModel(TFCamembertPreTrainedModel): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.roberta = TFCamembertMainLayer(config, name="roberta") + + @unpack_inputs + @add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFBaseModelOutputWithPoolingAndCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + encoder_hidden_states: np.ndarray | tf.Tensor | None = None, + encoder_attention_mask: np.ndarray | tf.Tensor | None = None, + past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None, + use_cache: bool | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool | None = False, + ) -> tuple | TFBaseModelOutputWithPoolingAndCrossAttentions: + r""" + encoder_hidden_states (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + past_key_values (`tuple[tuple[tf.Tensor]]` of length `config.n_layers`) + contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*, defaults to `True`): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). 
Set to `False` during training, `True` during generation + """ + outputs = self.roberta( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "roberta", None) is not None: + with tf.name_scope(self.roberta.name): + self.roberta.build(None) + + +# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaLMHead with Roberta->Camembert +class TFCamembertLMHead(keras.layers.Layer): + """Camembert Head for masked language modeling.""" + + def __init__(self, config, input_embeddings, **kwargs): + super().__init__(**kwargs) + + self.config = config + self.hidden_size = config.hidden_size + self.dense = keras.layers.Dense( + config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") + self.act = get_tf_activation("gelu") + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = input_embeddings + + def build(self, input_shape=None): + self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias") + + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + if getattr(self, "layer_norm", None) is not None: + with tf.name_scope(self.layer_norm.name): + self.layer_norm.build([None, None, self.config.hidden_size]) + + def get_output_embeddings(self): + return self.decoder + + def set_output_embeddings(self, value): + self.decoder.weight = value + self.decoder.vocab_size = shape_list(value)[0] + + def get_bias(self): + return {"bias": self.bias} + + def set_bias(self, value): + self.bias = value["bias"] + self.config.vocab_size = shape_list(value["bias"])[0] + + def call(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.layer_norm(hidden_states) + + # project back to size of vocabulary with bias + seq_length = shape_list(tensor=hidden_states)[1] + hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size]) + hidden_states = tf.matmul(a=hidden_states, b=self.decoder.weight, transpose_b=True) + hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size]) + hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias) + + return hidden_states + + +@add_start_docstrings( + """CamemBERT Model with a `language modeling` head on top.""", + CAMEMBERT_START_DOCSTRING, +) +# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaForMaskedLM with Roberta->Camembert, ROBERTA->CAMEMBERT +class TFCamembertForMaskedLM(TFCamembertPreTrainedModel, TFMaskedLanguageModelingLoss): + # names with a '.' 
represents the authorized unexpected/missing layers when a TF model is loaded from a PT model + _keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head.decoder.weight"] + + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.roberta = TFCamembertMainLayer(config, add_pooling_layer=False, name="roberta") + self.lm_head = TFCamembertLMHead(config, self.roberta.embeddings, name="lm_head") + + def get_lm_head(self): + return self.lm_head + + def get_prefix_bias_name(self): + warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning) + return self.name + "/" + self.lm_head.name + + @unpack_inputs + @add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFMaskedLMOutput, + config_class=_CONFIG_FOR_DOC, + mask="", + expected_output="' Paris'", + expected_loss=0.1, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + labels: np.ndarray | tf.Tensor | None = None, + training: bool | None = False, + ) -> TFMaskedLMOutput | tuple[tf.Tensor]: + r""" + labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + """ + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + sequence_output = outputs[0] + prediction_scores = self.lm_head(sequence_output) + + loss = None if labels is None else self.hf_compute_loss(labels, prediction_scores) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TFMaskedLMOutput( + loss=loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "roberta", None) is not None: + with tf.name_scope(self.roberta.name): + self.roberta.build(None) + if getattr(self, "lm_head", None) is not None: + with tf.name_scope(self.lm_head.name): + self.lm_head.build(None) + + +# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaClassificationHead +class TFCamembertClassificationHead(keras.layers.Layer): + """Head for sentence-level classification tasks.""" + + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + self.dense = keras.layers.Dense( + config.hidden_size, + kernel_initializer=get_initializer(config.initializer_range), + activation="tanh", + name="dense", + ) + classifier_dropout = ( + config.classifier_dropout if 
config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = keras.layers.Dropout(classifier_dropout) + self.out_proj = keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj" + ) + self.config = config + + def call(self, features, training=False): + x = features[:, 0, :] # take token (equiv. to [CLS]) + x = self.dropout(x, training=training) + x = self.dense(x) + x = self.dropout(x, training=training) + x = self.out_proj(x) + return x + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + if getattr(self, "out_proj", None) is not None: + with tf.name_scope(self.out_proj.name): + self.out_proj.build([None, None, self.config.hidden_size]) + + +@add_start_docstrings( + """ + CamemBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the + pooled output) e.g. for GLUE tasks. + """, + CAMEMBERT_START_DOCSTRING, +) +# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaForSequenceClassification with Roberta->Camembert, ROBERTA->CAMEMBERT +class TFCamembertForSequenceClassification(TFCamembertPreTrainedModel, TFSequenceClassificationLoss): + # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model + _keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head"] + + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.num_labels = config.num_labels + + self.roberta = TFCamembertMainLayer(config, add_pooling_layer=False, name="roberta") + self.classifier = TFCamembertClassificationHead(config, name="classifier") + + @unpack_inputs + @add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint="cardiffnlp/twitter-roberta-base-emotion", + output_type=TFSequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + expected_output="'optimism'", + expected_loss=0.08, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + labels: np.ndarray | tf.Tensor | None = None, + training: bool | None = False, + ) -> TFSequenceClassifierOutput | tuple[tf.Tensor]: + r""" + labels (`tf.Tensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
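A hedged usage sketch for this sequence-classification model (assumptions: `num_labels=2` and the input sentence are arbitrary, and the classification head of the base checkpoint is freshly initialized, so the logits are meaningless before fine-tuning):

```python
from transformers import AutoTokenizer, TFCamembertForSequenceClassification
import tensorflow as tf

tokenizer = AutoTokenizer.from_pretrained("almanach/camembert-base")
model = TFCamembertForSequenceClassification.from_pretrained("almanach/camembert-base", num_labels=2)

inputs = tokenizer("Ce fromage est excellent !", return_tensors="tf")
outputs = model(**inputs, labels=tf.constant([1]))
print(outputs.loss, outputs.logits.shape)   # loss tensor and logits of shape (1, 2)
```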
+ """ + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + sequence_output = outputs[0] + logits = self.classifier(sequence_output, training=training) + + loss = None if labels is None else self.hf_compute_loss(labels, logits) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TFSequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "roberta", None) is not None: + with tf.name_scope(self.roberta.name): + self.roberta.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(None) + + +@add_start_docstrings( + """ + CamemBERT Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. + for Named-Entity-Recognition (NER) tasks. + """, + CAMEMBERT_START_DOCSTRING, +) +# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaForTokenClassification with Roberta->Camembert, ROBERTA->CAMEMBERT +class TFCamembertForTokenClassification(TFCamembertPreTrainedModel, TFTokenClassificationLoss): + # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model + _keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head"] + _keys_to_ignore_on_load_missing = [r"dropout"] + + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.num_labels = config.num_labels + + self.roberta = TFCamembertMainLayer(config, add_pooling_layer=False, name="roberta") + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = keras.layers.Dropout(classifier_dropout) + self.classifier = keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) + self.config = config + + @unpack_inputs + @add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint="ydshieh/roberta-large-ner-english", + output_type=TFTokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + expected_output="['O', 'ORG', 'ORG', 'O', 'O', 'O', 'O', 'O', 'LOC', 'O', 'LOC', 'LOC']", + expected_loss=0.01, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + labels: np.ndarray | tf.Tensor | None = None, + training: bool | None = False, + ) -> TFTokenClassifierOutput | tuple[tf.Tensor]: + r""" + labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. 
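Along the same lines, a hedged sketch for this token-classification model (`num_labels=5` and the example sentence are assumptions; the head is untrained for the base checkpoint):

```python
from transformers import AutoTokenizer, TFCamembertForTokenClassification
import tensorflow as tf

tokenizer = AutoTokenizer.from_pretrained("almanach/camembert-base")
model = TFCamembertForTokenClassification.from_pretrained("almanach/camembert-base", num_labels=5)

inputs = tokenizer("Le camembert vient de Normandie", return_tensors="tf")
logits = model(**inputs).logits               # (1, seq_len, 5): one score vector per token
predicted_ids = tf.argmax(logits, axis=-1)    # (1, seq_len)
```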
+ """ + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output, training=training) + logits = self.classifier(sequence_output) + + loss = None if labels is None else self.hf_compute_loss(labels, logits) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TFTokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "roberta", None) is not None: + with tf.name_scope(self.roberta.name): + self.roberta.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build([None, None, self.config.hidden_size]) + + +@add_start_docstrings( + """ + CamemBERT Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a + softmax) e.g. for RocStories/SWAG tasks. + """, + CAMEMBERT_START_DOCSTRING, +) +# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaForMultipleChoice with Roberta->Camembert, ROBERTA->CAMEMBERT +class TFCamembertForMultipleChoice(TFCamembertPreTrainedModel, TFMultipleChoiceLoss): + # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model + _keys_to_ignore_on_load_unexpected = [r"lm_head"] + _keys_to_ignore_on_load_missing = [r"dropout"] + + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.roberta = TFCamembertMainLayer(config, name="roberta") + self.dropout = keras.layers.Dropout(config.hidden_dropout_prob) + self.classifier = keras.layers.Dense( + 1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) + self.config = config + + @unpack_inputs + @add_start_docstrings_to_model_forward( + CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") + ) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFMultipleChoiceModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + labels: np.ndarray | tf.Tensor | None = None, + training: bool | None = False, + ) -> TFMultipleChoiceModelOutput | tuple[tf.Tensor]: + r""" + labels (`tf.Tensor` of shape `(batch_size,)`, *optional*): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices]` + where `num_choices` is the size of the second dimension of the input tensors. 
(See `input_ids` above) + """ + + if input_ids is not None: + num_choices = shape_list(input_ids)[1] + seq_length = shape_list(input_ids)[2] + else: + num_choices = shape_list(inputs_embeds)[1] + seq_length = shape_list(inputs_embeds)[2] + + flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None + flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None + flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None + flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None + outputs = self.roberta( + flat_input_ids, + flat_attention_mask, + flat_token_type_ids, + flat_position_ids, + head_mask, + inputs_embeds, + output_attentions, + output_hidden_states, + return_dict=return_dict, + training=training, + ) + pooled_output = outputs[1] + pooled_output = self.dropout(pooled_output, training=training) + logits = self.classifier(pooled_output) + reshaped_logits = tf.reshape(logits, (-1, num_choices)) + + loss = None if labels is None else self.hf_compute_loss(labels, reshaped_logits) + + if not return_dict: + output = (reshaped_logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TFMultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "roberta", None) is not None: + with tf.name_scope(self.roberta.name): + self.roberta.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build([None, None, self.config.hidden_size]) + + +@add_start_docstrings( + """ + CamemBERT Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layers on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + CAMEMBERT_START_DOCSTRING, +) +# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaForQuestionAnswering with Roberta->Camembert, ROBERTA->CAMEMBERT +class TFCamembertForQuestionAnswering(TFCamembertPreTrainedModel, TFQuestionAnsweringLoss): + # names with a '.' 
represents the authorized unexpected/missing layers when a TF model is loaded from a PT model + _keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head"] + + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.num_labels = config.num_labels + + self.roberta = TFCamembertMainLayer(config, add_pooling_layer=False, name="roberta") + self.qa_outputs = keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" + ) + self.config = config + + @unpack_inputs + @add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint="ydshieh/roberta-base-squad2", + output_type=TFQuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + expected_output="' puppet'", + expected_loss=0.86, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + start_positions: np.ndarray | tf.Tensor | None = None, + end_positions: np.ndarray | tf.Tensor | None = None, + training: bool | None = False, + ) -> TFQuestionAnsweringModelOutput | tuple[tf.Tensor]: + r""" + start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. 
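As a usage sketch of the span head described above: take the argmax of the start and end logits and decode that slice of the input. The checkpoint name is a placeholder for any CamemBERT model fine-tuned on extractive QA.

```python
# Illustrative sketch, not part of the vendored file.
import tensorflow as tf
from transformers import AutoTokenizer, TFCamembertForQuestionAnswering

tokenizer = AutoTokenizer.from_pretrained("camembert-base")
model = TFCamembertForQuestionAnswering.from_pretrained("camembert-base")

question, context = "Où est Paris ?", "Paris est la capitale de la France."
inputs = tokenizer(question, context, return_tensors="tf")
outputs = model(inputs)

# Most likely start and end token positions, then decode that span.
start = int(tf.argmax(outputs.start_logits, axis=-1)[0])
end = int(tf.argmax(outputs.end_logits, axis=-1)[0])
answer = tokenizer.decode(inputs["input_ids"][0, start : end + 1])
print(answer)
```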
+ """ + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = tf.split(logits, 2, axis=-1) + start_logits = tf.squeeze(start_logits, axis=-1) + end_logits = tf.squeeze(end_logits, axis=-1) + + loss = None + if start_positions is not None and end_positions is not None: + labels = {"start_position": start_positions} + labels["end_position"] = end_positions + loss = self.hf_compute_loss(labels, (start_logits, end_logits)) + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TFQuestionAnsweringModelOutput( + loss=loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "roberta", None) is not None: + with tf.name_scope(self.roberta.name): + self.roberta.build(None) + if getattr(self, "qa_outputs", None) is not None: + with tf.name_scope(self.qa_outputs.name): + self.qa_outputs.build([None, None, self.config.hidden_size]) + + +@add_start_docstrings( + """CamemBERT Model with a `language modeling` head on top for CLM fine-tuning.""", CAMEMBERT_START_DOCSTRING +) +# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaForCausalLM with Roberta->Camembert, ROBERTA->CAMEMBERT +class TFCamembertForCausalLM(TFCamembertPreTrainedModel, TFCausalLanguageModelingLoss): + # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model + _keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head.decoder.weight"] + + def __init__(self, config: CamembertConfig, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + if not config.is_decoder: + logger.warning("If you want to use `TFCamembertLMHeadModel` as a standalone, add `is_decoder=True.`") + + self.roberta = TFCamembertMainLayer(config, add_pooling_layer=False, name="roberta") + self.lm_head = TFCamembertLMHead(config, input_embeddings=self.roberta.embeddings, name="lm_head") + + def get_lm_head(self): + return self.lm_head + + def get_prefix_bias_name(self): + warnings.warn("The method get_prefix_bias_name is deprecated. 
Please use `get_bias` instead.", FutureWarning) + return self.name + "/" + self.lm_head.name + + # Copied from transformers.models.bert.modeling_tf_bert.TFBertLMHeadModel.prepare_inputs_for_generation + def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs): + input_shape = input_ids.shape + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = tf.ones(input_shape) + + # cut decoder_input_ids if past is used + if past_key_values is not None: + input_ids = input_ids[:, -1:] + + return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values} + + @unpack_inputs + @add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFCausalLMOutputWithCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + encoder_hidden_states: np.ndarray | tf.Tensor | None = None, + encoder_attention_mask: np.ndarray | tf.Tensor | None = None, + past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None, + use_cache: bool | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + labels: np.ndarray | tf.Tensor | None = None, + training: bool | None = False, + ) -> TFCausalLMOutputWithCrossAttentions | tuple[tf.Tensor]: + r""" + encoder_hidden_states (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + past_key_values (`tuple[tuple[tf.Tensor]]` of length `config.n_layers`) + contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*, defaults to `True`): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). Set to `False` during training, `True` during generation + labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the cross entropy classification loss. Indices should be in `[0, ..., + config.vocab_size - 1]`. 
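A small sketch (made-up token ids, not part of the vendored file) of how causal-LM labels line up with the logits after the shift performed in the forward pass below, where `logits[:, :-1]` is scored against `labels[:, 1:]`:

```python
# Illustrative sketch: the left-shift that pairs position t with token t + 1.
import tensorflow as tf

input_ids = tf.constant([[5, 17, 42, 8]])  # tokens fed to the model
labels = input_ids                          # for CLM the labels are the inputs themselves

targets = labels[:, 1:]                     # [[17, 42, 8]]: position t is trained to predict token t+1
print(targets.numpy())
# The matching predictions would be logits[:, :-1, :]; the last position has no target.
```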
+ """ + outputs = self.roberta( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + sequence_output = outputs[0] + logits = self.lm_head(hidden_states=sequence_output, training=training) + loss = None + + if labels is not None: + # shift labels to the left and cut last logit token + shifted_logits = logits[:, :-1] + labels = labels[:, 1:] + loss = self.hf_compute_loss(labels=labels, logits=shifted_logits) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TFCausalLMOutputWithCrossAttentions( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "roberta", None) is not None: + with tf.name_scope(self.roberta.name): + self.roberta.build(None) + if getattr(self, "lm_head", None) is not None: + with tf.name_scope(self.lm_head.name): + self.lm_head.build(None) + + +__all__ = [ + "TFCamembertForCausalLM", + "TFCamembertForMaskedLM", + "TFCamembertForMultipleChoice", + "TFCamembertForQuestionAnswering", + "TFCamembertForSequenceClassification", + "TFCamembertForTokenClassification", + "TFCamembertModel", + "TFCamembertPreTrainedModel", +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/camembert/tokenization_camembert.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/camembert/tokenization_camembert.py new file mode 100644 index 0000000000000000000000000000000000000000..cd6e399f208df858d647dce23fe0ad74248d80b8 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/camembert/tokenization_camembert.py @@ -0,0 +1,323 @@ +# coding=utf-8 +# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License +"""Tokenization classes for Camembert model.""" + +import os +from shutil import copyfile +from typing import Any, Optional + +import sentencepiece as spm + +from ...tokenization_utils import AddedToken, PreTrainedTokenizer +from ...utils import logging +from ...utils.import_utils import requires + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"} + + +SPIECE_UNDERLINE = "▁" + + +@requires(backends=("sentencepiece",)) +class CamembertTokenizer(PreTrainedTokenizer): + """ + Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. 
Construct a CamemBERT tokenizer. Based on + [SentencePiece](https://github.com/google/sentencepiece). + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that + contains the vocabulary necessary to instantiate a tokenizer. + bos_token (`str`, *optional*, defaults to `"<s>"`): + The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token. + + <Tip> + + When building a sequence using special tokens, this is not the token that is used for the beginning of + sequence. The token used is the `cls_token`. + + </Tip> + + eos_token (`str`, *optional*, defaults to `"</s>"`): + The end of sequence token. + + <Tip> + + When building a sequence using special tokens, this is not the token that is used for the end of sequence. + The token used is the `sep_token`. + + </Tip> + + sep_token (`str`, *optional*, defaults to `"</s>"`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + cls_token (`str`, *optional*, defaults to `"<s>"`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + unk_token (`str`, *optional*, defaults to `"<unk>"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (`str`, *optional*, defaults to `"<pad>"`): + The token used for padding, for example when batching sequences of different lengths. + mask_token (`str`, *optional*, defaults to `"<mask>"`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + additional_special_tokens (`list[str]`, *optional*, defaults to `['<s>NOTUSED', '</s>NOTUSED', '<unk>NOTUSED']`): + Additional special tokens used by the tokenizer. + sp_model_kwargs (`dict`, *optional*): + Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for + SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, + to set: + + - `enable_sampling`: Enable subword regularization. + - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - `nbest_size = {0,1}`: No sampling is performed. + - `nbest_size > 1`: samples from the nbest_size results. + - `nbest_size < 0`: assuming that nbest_size is infinite and samples from all hypotheses (lattice) + using the forward-filtering-and-backward-sampling algorithm. + + - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. + + Attributes: + sp_model (`SentencePieceProcessor`): + The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
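A short sketch of the `sp_model_kwargs` mechanism described above, enabling SentencePiece subword regularization. The checkpoint name is a placeholder; sampling only changes how text is segmented, not the vocabulary itself.

```python
# Illustrative sketch, not part of the vendored file.
from transformers import CamembertTokenizer

tokenizer = CamembertTokenizer.from_pretrained(
    "camembert-base",
    sp_model_kwargs={"enable_sampling": True, "nbest_size": -1, "alpha": 0.1},
)

# With sampling enabled, repeated calls may yield different segmentations of the same text.
print(tokenizer.tokenize("J'aime le camembert"))
print(tokenizer.tokenize("J'aime le camembert"))
```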
+ """ + + vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "attention_mask"] + + def __init__( + self, + vocab_file, + bos_token="", + eos_token="", + sep_token="", + cls_token="", + unk_token="", + pad_token="", + mask_token="", + additional_special_tokens=["NOTUSED", "NOTUSED", "NOTUSED"], + sp_model_kwargs: Optional[dict[str, Any]] = None, + **kwargs, + ) -> None: + # Mask token behave like a normal word, i.e. include the space before it + mask_token = ( + AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False, special=True) + if isinstance(mask_token, str) + else mask_token + ) + + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(str(vocab_file)) + self.vocab_file = vocab_file + + # HACK: These tokens were added by the author for an obscure reason as they were already part of the + # sentencepiece vocabulary (this is the case for and and ). + # In this case it is recommended to properly set the tokens by hand. + self._added_tokens_decoder = { + 0: AddedToken("NOTUSED", special=True), + 1: AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token, + 2: AddedToken("NOTUSED", special=True), + 3: AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token, + 4: AddedToken("NOTUSED", special=True), + } + + self.fairseq_offset = 4 # 3 tokens are newly added, but the offset starts from 4 + + # legacy: camemebert is a particular case were we have to make sure `"NOTUSED"` is here + if "added_tokens_decoder" in kwargs: + # this is the only class that requires this unfortunately..... + # the reason is that the fast version has a whole. + kwargs["added_tokens_decoder"].update(self._added_tokens_decoder) + + super().__init__( + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + mask_token=mask_token, + additional_special_tokens=additional_special_tokens, + sp_model_kwargs=self.sp_model_kwargs, + **kwargs, + ) + + @property + def vocab_size(self): + # The length of the vocabulary without added tokens is len(self.sp_model) but the added tokens are added at the beginning. + return len(self.sp_model) + + def get_vocab(self): + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size + self.fairseq_offset)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text: str) -> list[str]: + return self.sp_model.encode(text, out_type=str) + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + # specific to camembert, both 3 and 4 point to the unk token. 
+ if self.sp_model.PieceToId(token) == 0: + # Convert sentence piece unk token to fairseq unk token index + return self.unk_token_id + return self.fairseq_offset + self.sp_model.PieceToId(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.sp_model.IdToPiece(index - self.fairseq_offset) + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + # TODO decode outputs do not match between fast and slow + current_sub_tokens = [] + out_string = "" + prev_is_special = False + for token in tokens: + # make sure that special tokens are not decoded using sentencepiece model + if token in self.all_special_tokens: + if not prev_is_special: + out_string += " " + out_string += self.sp_model.decode(current_sub_tokens) + token + prev_is_special = True + current_sub_tokens = [] + else: + current_sub_tokens.append(token) + prev_is_special = False + out_string += self.sp_model.decode(current_sub_tokens) + return out_string.strip() + + def __getstate__(self): + state = self.__dict__.copy() + state["sp_model"] = None + return state + + def __setstate__(self, d): + self.__dict__ = d + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(self.vocab_file) + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file): + copyfile(self.vocab_file, out_vocab_file) + elif not os.path.isfile(self.vocab_file): + with open(out_vocab_file, "wb") as fi: + content_spiece_model = self.sp_model.serialized_model_proto() + fi.write(content_spiece_model) + + return (out_vocab_file,) + + def build_inputs_with_special_tokens( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. An CamemBERT sequence has the following format: + + - single sequence: ` X ` + - pair of sequences: ` A B ` + + Args: + token_ids_0 (`list[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`list[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + sep + token_ids_1 + sep + + def get_special_tokens_mask( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False + ) -> list[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`list[int]`): + List of IDs. + token_ids_1 (`list[int]`, *optional*): + Optional second list of IDs for sequence pairs. 
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] + + def create_token_type_ids_from_sequences( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. CamemBERT, like + RoBERTa, does not make use of token type ids, therefore a list of zeros is returned. + + Args: + token_ids_0 (`list[int]`): + List of IDs. + token_ids_1 (`list[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `list[int]`: List of zeros. + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] + + +__all__ = ["CamembertTokenizer"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/camembert/tokenization_camembert_fast.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/camembert/tokenization_camembert_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..423058ed959aeb3e7122fb3730a6d0f1f57b5982 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/camembert/tokenization_camembert_fast.py @@ -0,0 +1,197 @@ +# coding=utf-8 +# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License +"""Fast tokenization classes for Camembert model.""" + +import os +from shutil import copyfile +from typing import Optional + +from ...tokenization_utils import AddedToken +from ...tokenization_utils_fast import PreTrainedTokenizerFast +from ...utils import is_sentencepiece_available, logging + + +if is_sentencepiece_available(): + from .tokenization_camembert import CamembertTokenizer +else: + CamembertTokenizer = None + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"} + + +SPIECE_UNDERLINE = "▁" + + +class CamembertTokenizerFast(PreTrainedTokenizerFast): + """ + Construct a "fast" CamemBERT tokenizer (backed by HuggingFace's *tokenizers* library). Adapted from + [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on + [BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models). 
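A small sketch of what the helpers defined above produce for a sequence pair: the `<s> A </s></s> B </s>` layout, the matching special-tokens mask, and the all-zero token type ids. The checkpoint name and texts are placeholders.

```python
# Illustrative sketch, not part of the vendored file.
from transformers import CamembertTokenizer

tokenizer = CamembertTokenizer.from_pretrained("camembert-base")

a = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("Bonjour"))
b = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("le monde"))

ids = tokenizer.build_inputs_with_special_tokens(a, b)            # <s> a </s></s> b </s>
special = tokenizer.get_special_tokens_mask(a, b)                 # 1 at special-token positions
segments = tokenizer.create_token_type_ids_from_sequences(a, b)   # all zeros for CamemBERT
assert len(ids) == len(special) == len(segments)
```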
+ + This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should + refer to this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that + contains the vocabulary necessary to instantiate a tokenizer. + bos_token (`str`, *optional*, defaults to `"<s>"`): + The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token. + + <Tip> + + When building a sequence using special tokens, this is not the token that is used for the beginning of + sequence. The token used is the `cls_token`. + + </Tip> + + eos_token (`str`, *optional*, defaults to `"</s>"`): + The end of sequence token. + + <Tip> + + When building a sequence using special tokens, this is not the token that is used for the end of sequence. + The token used is the `sep_token`. + + </Tip> + + sep_token (`str`, *optional*, defaults to `"</s>"`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + cls_token (`str`, *optional*, defaults to `"<s>"`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + unk_token (`str`, *optional*, defaults to `"<unk>"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (`str`, *optional*, defaults to `"<pad>"`): + The token used for padding, for example when batching sequences of different lengths. + mask_token (`str`, *optional*, defaults to `"<mask>"`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + additional_special_tokens (`list[str]`, *optional*, defaults to `["<s>NOTUSED", "</s>NOTUSED"]`): + Additional special tokens used by the tokenizer. + """ + + vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "attention_mask"] + slow_tokenizer_class = CamembertTokenizer + + def __init__( + self, + vocab_file=None, + tokenizer_file=None, + bos_token="<s>", + eos_token="</s>", + sep_token="</s>", + cls_token="<s>", + unk_token="<unk>", + pad_token="<pad>", + mask_token="<mask>", + additional_special_tokens=["<s>NOTUSED", "</s>NOTUSED", "<unk>NOTUSED"], + **kwargs, + ): + # Mask token behaves like a normal word, i.e. include the space before it. Will have normalized = False + mask_token = AddedToken(mask_token, lstrip=True, special=True) if isinstance(mask_token, str) else mask_token + super().__init__( + vocab_file, + tokenizer_file=tokenizer_file, + bos_token=bos_token, + eos_token=eos_token, + sep_token=sep_token, + cls_token=cls_token, + unk_token=unk_token, + pad_token=pad_token, + mask_token=mask_token, + additional_special_tokens=additional_special_tokens, + **kwargs, + ) + + self.vocab_file = vocab_file + + def build_inputs_with_special_tokens( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and + adding special tokens.
An CamemBERT sequence has the following format: + + - single sequence: ` X ` + - pair of sequences: ` A B ` + + Args: + token_ids_0 (`list[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`list[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + sep + token_ids_1 + sep + + def create_token_type_ids_from_sequences( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. CamemBERT, like + RoBERTa, does not make use of token type ids, therefore a list of zeros is returned. + + Args: + token_ids_0 (`list[int]`): + List of IDs. + token_ids_1 (`list[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `list[int]`: List of zeros. + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + if not self.can_save_slow_tokenizer: + raise ValueError( + "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow " + "tokenizer." + ) + + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): + copyfile(self.vocab_file, out_vocab_file) + + return (out_vocab_file,) + + +__all__ = ["CamembertTokenizerFast"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/canine/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/canine/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..bb00d8fd8827035ff7b351db49d7128a54429c53 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/canine/__init__.py @@ -0,0 +1,28 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
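Before the CANINE files begin, a short round-trip sketch for the `save_vocabulary` method of the CamemBERT tokenizers defined above. The directory path and checkpoint name are placeholders; the copied `.spm` file is enough to rebuild the slow tokenizer.

```python
# Illustrative sketch, not part of the vendored file.
import os

from transformers import CamembertTokenizer, CamembertTokenizerFast

tokenizer = CamembertTokenizerFast.from_pretrained("camembert-base")

os.makedirs("./camembert-vocab", exist_ok=True)
(vocab_path,) = tokenizer.save_vocabulary("./camembert-vocab")
print(vocab_path)  # ./camembert-vocab/sentencepiece.bpe.model

reloaded = CamembertTokenizer(vocab_file=vocab_path)  # rebuild the slow tokenizer from the .spm file
```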
+from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_canine import * + from .modeling_canine import * + from .tokenization_canine import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/canine/configuration_canine.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/canine/configuration_canine.py new file mode 100644 index 0000000000000000000000000000000000000000..29e90327d08f02d490006a464cb566adbca52007 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/canine/configuration_canine.py @@ -0,0 +1,141 @@ +# coding=utf-8 +# Copyright Google AI and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""CANINE model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class CanineConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`CanineModel`]. It is used to instantiate an + CANINE model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the CANINE + [google/canine-s](https://huggingface.co/google/canine-s) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimension of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the deep Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoders. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoders. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoders, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 16384): + The maximum sequence length that this model might ever be used with. 
+ type_vocab_size (`int`, *optional*, defaults to 16): + The vocabulary size of the `token_type_ids` passed when calling [`CanineModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + pad_token_id (`int`, *optional*, defaults to 0): + Padding token id. + bos_token_id (`int`, *optional*, defaults to 57344): + Beginning of stream token id. + eos_token_id (`int`, *optional*, defaults to 57345): + End of stream token id. + downsampling_rate (`int`, *optional*, defaults to 4): + The rate at which to downsample the original character sequence length before applying the deep Transformer + encoder. + upsampling_kernel_size (`int`, *optional*, defaults to 4): + The kernel size (i.e. the number of characters in each window) of the convolutional projection layer when + projecting back from `hidden_size`*2 to `hidden_size`. + num_hash_functions (`int`, *optional*, defaults to 8): + The number of hash functions to use. Each hash function has its own embedding matrix. + num_hash_buckets (`int`, *optional*, defaults to 16384): + The number of hash buckets to use. + local_transformer_stride (`int`, *optional*, defaults to 128): + The stride of the local attention of the first shallow Transformer encoder. Defaults to 128 for good + TPU/XLA memory alignment. + + Example: + + ```python + >>> from transformers import CanineConfig, CanineModel + + >>> # Initializing a CANINE google/canine-s style configuration + >>> configuration = CanineConfig() + + >>> # Initializing a model (with random weights) from the google/canine-s style configuration + >>> model = CanineModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "canine" + + def __init__( + self, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=16384, + type_vocab_size=16, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + bos_token_id=0xE000, + eos_token_id=0xE001, + downsampling_rate=4, + upsampling_kernel_size=4, + num_hash_functions=8, + num_hash_buckets=16384, + local_transformer_stride=128, # Good TPU/XLA memory alignment. 
+ **kwargs, + ): + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.type_vocab_size = type_vocab_size + self.layer_norm_eps = layer_norm_eps + + # Character config: + self.downsampling_rate = downsampling_rate + self.upsampling_kernel_size = upsampling_kernel_size + self.num_hash_functions = num_hash_functions + self.num_hash_buckets = num_hash_buckets + self.local_transformer_stride = local_transformer_stride + + +__all__ = ["CanineConfig"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/canine/modeling_canine.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/canine/modeling_canine.py new file mode 100644 index 0000000000000000000000000000000000000000..585961180f9e914bc1a0670306ab94d9f805fb6e --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/canine/modeling_canine.py @@ -0,0 +1,1549 @@ +# coding=utf-8 +# Copyright 2021 Google AI The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch CANINE model.""" + +import copy +import math +import os +from dataclasses import dataclass +from typing import Optional, Union + +import torch +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ...activations import ACT2FN +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import ( + BaseModelOutput, + ModelOutput, + MultipleChoiceModelOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import auto_docstring, logging +from .configuration_canine import CanineConfig + + +logger = logging.get_logger(__name__) + + +# Support up to 16 hash functions. +_PRIMES = [31, 43, 59, 61, 73, 97, 103, 113, 137, 149, 157, 173, 181, 193, 211, 223] + + +@dataclass +@auto_docstring( + custom_intro=""" + Output type of [`CanineModel`]. Based on [`~modeling_outputs.BaseModelOutputWithPooling`], but with slightly + different `hidden_states` and `attentions`, as these also include the hidden states and attentions of the shallow + Transformer encoders. + """ +) +class CanineModelOutputWithPooling(ModelOutput): + r""" + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model (i.e. 
the output of the final + shallow Transformer encoder). + pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`): + Hidden-state of the first token of the sequence (classification token) at the last layer of the deep + Transformer encoder, further processed by a Linear layer and a Tanh activation function. The Linear layer + weights are trained from the next sentence prediction (classification) objective during pretraining. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the input to each encoder + one for the output of each layer of each + encoder) of shape `(batch_size, sequence_length, hidden_size)` and `(batch_size, sequence_length // + config.downsampling_rate, hidden_size)`. Hidden-states of the model at the output of each layer plus the + initial input to each Transformer encoder. The hidden states of the shallow encoders have length + `sequence_length`, but the hidden states of the deep encoder have length `sequence_length` // + `config.downsampling_rate`. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of the 3 Transformer encoders of shape `(batch_size, + num_heads, sequence_length, sequence_length)` and `(batch_size, num_heads, sequence_length // + config.downsampling_rate, sequence_length // config.downsampling_rate)`. Attentions weights after the + attention softmax, used to compute the weighted average in the self-attention heads. + """ + + last_hidden_state: Optional[torch.FloatTensor] = None + pooler_output: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + + +def load_tf_weights_in_canine(model, config, tf_checkpoint_path): + """Load tf checkpoints in a pytorch model.""" + try: + import re + + import numpy as np + import tensorflow as tf + except ImportError: + logger.error( + "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions." 
+ ) + raise + tf_path = os.path.abspath(tf_checkpoint_path) + logger.info(f"Converting TensorFlow checkpoint from {tf_path}") + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + names = [] + arrays = [] + for name, shape in init_vars: + logger.info(f"Loading TF weight {name} with shape {shape}") + array = tf.train.load_variable(tf_path, name) + names.append(name) + arrays.append(array) + + for name, array in zip(names, arrays): + name = name.split("/") + # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v + # which are not required for using pretrained model + # also discard the cls weights (which were used for the next sentence prediction pre-training task) + if any( + n + in [ + "adam_v", + "adam_m", + "AdamWeightDecayOptimizer", + "AdamWeightDecayOptimizer_1", + "global_step", + "cls", + "autoregressive_decoder", + "char_output_weights", + ] + for n in name + ): + logger.info(f"Skipping {'/'.join(name)}") + continue + # if first scope name starts with "bert", change it to "encoder" + if name[0] == "bert": + name[0] = "encoder" + # remove "embeddings" middle name of HashBucketCodepointEmbedders + elif name[1] == "embeddings": + name.remove(name[1]) + # rename segment_embeddings to token_type_embeddings + elif name[1] == "segment_embeddings": + name[1] = "token_type_embeddings" + # rename initial convolutional projection layer + elif name[1] == "initial_char_encoder": + name = ["chars_to_molecules"] + name[-2:] + # rename final convolutional projection layer + elif name[0] == "final_char_encoder" and name[1] in ["LayerNorm", "conv"]: + name = ["projection"] + name[1:] + pointer = model + for m_name in name: + if (re.fullmatch(r"[A-Za-z]+_\d+", m_name)) and "Embedder" not in m_name: + scope_names = re.split(r"_(\d+)", m_name) + else: + scope_names = [m_name] + if scope_names[0] == "kernel" or scope_names[0] == "gamma": + pointer = getattr(pointer, "weight") + elif scope_names[0] == "output_bias" or scope_names[0] == "beta": + pointer = getattr(pointer, "bias") + elif scope_names[0] == "output_weights": + pointer = getattr(pointer, "weight") + else: + try: + pointer = getattr(pointer, scope_names[0]) + except AttributeError: + logger.info(f"Skipping {'/'.join(name)}") + continue + if len(scope_names) >= 2: + num = int(scope_names[1]) + pointer = pointer[num] + if m_name[-11:] == "_embeddings": + pointer = getattr(pointer, "weight") + elif m_name[-10:] in [f"Embedder_{i}" for i in range(8)]: + pointer = getattr(pointer, "weight") + elif m_name == "kernel": + array = np.transpose(array) + + if pointer.shape != array.shape: + raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched") + + logger.info(f"Initialize PyTorch weight {name}") + pointer.data = torch.from_numpy(array) + return model + + +class CanineEmbeddings(nn.Module): + """Construct the character, position and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + + self.config = config + + # character embeddings + shard_embedding_size = config.hidden_size // config.num_hash_functions + for i in range(config.num_hash_functions): + name = f"HashBucketCodepointEmbedder_{i}" + setattr(self, name, nn.Embedding(config.num_hash_buckets, shard_embedding_size)) + self.char_position_embeddings = nn.Embedding(config.num_hash_buckets, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable 
name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + + def _hash_bucket_tensors(self, input_ids, num_hashes: int, num_buckets: int): + """ + Converts ids to hash bucket ids via multiple hashing. + + Args: + input_ids: The codepoints or other IDs to be hashed. + num_hashes: The number of hash functions to use. + num_buckets: The number of hash buckets (i.e. embeddings in each table). + + Returns: + A list of tensors, each of which is the hash bucket IDs from one hash function. + """ + if num_hashes > len(_PRIMES): + raise ValueError(f"`num_hashes` must be <= {len(_PRIMES)}") + + primes = _PRIMES[:num_hashes] + + result_tensors = [] + for prime in primes: + hashed = ((input_ids + 1) * prime) % num_buckets + result_tensors.append(hashed) + return result_tensors + + def _embed_hash_buckets(self, input_ids, embedding_size: int, num_hashes: int, num_buckets: int): + """Converts IDs (e.g. codepoints) into embeddings via multiple hashing.""" + if embedding_size % num_hashes != 0: + raise ValueError(f"Expected `embedding_size` ({embedding_size}) % `num_hashes` ({num_hashes}) == 0") + + hash_bucket_tensors = self._hash_bucket_tensors(input_ids, num_hashes=num_hashes, num_buckets=num_buckets) + embedding_shards = [] + for i, hash_bucket_ids in enumerate(hash_bucket_tensors): + name = f"HashBucketCodepointEmbedder_{i}" + shard_embeddings = getattr(self, name)(hash_bucket_ids) + embedding_shards.append(shard_embeddings) + + return torch.cat(embedding_shards, dim=-1) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + ) -> torch.FloatTensor: + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, :seq_length] + + if token_type_ids is None: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self._embed_hash_buckets( + input_ids, self.config.hidden_size, self.config.num_hash_functions, self.config.num_hash_buckets + ) + + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + + if self.position_embedding_type == "absolute": + position_embeddings = self.char_position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class CharactersToMolecules(nn.Module): + """Convert character sequence to initial molecule sequence (i.e. 
downsample) using strided convolutions.""" + + def __init__(self, config): + super().__init__() + + self.conv = nn.Conv1d( + in_channels=config.hidden_size, + out_channels=config.hidden_size, + kernel_size=config.downsampling_rate, + stride=config.downsampling_rate, + ) + self.activation = ACT2FN[config.hidden_act] + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, char_encoding: torch.Tensor) -> torch.Tensor: + # `cls_encoding`: [batch, 1, hidden_size] + cls_encoding = char_encoding[:, 0:1, :] + + # char_encoding has shape [batch, char_seq, hidden_size] + # We transpose it to be [batch, hidden_size, char_seq] + char_encoding = torch.transpose(char_encoding, 1, 2) + downsampled = self.conv(char_encoding) + downsampled = torch.transpose(downsampled, 1, 2) + downsampled = self.activation(downsampled) + + # Truncate the last molecule in order to reserve a position for [CLS]. + # Often, the last position is never used (unless we completely fill the + # text buffer). This is important in order to maintain alignment on TPUs + # (i.e. a multiple of 128). + downsampled_truncated = downsampled[:, 0:-1, :] + + # We also keep [CLS] as a separate sequence position since we always + # want to reserve a position (and the model capacity that goes along + # with that) in the deep BERT stack. + # `result`: [batch, molecule_seq, molecule_dim] + result = torch.cat([cls_encoding, downsampled_truncated], dim=1) + + result = self.LayerNorm(result) + + return result + + +class ConvProjection(nn.Module): + """ + Project representations from hidden_size*2 back to hidden_size across a window of w = config.upsampling_kernel_size + characters. 
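A shape-only sketch of the strided downsampling done by `CharactersToMolecules` above: with kernel size equal to stride equal to the downsampling rate `r`, a character sequence of length `L` yields `L // r` molecules; the last one is dropped and the raw `[CLS]` encoding is prepended, keeping the length at `L // r`. The real module also applies an activation and LayerNorm and works in `(batch, seq, hidden)` layout; the numbers here are made up.

```python
# Illustrative sketch, not part of the vendored file.
import torch
from torch import nn

hidden_size, r, L = 16, 4, 2048
conv = nn.Conv1d(hidden_size, hidden_size, kernel_size=r, stride=r)

chars = torch.randn(1, hidden_size, L)       # (batch, hidden, char_seq)
molecules = conv(chars)                      # (batch, hidden, L // r) == (1, 16, 512)
cls = chars[:, :, :1]                        # reserve a slot for [CLS]
result = torch.cat([cls, molecules[:, :, :-1]], dim=-1)
print(result.shape)                          # torch.Size([1, 16, 512])
```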
+ """ + + def __init__(self, config): + super().__init__() + self.config = config + self.conv = nn.Conv1d( + in_channels=config.hidden_size * 2, + out_channels=config.hidden_size, + kernel_size=config.upsampling_kernel_size, + stride=1, + ) + self.activation = ACT2FN[config.hidden_act] + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward( + self, + inputs: torch.Tensor, + final_seq_char_positions: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + # inputs has shape [batch, mol_seq, molecule_hidden_size+char_hidden_final] + # we transpose it to be [batch, molecule_hidden_size+char_hidden_final, mol_seq] + inputs = torch.transpose(inputs, 1, 2) + + # PyTorch < 1.9 does not support padding="same" (which is used in the original implementation), + # so we pad the tensor manually before passing it to the conv layer + # based on https://github.com/google-research/big_transfer/blob/49afe42338b62af9fbe18f0258197a33ee578a6b/bit_tf2/models.py#L36-L38 + pad_total = self.config.upsampling_kernel_size - 1 + pad_beg = pad_total // 2 + pad_end = pad_total - pad_beg + + pad = nn.ConstantPad1d((pad_beg, pad_end), 0) + # `result`: shape (batch_size, char_seq_len, hidden_size) + result = self.conv(pad(inputs)) + result = torch.transpose(result, 1, 2) + result = self.activation(result) + result = self.LayerNorm(result) + result = self.dropout(result) + final_char_seq = result + + if final_seq_char_positions is not None: + # Limit transformer query seq and attention mask to these character + # positions to greatly reduce the compute cost. Typically, this is just + # done for the MLM training task. 
+ # TODO add support for MLM + raise NotImplementedError("CanineForMaskedLM is currently not supported") + else: + query_seq = final_char_seq + + return query_seq + + +class CanineSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + + def forward( + self, + from_tensor: torch.Tensor, + to_tensor: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + batch_size, seq_length, _ = from_tensor.shape + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + + key_layer = ( + self.key(to_tensor) + .view(batch_size, -1, self.num_attention_heads, self.attention_head_size) + .transpose(1, 2) + ) + value_layer = ( + self.value(to_tensor) + .view(batch_size, -1, self.num_attention_heads, self.attention_head_size) + .transpose(1, 2) + ) + query_layer = ( + self.query(from_tensor) + .view(batch_size, -1, self.num_attention_heads, self.attention_head_size) + .transpose(1, 2) + ) + + # Take the dot product between "query" and "key" to get the raw attention scores. 
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + seq_length = from_tensor.size()[1] + position_ids_l = torch.arange(seq_length, dtype=torch.long, device=from_tensor.device).view(-1, 1) + position_ids_r = torch.arange(seq_length, dtype=torch.long, device=from_tensor.device).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + if attention_mask.ndim == 3: + # if attention_mask is 3D, do the following: + attention_mask = torch.unsqueeze(attention_mask, dim=1) + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and the dtype's smallest value for masked positions. + attention_mask = (1.0 - attention_mask.float()) * torch.finfo(attention_scores.dtype).min + # Apply the attention mask (precomputed for all layers in CanineModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + +class CanineSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward( + self, hidden_states: tuple[torch.FloatTensor], input_tensor: torch.FloatTensor + ) -> tuple[torch.FloatTensor, torch.FloatTensor]: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class CanineAttention(nn.Module): + """ + Additional arguments related to local attention: + + - **local** (`bool`, *optional*, defaults to `False`) -- Whether to apply local attention. + - **always_attend_to_first_position** (`bool`, *optional*, defaults to `False`) -- Should all blocks be able to + attend + to the `to_tensor`'s first position (e.g. a [CLS] position)? - **first_position_attends_to_all** (`bool`, + *optional*, defaults to `False`) -- Should the *from_tensor*'s first position be able to attend to all + positions within the *from_tensor*? - **attend_from_chunk_width** (`int`, *optional*, defaults to 128) -- The + width of each block-wise chunk in `from_tensor`. - **attend_from_chunk_stride** (`int`, *optional*, defaults to + 128) -- The number of elements to skip when moving to the next block in `from_tensor`. - + **attend_to_chunk_width** (`int`, *optional*, defaults to 128) -- The width of each block-wise chunk in + *to_tensor*. - **attend_to_chunk_stride** (`int`, *optional*, defaults to 128) -- The number of elements to + skip when moving to the next block in `to_tensor`. + """ + + def __init__( + self, + config, + local=False, + always_attend_to_first_position: bool = False, + first_position_attends_to_all: bool = False, + attend_from_chunk_width: int = 128, + attend_from_chunk_stride: int = 128, + attend_to_chunk_width: int = 128, + attend_to_chunk_stride: int = 128, + ): + super().__init__() + self.self = CanineSelfAttention(config) + self.output = CanineSelfOutput(config) + self.pruned_heads = set() + + # additional arguments related to local attention + self.local = local + if attend_from_chunk_width < attend_from_chunk_stride: + raise ValueError( + "`attend_from_chunk_width` < `attend_from_chunk_stride` would cause sequence positions to get skipped." + ) + if attend_to_chunk_width < attend_to_chunk_stride: + raise ValueError( + "`attend_to_chunk_width` < `attend_to_chunk_stride`would cause sequence positions to get skipped." 
+ ) + self.always_attend_to_first_position = always_attend_to_first_position + self.first_position_attends_to_all = first_position_attends_to_all + self.attend_from_chunk_width = attend_from_chunk_width + self.attend_from_chunk_stride = attend_from_chunk_stride + self.attend_to_chunk_width = attend_to_chunk_width + self.attend_to_chunk_stride = attend_to_chunk_stride + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: tuple[torch.FloatTensor], + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + ) -> tuple[torch.FloatTensor, Optional[torch.FloatTensor]]: + if not self.local: + self_outputs = self.self(hidden_states, hidden_states, attention_mask, head_mask, output_attentions) + attention_output = self_outputs[0] + else: + from_seq_length = to_seq_length = hidden_states.shape[1] + from_tensor = to_tensor = hidden_states + + # Create chunks (windows) that we will attend *from* and then concatenate them. + from_chunks = [] + if self.first_position_attends_to_all: + from_chunks.append((0, 1)) + # We must skip this first position so that our output sequence is the + # correct length (this matters in the *from* sequence only). + from_start = 1 + else: + from_start = 0 + for chunk_start in range(from_start, from_seq_length, self.attend_from_chunk_stride): + chunk_end = min(from_seq_length, chunk_start + self.attend_from_chunk_width) + from_chunks.append((chunk_start, chunk_end)) + + # Determine the chunks (windows) that will attend *to*. + to_chunks = [] + if self.first_position_attends_to_all: + to_chunks.append((0, to_seq_length)) + for chunk_start in range(0, to_seq_length, self.attend_to_chunk_stride): + chunk_end = min(to_seq_length, chunk_start + self.attend_to_chunk_width) + to_chunks.append((chunk_start, chunk_end)) + + if len(from_chunks) != len(to_chunks): + raise ValueError( + f"Expected to have same number of `from_chunks` ({from_chunks}) and " + f"`to_chunks` ({from_chunks}). Check strides." 
+ ) + + # next, compute attention scores for each pair of windows and concatenate + attention_output_chunks = [] + attention_probs_chunks = [] + for (from_start, from_end), (to_start, to_end) in zip(from_chunks, to_chunks): + from_tensor_chunk = from_tensor[:, from_start:from_end, :] + to_tensor_chunk = to_tensor[:, to_start:to_end, :] + # `attention_mask`: [batch_size, from_seq, to_seq] + # `attention_mask_chunk`: [batch_size, from_seq_chunk, to_seq_chunk] + attention_mask_chunk = attention_mask[:, from_start:from_end, to_start:to_end] + if self.always_attend_to_first_position: + cls_attention_mask = attention_mask[:, from_start:from_end, 0:1] + attention_mask_chunk = torch.cat([cls_attention_mask, attention_mask_chunk], dim=2) + + cls_position = to_tensor[:, 0:1, :] + to_tensor_chunk = torch.cat([cls_position, to_tensor_chunk], dim=1) + + attention_outputs_chunk = self.self( + from_tensor_chunk, to_tensor_chunk, attention_mask_chunk, head_mask, output_attentions + ) + attention_output_chunks.append(attention_outputs_chunk[0]) + if output_attentions: + attention_probs_chunks.append(attention_outputs_chunk[1]) + + attention_output = torch.cat(attention_output_chunks, dim=1) + + attention_output = self.output(attention_output, hidden_states) + outputs = (attention_output,) + if not self.local: + outputs = outputs + self_outputs[1:] # add attentions if we output them + else: + outputs = outputs + tuple(attention_probs_chunks) # add attentions if we output them + return outputs + + +class CanineIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class CanineOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: tuple[torch.FloatTensor], input_tensor: torch.FloatTensor) -> torch.FloatTensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class CanineLayer(GradientCheckpointingLayer): + def __init__( + self, + config, + local, + always_attend_to_first_position, + first_position_attends_to_all, + attend_from_chunk_width, + attend_from_chunk_stride, + attend_to_chunk_width, + attend_to_chunk_stride, + ): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = CanineAttention( + config, + local, + always_attend_to_first_position, + first_position_attends_to_all, + attend_from_chunk_width, + attend_from_chunk_stride, + attend_to_chunk_width, + attend_to_chunk_stride, + ) + self.intermediate = CanineIntermediate(config) + self.output = CanineOutput(config) + + def forward( + self, + hidden_states: tuple[torch.FloatTensor], + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + ) -> tuple[torch.FloatTensor, 
Optional[torch.FloatTensor]]: + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + ) + attention_output = self_attention_outputs[0] + + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + outputs = (layer_output,) + outputs + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class CanineEncoder(nn.Module): + def __init__( + self, + config, + local=False, + always_attend_to_first_position=False, + first_position_attends_to_all=False, + attend_from_chunk_width=128, + attend_from_chunk_stride=128, + attend_to_chunk_width=128, + attend_to_chunk_stride=128, + ): + super().__init__() + self.config = config + self.layer = nn.ModuleList( + [ + CanineLayer( + config, + local, + always_attend_to_first_position, + first_position_attends_to_all, + attend_from_chunk_width, + attend_from_chunk_stride, + attend_to_chunk_width, + attend_to_chunk_stride, + ) + for _ in range(config.num_hidden_layers) + ] + ) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: tuple[torch.FloatTensor], + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + return_dict: Optional[bool] = True, + ) -> Union[tuple, BaseModelOutput]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + + layer_outputs = layer_module(hidden_states, attention_mask, layer_head_mask, output_attentions) + + hidden_states = layer_outputs[0] + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class CaninePooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states: tuple[torch.FloatTensor]) -> torch.FloatTensor: + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. 
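+        # In `CanineModel` the pooler is applied to the molecule sequence, so this first position is the [CLS] molecule.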
+ first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class CaninePredictionHeadTransform(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states: tuple[torch.FloatTensor]) -> torch.FloatTensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class CanineLMPredictionHead(nn.Module): + def __init__(self, config): + super().__init__() + self.transform = CaninePredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def forward(self, hidden_states: tuple[torch.FloatTensor]) -> torch.FloatTensor: + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +class CanineOnlyMLMHead(nn.Module): + def __init__(self, config): + super().__init__() + self.predictions = CanineLMPredictionHead(config) + + def forward( + self, + sequence_output: tuple[torch.Tensor], + ) -> tuple[torch.Tensor]: + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +@auto_docstring +class CaninePreTrainedModel(PreTrainedModel): + config: CanineConfig + load_tf_weights = load_tf_weights_in_canine + base_model_prefix = "canine" + supports_gradient_checkpointing = True + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv1d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +@auto_docstring +class CanineModel(CaninePreTrainedModel): + def __init__(self, config, add_pooling_layer=True): + r""" + add_pooling_layer (bool, *optional*, defaults to `True`): + Whether to add a pooling layer + """ + super().__init__(config) + self.config = config + shallow_config = copy.deepcopy(config) + shallow_config.num_hidden_layers = 1 + + self.char_embeddings = CanineEmbeddings(config) + # shallow/low-dim transformer encoder to get a initial character encoding + self.initial_char_encoder = CanineEncoder( + shallow_config, + local=True, + always_attend_to_first_position=False, + first_position_attends_to_all=False, + attend_from_chunk_width=config.local_transformer_stride, + attend_from_chunk_stride=config.local_transformer_stride, + 
attend_to_chunk_width=config.local_transformer_stride, + attend_to_chunk_stride=config.local_transformer_stride, + ) + self.chars_to_molecules = CharactersToMolecules(config) + # deep transformer encoder + self.encoder = CanineEncoder(config) + self.projection = ConvProjection(config) + # shallow/low-dim transformer encoder to get a final character encoding + self.final_char_encoder = CanineEncoder(shallow_config) + + self.pooler = CaninePooler(config) if add_pooling_layer else None + + # Initialize weights and apply final processing + self.post_init() + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def _create_3d_attention_mask_from_input_mask(self, from_tensor, to_mask): + """ + Create 3D attention mask from a 2D tensor mask. + + Args: + from_tensor: 2D or 3D Tensor of shape [batch_size, from_seq_length, ...]. + to_mask: int32 Tensor of shape [batch_size, to_seq_length]. + + Returns: + float Tensor of shape [batch_size, from_seq_length, to_seq_length]. + """ + batch_size, from_seq_length = from_tensor.shape[0], from_tensor.shape[1] + + to_seq_length = to_mask.shape[1] + + to_mask = torch.reshape(to_mask, (batch_size, 1, to_seq_length)).float() + + # We don't assume that `from_tensor` is a mask (although it could be). We + # don't actually care if we attend *from* padding tokens (only *to* padding) + # tokens so we create a tensor of all ones. + broadcast_ones = torch.ones(size=(batch_size, from_seq_length, 1), dtype=torch.float32, device=to_mask.device) + + # Here we broadcast along two dimensions to create the mask. + mask = broadcast_ones * to_mask + + return mask + + def _downsample_attention_mask(self, char_attention_mask: torch.Tensor, downsampling_rate: int): + """Downsample 2D character attention mask to 2D molecule attention mask using MaxPool1d layer.""" + + # first, make char_attention_mask 3D by adding a channel dim + batch_size, char_seq_len = char_attention_mask.shape + poolable_char_mask = torch.reshape(char_attention_mask, (batch_size, 1, char_seq_len)) + + # next, apply MaxPool1d to get pooled_molecule_mask of shape (batch_size, 1, mol_seq_len) + pooled_molecule_mask = torch.nn.MaxPool1d(kernel_size=downsampling_rate, stride=downsampling_rate)( + poolable_char_mask.float() + ) + + # finally, squeeze to get tensor of shape (batch_size, mol_seq_len) + molecule_attention_mask = torch.squeeze(pooled_molecule_mask, dim=-1) + + return molecule_attention_mask + + def _repeat_molecules(self, molecules: torch.Tensor, char_seq_length: int) -> torch.Tensor: + """Repeats molecules to make them the same length as the char sequence.""" + + rate = self.config.downsampling_rate + + molecules_without_extra_cls = molecules[:, 1:, :] + # `repeated`: [batch_size, almost_char_seq_len, molecule_hidden_size] + repeated = torch.repeat_interleave(molecules_without_extra_cls, repeats=rate, dim=-2) + + # So far, we've repeated the elements sufficient for any `char_seq_length` + # that's a multiple of `downsampling_rate`. Now we account for the last + # n elements (n < `downsampling_rate`), i.e. the remainder of floor + # division. We do this by repeating the last molecule a few extra times. 
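+        # Illustrative example (values assumed): with rate=4, char_seq_length=16 and 4 molecules (the first being the
+        # extra [CLS] molecule), `repeated` covers 3 * 4 = 12 positions; the last molecule is then repeated
+        # remainder (0) + rate (4) more times below, yielding the full 16 character positions.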
+ last_molecule = molecules[:, -1:, :] + remainder_length = char_seq_length % rate + remainder_repeated = torch.repeat_interleave( + last_molecule, + # +1 molecule to compensate for truncation. + repeats=remainder_length + rate, + dim=-2, + ) + + # `repeated`: [batch_size, char_seq_len, molecule_hidden_size] + return torch.cat([repeated, remainder_repeated], dim=-2) + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, CanineModelOutputWithPooling]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + if attention_mask is None: + attention_mask = torch.ones(((batch_size, seq_length)), device=device) + if token_type_ids is None: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) + molecule_attention_mask = self._downsample_attention_mask( + attention_mask, downsampling_rate=self.config.downsampling_rate + ) + extended_molecule_attention_mask: torch.Tensor = self.get_extended_attention_mask( + molecule_attention_mask, (batch_size, molecule_attention_mask.shape[-1]) + ) + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + # `input_char_embeddings`: shape (batch_size, char_seq, char_dim) + input_char_embeddings = self.char_embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + ) + + # Contextualize character embeddings using shallow Transformer. + # We use a 3D attention mask for the local attention. 
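+        # `char_attention_mask` (built below): shape (batch_size, char_seq_len, char_seq_len).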
+ # `input_char_encoding`: shape (batch_size, char_seq_len, char_dim) + char_attention_mask = self._create_3d_attention_mask_from_input_mask( + input_ids if input_ids is not None else inputs_embeds, attention_mask + ) + init_chars_encoder_outputs = self.initial_char_encoder( + input_char_embeddings, + attention_mask=char_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + input_char_encoding = init_chars_encoder_outputs.last_hidden_state + + # Downsample chars to molecules. + # The following lines have dimensions: [batch, molecule_seq, molecule_dim]. + # In this transformation, we change the dimensionality from `char_dim` to + # `molecule_dim`, but do *NOT* add a resnet connection. Instead, we rely on + # the resnet connections (a) from the final char transformer stack back into + # the original char transformer stack and (b) the resnet connections from + # the final char transformer stack back into the deep BERT stack of + # molecules. + # + # Empirically, it is critical to use a powerful enough transformation here: + # mean pooling causes training to diverge with huge gradient norms in this + # region of the model; using a convolution here resolves this issue. From + # this, it seems that molecules and characters require a very different + # feature space; intuitively, this makes sense. + init_molecule_encoding = self.chars_to_molecules(input_char_encoding) + + # Deep BERT encoder + # `molecule_sequence_output`: shape (batch_size, mol_seq_len, mol_dim) + encoder_outputs = self.encoder( + init_molecule_encoding, + attention_mask=extended_molecule_attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + molecule_sequence_output = encoder_outputs[0] + pooled_output = self.pooler(molecule_sequence_output) if self.pooler is not None else None + + # Upsample molecules back to characters. 
+ # `repeated_molecules`: shape (batch_size, char_seq_len, mol_hidden_size) + repeated_molecules = self._repeat_molecules(molecule_sequence_output, char_seq_length=input_shape[-1]) + + # Concatenate representations (contextualized char embeddings and repeated molecules): + # `concat`: shape [batch_size, char_seq_len, molecule_hidden_size+char_hidden_final] + concat = torch.cat([input_char_encoding, repeated_molecules], dim=-1) + + # Project representation dimension back to hidden_size + # `sequence_output`: shape (batch_size, char_seq_len, hidden_size]) + sequence_output = self.projection(concat) + + # Apply final shallow Transformer + # `sequence_output`: shape (batch_size, char_seq_len, hidden_size]) + final_chars_encoder_outputs = self.final_char_encoder( + sequence_output, + attention_mask=extended_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + sequence_output = final_chars_encoder_outputs.last_hidden_state + + if output_hidden_states: + deep_encoder_hidden_states = encoder_outputs.hidden_states if return_dict else encoder_outputs[1] + all_hidden_states = ( + all_hidden_states + + init_chars_encoder_outputs.hidden_states + + deep_encoder_hidden_states + + final_chars_encoder_outputs.hidden_states + ) + + if output_attentions: + deep_encoder_self_attentions = encoder_outputs.attentions if return_dict else encoder_outputs[-1] + all_self_attentions = ( + all_self_attentions + + init_chars_encoder_outputs.attentions + + deep_encoder_self_attentions + + final_chars_encoder_outputs.attentions + ) + + if not return_dict: + output = (sequence_output, pooled_output) + output += tuple(v for v in [all_hidden_states, all_self_attentions] if v is not None) + return output + + return CanineModelOutputWithPooling( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +@auto_docstring( + custom_intro=""" + CANINE Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled + output) e.g. for GLUE tasks. + """ +) +class CanineForSequenceClassification(CaninePreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.canine = CanineModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, SequenceClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
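+
+        Example (a minimal usage sketch, following the doctest convention of the other heads in this file and assuming
+        the `google/canine-s` checkpoint; the classification head on top is newly initialized unless it has been
+        fine-tuned, so the predicted class is only illustrative):
+
+        ```python
+        >>> from transformers import AutoTokenizer, CanineForSequenceClassification
+        >>> import torch
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("google/canine-s")
+        >>> model = CanineForSequenceClassification.from_pretrained("google/canine-s")  # doctest: +SKIP
+
+        >>> inputs = tokenizer("CANINE operates directly on characters.", return_tensors="pt")
+        >>> with torch.no_grad():
+        ...     logits = model(**inputs).logits  # doctest: +SKIP
+
+        >>> predicted_class_id = logits.argmax(-1).item()  # doctest: +SKIP
+        ```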
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.canine( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@auto_docstring +class CanineForMultipleChoice(CaninePreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.canine = CanineModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, 1) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, MultipleChoiceModelOutput]: + r""" + input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. 
+ + [What are position IDs?](../glossary#position-ids) + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert *input_ids* indices into associated vectors than the + model's internal embedding lookup matrix. + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., + num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See + `input_ids` above) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] + + input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None + attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None + inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) + if inputs_embeds is not None + else None + ) + + outputs = self.canine( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.view(-1, num_choices) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + + if not return_dict: + output = (reshaped_logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@auto_docstring +class CanineForTokenClassification(CaninePreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.canine = CanineModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, TokenClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. 
+ + Example: + + ```python + >>> from transformers import AutoTokenizer, CanineForTokenClassification + >>> import torch + + >>> tokenizer = AutoTokenizer.from_pretrained("google/canine-s") + >>> model = CanineForTokenClassification.from_pretrained("google/canine-s") + + >>> inputs = tokenizer( + ... "HuggingFace is a company based in Paris and New York", add_special_tokens=False, return_tensors="pt" + ... ) + + >>> with torch.no_grad(): + ... logits = model(**inputs).logits + + >>> predicted_token_class_ids = logits.argmax(-1) + + >>> # Note that tokens are classified rather then input words which means that + >>> # there might be more predicted token classes than words. + >>> # Multiple token classes might account for the same word + >>> predicted_tokens_classes = [model.config.id2label[t.item()] for t in predicted_token_class_ids[0]] + >>> predicted_tokens_classes # doctest: +SKIP + ``` + + ```python + >>> labels = predicted_token_class_ids + >>> loss = model(**inputs, labels=labels).loss + >>> round(loss.item(), 2) # doctest: +SKIP + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.canine( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@auto_docstring +class CanineForQuestionAnswering(CaninePreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.canine = CanineModel(config) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + start_positions: Optional[torch.LongTensor] = None, + end_positions: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, QuestionAnsweringModelOutput]: + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.canine( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1) + end_logits = 
end_logits.squeeze(-1) + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions.clamp_(0, ignored_index) + end_positions.clamp_(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +__all__ = [ + "CanineForMultipleChoice", + "CanineForQuestionAnswering", + "CanineForSequenceClassification", + "CanineForTokenClassification", + "CanineLayer", + "CanineModel", + "CaninePreTrainedModel", + "load_tf_weights_in_canine", +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/canine/tokenization_canine.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/canine/tokenization_canine.py new file mode 100644 index 0000000000000000000000000000000000000000..f6b2a8bfd96efdf6ed87aa1c82571612cfd89a4e --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/canine/tokenization_canine.py @@ -0,0 +1,213 @@ +# coding=utf-8 +# Copyright Google AI and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for CANINE.""" + +from typing import Optional + +from ...tokenization_utils import AddedToken, PreTrainedTokenizer +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +# Unicode defines 1,114,112 total “codepoints” +UNICODE_VOCAB_SIZE = 1114112 + +# Below: Constants defining canonical codepoints for special, pseudo-characters. +# Copied from https://github.com/google-research/language/blob/master/language/canine/special_codepoints.py +PAD = 0 +CLS = 0xE000 +SEP = 0xE001 +BOS = 0xE002 +MASK = 0xE003 +RESERVED = 0xE004 + +# Maps special codepoints to human-readable names. +SPECIAL_CODEPOINTS: dict[int, str] = { + # Special symbols are represented using codepoints values that are valid, + # but designated as "Private Use", meaning that they will never be assigned + # characters by the Unicode Consortium, and are thus safe for use here. + # + # NOTE: Do *NOT* add any sort of [UNK_CHAR] here. They are explicitly + # excluded and should fail with a hard error. 
+ CLS: "[CLS]", + SEP: "[SEP]", + BOS: "[BOS]", + MASK: "[MASK]", + PAD: "[PAD]", + RESERVED: "[RESERVED]", +} + +# Maps special codepoint human-readable names to their codepoint values. +SPECIAL_CODEPOINTS_BY_NAME: dict[str, int] = {name: codepoint for codepoint, name in SPECIAL_CODEPOINTS.items()} + + +class CanineTokenizer(PreTrainedTokenizer): + r""" + Construct a CANINE tokenizer (i.e. a character splitter). It turns text into a sequence of characters, and then + converts each character into its Unicode code point. + + [`CanineTokenizer`] inherits from [`PreTrainedTokenizer`]. + + Refer to superclass [`PreTrainedTokenizer`] for usage examples and documentation concerning parameters. + + Args: + model_max_length (`int`, *optional*, defaults to 2048): + The maximum sentence length the model accepts. + """ + + def __init__( + self, + bos_token=chr(CLS), + eos_token=chr(SEP), + sep_token=chr(SEP), + cls_token=chr(CLS), + pad_token=chr(PAD), + mask_token=chr(MASK), + add_prefix_space=False, + model_max_length=2048, + **kwargs, + ): + bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token + eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token + sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token + cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token + pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token + + # Mask token behave like a normal word, i.e. include the space before it + mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + + # Creates a mapping for looking up the IDs of special symbols. + self._special_codepoints: dict[str, int] = {} + for codepoint, name in SPECIAL_CODEPOINTS.items(): + self._special_codepoints[name] = codepoint + + # Creates a mapping for looking up the string forms of special symbol IDs. + self._special_codepoint_strings: dict[int, str] = { + codepoint: name for name, codepoint in self._special_codepoints.items() + } + + self._unicode_vocab_size = UNICODE_VOCAB_SIZE + self._num_special_tokens = len(self._special_codepoints) + + super().__init__( + bos_token=bos_token, + eos_token=eos_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + mask_token=mask_token, + add_prefix_space=add_prefix_space, + model_max_length=model_max_length, + **kwargs, + ) + + @property + def vocab_size(self) -> int: + return self._unicode_vocab_size + + def get_vocab(self): + vocab = {chr(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text: str) -> list[str]: + """Tokenize a string (i.e. perform character splitting).""" + return list(text) + + def _convert_token_to_id(self, token: str) -> int: + """Converts a token (i.e. a Unicode character) in an id (i.e. its integer Unicode code point value).""" + try: + return ord(token) + except TypeError: + raise ValueError(f"invalid token: '{token}'") + + def _convert_id_to_token(self, index: int) -> str: + """ + Converts a Unicode code point (integer) in a token (str). In case it's a special code point, convert to + human-readable format. 
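+        For example, id 97 is converted to `"a"`, while the special codepoint id 0xE000 is converted to `"[CLS]"`.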
+ """ + try: + if index in SPECIAL_CODEPOINTS: + return SPECIAL_CODEPOINTS[index] + return chr(index) + except TypeError: + raise ValueError(f"invalid id: {index}") + + def convert_tokens_to_string(self, tokens): + return "".join(tokens) + + def build_inputs_with_special_tokens( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A CANINE sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + result = cls + token_ids_0 + sep + if token_ids_1 is not None: + result += token_ids_1 + sep + return result + + def get_special_tokens_mask( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False + ) -> list[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + result = [1] + ([0] * len(token_ids_0)) + [1] + if token_ids_1 is not None: + result += ([0] * len(token_ids_1)) + [1] + return result + + # CanineTokenizer has no vocab file + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None): + return () + + +__all__ = ["CanineTokenizer"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/chameleon/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/chameleon/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6ad11a90a24bc4e8c9fd744bca6297e5388fd52e --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/chameleon/__init__.py @@ -0,0 +1,30 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
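+# NOTE: this package only wires up lazy imports; submodules are loaded on first attribute access via `_LazyModule`.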
+from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_chameleon import * + from .image_processing_chameleon import * + from .image_processing_chameleon_fast import * + from .modeling_chameleon import * + from .processing_chameleon import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/chameleon/configuration_chameleon.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/chameleon/configuration_chameleon.py new file mode 100644 index 0000000000000000000000000000000000000000..34436a5288c8187d893d7ca4775b812b4c4d7961 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/chameleon/configuration_chameleon.py @@ -0,0 +1,282 @@ +# coding=utf-8 +# Copyright 2024 Meta Inc. and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""chameleon model configuration""" + +from typing import Optional + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class ChameleonVQVAEConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`ChameleonVQModel`]. It is used to instantiate a + `ChameleonVQModel` according to the specified arguments, defining the model architecture. + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. Instantiating a + configuration with the defaults will yield a similar configuration to the VQModel of the + [meta/chameleon-7B](https://huggingface.co/meta/chameleon-7B). + + Args: + embed_dim (`int`, *optional*, defaults to 256): + Dimensionality of each embedding vector. + num_embeddings (`int`, *optional*, defaults to 8192): + Number of codebook embeddings. + double_latent (`bool`, *optional*, defaults to `False`): + Whether to use double z channels. + latent_channels (`int`, *optional*, defaults to 256): + Number of channels for the latent space. + resolution (`int`, *optional*, defaults to 512): + Resolution of the input images. + in_channels (`int`, *optional*, defaults to 3): + Number of input channels. + base_channels (`int`, *optional*, defaults to 128): + Base channel count. + channel_multiplier (`list[int]`, *optional*, defaults to `[1, 1, 2, 2, 4]`): + Channel multipliers for each resolution. + num_res_blocks (`int`, *optional*, defaults to 2): + Number of residual blocks. + attn_resolutions (`list[int]`, *optional*): + Resolutions to apply attention. + dropout (`float`, *optional*, defaults to 0.0): + Dropout rate. + attn_type (`str`, *optional*, defaults to `"vanilla"`): + Attention type used in VQ-GAN encoder. Can be "vanilla" or None. 
+ initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + """ + + model_type = "chameleon_vqgan" + base_config_key = "vq_config" + + def __init__( + self, + embed_dim: int = 256, + num_embeddings: int = 8192, + double_latent: bool = False, + latent_channels: int = 256, + resolution: int = 512, + in_channels: int = 3, + base_channels: int = 128, + channel_multiplier: list[int] = [1, 1, 2, 2, 4], + num_res_blocks: int = 2, + attn_resolutions: Optional[list[int]] = None, + dropout: float = 0.0, + attn_type: str = "vanilla", + initializer_range=0.02, + **kwargs, + ): + super().__init__(**kwargs) + self.embed_dim = embed_dim + self.num_embeddings = num_embeddings + self.double_latent = double_latent + self.latent_channels = latent_channels + self.resolution = resolution + self.in_channels = in_channels + self.base_channels = base_channels + self.channel_multiplier = channel_multiplier + self.num_res_blocks = num_res_blocks + self.attn_resolutions = attn_resolutions + self.dropout = dropout + self.attn_type = attn_type + self.initializer_range = initializer_range + + +class ChameleonConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`ChameleonModel`]. It is used to instantiate a + chameleon model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the + [meta/chameleon-7B](https://huggingface.co/meta/chameleon-7B). + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 65536): + Vocabulary size of the chameleon model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`ChameleonModel`]; this includes text and image tokens. + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 11008): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer decoder. + num_key_value_heads (`int`, *optional*, defaults to 32): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details, check out [this + paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to + `num_attention_heads`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 4096): + The maximum sequence length that this model might ever be used with. Chameleon supports up to 4096 tokens. 
+ initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + pad_token_id (`int`, *optional*): + Padding token id. + bos_token_id (`int`, *optional*, defaults to 1): + Beginning of stream token id. + eos_token_id (`int`, *optional*, defaults to 2): + End of stream token id. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + rope_scaling (`Dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling + strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is + `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update + `max_position_embeddings` to the expected new maximum. See the following thread for more information on how + these scaling strategies behave: + https://www.reddit.com/r/Localchameleon/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an + experimental feature, subject to breaking API changes in future versions. + attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): + Whether to use a bias in the query, key, value and output projection layers during self-attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + model_parallel_size (`int`, *optional*, defaults to 1): + Number of shards used when training the model. This will be used in qk layernorm because the original Chameleon inference + doesn't do reduction in those layers and each rank has its own biases. + swin_norm (`bool`, *optional*, defaults to `False`): + Use Swin Transformer normalization. + vq_config (`dict`, *optional*): + ChameleonVQConfig instance containing the configuration for the VQ-VAE model. + vocabulary_map (`dict`, *optional*): + A dictionary containing the vocabulary map from the tokenizer. Used to obtain tokens from the image inputs. + mlp_bias (`bool`, *optional*, defaults to `False`): + Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers. 
+ + + ```python + >>> from transformers import ChameleonModel, ChameleonConfig + + >>> # Initializing a chameleon chameleon-7b style configuration + >>> configuration = ChameleonConfig() + + >>> # Initializing a model from the chameleon-7b style configuration + >>> model = ChameleonModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "chameleon" + sub_configs = {"vq_config": ChameleonVQVAEConfig} + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=65536, + hidden_size=4096, + intermediate_size=11008, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=32, + hidden_act="silu", + max_position_embeddings=4096, + initializer_range=0.02, + rms_norm_eps=1e-05, + use_cache=True, + pad_token_id=None, + bos_token_id=1, + eos_token_id=2, + tie_word_embeddings=False, + rope_theta=10000.0, + rope_scaling=None, + attention_bias=False, + attention_dropout=0.0, + model_parallel_size=1, + swin_norm=False, + vq_config=None, + vocabulary_map=None, + mlp_bias=False, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.mlp_bias = mlp_bias + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self._rope_scaling_validation() + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + self.model_parallel_size = model_parallel_size + self.swin_norm = swin_norm + + if vq_config is None: + vq_config = {} + logger.info("vq_config is None. initializing the ChameleonVQConfig with default values.") + + self.vq_config = ChameleonVQVAEConfig(**vq_config) + + self.vocabulary_map = vocabulary_map + self.image_token_id = vocabulary_map.get("") if vocabulary_map is not None else None + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + def _rope_scaling_validation(self): + """ + Validate the `rope_scaling` configuration. 
+ """ + if self.rope_scaling is None: + return + + if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2: + raise ValueError( + "`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, " + f"got {self.rope_scaling}" + ) + rope_scaling_type = self.rope_scaling.get("type", None) + rope_scaling_factor = self.rope_scaling.get("factor", None) + if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]: + raise ValueError( + f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}" + ) + if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0: + raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}") + + +__all__ = ["ChameleonConfig", "ChameleonVQVAEConfig"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/chameleon/image_processing_chameleon.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/chameleon/image_processing_chameleon.py new file mode 100644 index 0000000000000000000000000000000000000000..9cae9d7bdd34d8b7a2292059ae1e92b587891272 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/chameleon/image_processing_chameleon.py @@ -0,0 +1,341 @@ +# coding=utf-8 +# Copyright 2024 Meta Inc. and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Image processor class for Chameleon.""" + +from typing import Optional, Union + +import numpy as np + +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ...image_transforms import get_resize_output_image_size, resize, to_channel_dimension_format +from ...image_utils import ( + ChannelDimension, + ImageInput, + PILImageResampling, + infer_channel_dimension_format, + is_scaled_image, + make_flat_list_of_images, + to_numpy_array, + valid_images, + validate_preprocess_arguments, +) +from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging + + +logger = logging.get_logger(__name__) + +if is_vision_available(): + import PIL + + +class ChameleonImageProcessor(BaseImageProcessor): + r""" + Constructs a Chameleon image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by + `do_resize` in the `preprocess` method. + size (`dict[str, int]` *optional*, defaults to `{"shortest_edge": 512}`): + Size of the image after resizing. The shortest edge of the image is resized to size["shortest_edge"], with + the longest edge resized to keep the input aspect ratio. Can be overridden by `size` in the `preprocess` + method. + resample (`PILImageResampling`, *optional*, defaults to 1): + Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method. 
+ do_center_crop (`bool`, *optional*, defaults to `True`): + Whether to center crop the image to the specified `crop_size`. Can be overridden by `do_center_crop` in the + `preprocess` method. + crop_size (`dict[str, int]` *optional*, defaults to {"height": 512, "width": 512}): + Size of the output image after applying `center_crop`. Can be overridden by `crop_size` in the `preprocess` + method. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in + the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to 0.0078): + Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess` + method. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method. + image_mean (`float` or `list[float]`, *optional*, defaults to `[1.0, 1.0, 1.0]`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `list[float]`, *optional*, defaults to `[1.0, 1.0, 1.0]`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. + Can be overridden by the `image_std` parameter in the `preprocess` method. + do_convert_rgb (`bool`, *optional*, defaults to `True`): + Whether to convert the image to RGB. + """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: Optional[dict[str, int]] = None, + resample: PILImageResampling = PIL.Image.LANCZOS, + do_center_crop: bool = True, + crop_size: Optional[dict[str, int]] = None, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 0.0078, + do_normalize: bool = True, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + do_convert_rgb: bool = True, + **kwargs, + ) -> None: + super().__init__(**kwargs) + size = size if size is not None else {"shortest_edge": 512} + size = get_size_dict(size, default_to_square=False) + crop_size = crop_size if crop_size is not None else {"height": 512, "width": 512} + crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size") + + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_center_crop = do_center_crop + self.crop_size = crop_size + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else [1.0, 1.0, 1.0] + self.image_std = image_std if image_std is not None else [1.0, 1.0, 1.0] + self.do_convert_rgb = do_convert_rgb + + # Copied from transformers.models.clip.image_processing_clip.CLIPImageProcessor.resize + def resize( + self, + image: np.ndarray, + size: dict[str, int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize an image. The shortest edge of the image is resized to size["shortest_edge"], with the longest edge + resized to keep the input aspect ratio. 
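Taken together, the defaults `rescale_factor=0.0078` (roughly 1/128) and `image_mean=image_std=[1.0, 1.0, 1.0]` map uint8 pixel values into approximately `[-1, 1]`; a quick numeric check:

```python
import numpy as np

pixels = np.array([0.0, 127.5, 255.0])    # representative uint8 intensities
rescaled = pixels * 0.0078                # do_rescale with rescale_factor=0.0078
normalized = (rescaled - 1.0) / 1.0       # do_normalize with mean=std=1.0
print(normalized)                         # approximately [-1.0, -0.0055, 0.989]
```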
+ + Args: + image (`np.ndarray`): + Image to resize. + size (`dict[str, int]`): + Size of the output image. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + Resampling filter to use when resiizing the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. + """ + default_to_square = True + if "shortest_edge" in size: + size = size["shortest_edge"] + default_to_square = False + elif "height" in size and "width" in size: + size = (size["height"], size["width"]) + else: + raise ValueError("Size must contain either 'shortest_edge' or 'height' and 'width'.") + + output_size = get_resize_output_image_size( + image, + size=size, + default_to_square=default_to_square, + input_data_format=input_data_format, + ) + return resize( + image, + size=output_size, + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + **kwargs, + ) + + @filter_out_non_signature_kwargs() + def preprocess( + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Optional[dict[str, int]] = None, + resample: Optional[PILImageResampling] = None, + do_center_crop: Optional[bool] = None, + crop_size: Optional[int] = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + do_convert_rgb: Optional[bool] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> PIL.Image.Image: + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_rescale=False`. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`dict[str, int]`, *optional*, defaults to `self.size`): + Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with + the longest edge resized to keep the input aspect ratio. + resample (`int`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only + has an effect if `do_resize` is set to `True`. + do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`): + Whether to center crop the image. + crop_size (`dict[str, int]`, *optional*, defaults to `self.crop_size`): + Size of the center crop. Only has an effect if `do_center_crop` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`): + Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`. 
+ image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to + `True`. + do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): + Whether to convert the image to RGB. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + """ + do_resize = do_resize if do_resize is not None else self.do_resize + size = size if size is not None else self.size + size = get_size_dict(size, param_name="size", default_to_square=False) + resample = resample if resample is not None else self.resample + do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop + crop_size = crop_size if crop_size is not None else self.crop_size + crop_size = get_size_dict(crop_size, param_name="crop_size", default_to_square=True) + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + + images = self.fetch_images(images) + images = make_flat_list_of_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + + validate_preprocess_arguments( + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_center_crop=do_center_crop, + crop_size=crop_size, + do_resize=do_resize, + size=size, + resample=resample, + ) + + if do_convert_rgb: + images = [self.blend_rgba(image) for image in images] + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if do_rescale and is_scaled_image(images[0]): + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. 
If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. + input_data_format = infer_channel_dimension_format(images[0]) + all_images = [] + for image in images: + if do_resize: + image = self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format) + + if do_center_crop: + image = self.center_crop(image=image, size=crop_size, input_data_format=input_data_format) + + if do_rescale: + image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) + + if do_normalize: + image = self.normalize( + image=image, mean=image_mean, std=image_std, input_data_format=input_data_format + ) + + all_images.append(image) + images = [ + to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) + for image in all_images + ] + + data = {"pixel_values": images} + return BatchFeature(data=data, tensor_type=return_tensors) + + def blend_rgba(self, image: ImageInput) -> ImageInput: + """ + Convert image to RGB by blending the transparency layer if it's in RGBA format. + If image is not `PIL.Image`, it si simply returned without modifications. + + Args: + image (`ImageInput`): + Image to convert. + """ + + if not isinstance(image, PIL.Image.Image): + return image + elif image.mode == "RGB": + return image + + img_rgba = np.array(image.convert("RGBA")) + + # If there is no transparency layer, simple convert and return. + if not (img_rgba[:, :, 3] < 255).any(): + return image.convert("RGB") + + # There is a transparency layer, blend it with a white background. + # Calculate the alpha proportion for blending. + alpha = img_rgba[:, :, 3] / 255.0 + img_rgb = (1 - alpha[:, :, np.newaxis]) * 255 + alpha[:, :, np.newaxis] * img_rgba[:, :, :3] + return PIL.Image.fromarray(img_rgb.astype("uint8"), "RGB") + + +__all__ = ["ChameleonImageProcessor"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/chameleon/image_processing_chameleon_fast.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/chameleon/image_processing_chameleon_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..1d102614f7df3700845c00f9d8bfa217930c776b --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/chameleon/image_processing_chameleon_fast.py @@ -0,0 +1,112 @@ +# coding=utf-8 +# Copyright 2025 Meta Inc. and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
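`blend_rgba` composites transparent pixels over a white background before dropping the alpha channel; a toy numeric sketch of the blend `out = (1 - alpha) * 255 + alpha * rgb`:

```python
import numpy as np

rgba = np.array([[[200, 10, 10, 128]]], dtype=np.float64)   # one half-transparent reddish pixel
alpha = rgba[..., 3:] / 255.0
rgb = (1 - alpha) * 255 + alpha * rgba[..., :3]
print(rgb.astype("uint8"))                                   # [[[227 132 132]]]
```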
+"""Fast Image processor class for Chameleon.""" + +from typing import Optional + +import numpy as np +import PIL +import torch +from torchvision.transforms.v2 import functional as F + +from ...image_processing_utils_fast import BaseImageProcessorFast +from ...image_utils import ImageInput, PILImageResampling, SizeDict +from ...utils import auto_docstring, logging + + +logger = logging.get_logger(__name__) + + +@auto_docstring +class ChameleonImageProcessorFast(BaseImageProcessorFast): + resample = PILImageResampling.LANCZOS + image_mean = [1.0, 1.0, 1.0] + image_std = [1.0, 1.0, 1.0] + size = {"shortest_edge": 512} + default_to_square = False + crop_size = {"height": 512, "width": 512} + do_resize = True + do_center_crop = True + do_rescale = True + rescale_factor = 0.0078 + do_normalize = True + do_convert_rgb = True + + def convert_to_rgb(self, image: ImageInput) -> ImageInput: + """ + Convert image to RGB by blending the transparency layer if it's in RGBA format. + If image is not `PIL.Image`, it si simply returned without modifications. + + Args: + image (`ImageInput`): + Image to convert. + """ + + if not isinstance(image, PIL.Image.Image): + return image + elif image.mode == "RGB": + return image + + img_rgba = np.array(image.convert("RGBA")) + + # If there is no transparency layer, simple convert and return. + if not (img_rgba[:, :, 3] < 255).any(): + return image.convert("RGB") + + # There is a transparency layer, blend it with a white background. + # Calculate the alpha proportion for blending. + alpha = img_rgba[:, :, 3] / 255.0 + img_rgb = (1 - alpha[:, :, np.newaxis]) * 255 + alpha[:, :, np.newaxis] * img_rgba[:, :, :3] + return PIL.Image.fromarray(img_rgb.astype("uint8"), "RGB") + + def resize( + self, + image: "torch.Tensor", + size: SizeDict, + interpolation: Optional["F.InterpolationMode"] = None, + **kwargs, + ) -> "torch.Tensor": + """ + Resize an image to `(size["height"], size["width"])`. + + Args: + image (`torch.Tensor`): + Image to resize. + size (`SizeDict`): + Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image. + resample (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`): + `InterpolationMode` filter to use when resizing the image e.g. `InterpolationMode.BICUBIC`. + + Returns: + `torch.Tensor`: The resized image. + """ + interpolation = interpolation if interpolation is not None else F.InterpolationMode.BILINEAR + if interpolation == F.InterpolationMode.LANCZOS: + logger.warning_once( + "You have used fast image processor with LANCZOS resample which not yet supported for torch.Tensor. " + "BICUBIC resample will be used as an alternative. Please fall back to slow image processor if you " + "want full consistency with the original model." + ) + interpolation = F.InterpolationMode.BICUBIC + + return super().resize( + image=image, + size=size, + interpolation=interpolation, + **kwargs, + ) + + +__all__ = ["ChameleonImageProcessorFast"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/chameleon/modeling_chameleon.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/chameleon/modeling_chameleon.py new file mode 100644 index 0000000000000000000000000000000000000000..033b8ecd7c630208602e1ae4d38b95df7718aaed --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/chameleon/modeling_chameleon.py @@ -0,0 +1,1169 @@ +# coding=utf-8 +# Copyright 2024 Meta Inc. and The HuggingFace Inc. team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch Chameleon model.""" + +from functools import cached_property +from typing import Callable, Optional, Union + +import torch +import torch.nn.functional as F +from torch import nn + +from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache +from ...generation import GenerationMixin +from ...masking_utils import create_causal_mask +from ...modeling_flash_attention_utils import FlashAttentionKwargs +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...processing_utils import Unpack +from ...utils import ( + TransformersKwargs, + auto_docstring, + can_return_tuple, + logging, +) +from ...utils.deprecation import deprecate_kwarg +from .configuration_chameleon import ChameleonConfig, ChameleonVQVAEConfig + + +logger = logging.get_logger(__name__) + + +# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Chameleon +class ChameleonRMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + ChameleonRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + def extra_repr(self): + return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" + + +# copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Chameleon +# TODO(joao): add me back asap :) +class ChameleonRotaryEmbedding(nn.Module): + inv_freq: torch.Tensor # fix linting for `register_buffer` + + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): + super().__init__() + self.scaling_factor = scaling_factor + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / ( + self.base + ** (torch.arange(0, self.dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / self.dim) + ) + self.register_buffer("inv_freq", inv_freq, persistent=False) + # For BC we register cos and sin cached + self.max_seq_len_cached = max_position_embeddings + + @torch.no_grad() + def forward(self, x, position_ids): + # x: [bs, num_attention_heads, seq_len, head_size] + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + position_ids_expanded = position_ids[:, None, :].float() + # Force float32 since bfloat16 loses precision on long contexts + # See https://github.com/huggingface/transformers/pull/29285 + device_type = x.device.type + device_type = device_type if device_type != "mps" else "cpu" + with 
torch.autocast(device_type=device_type, enabled=False): + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() + sin = emb.sin() + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +class ChameleonLinearScalingRotaryEmbedding(ChameleonRotaryEmbedding): + """ChameleonRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev""" + + def forward(self, x, position_ids): + # difference to the original RoPE: a scaling factor is applied to the position ids + position_ids = position_ids.float() / self.scaling_factor + cos, sin = super().forward(x, position_ids) + return cos, sin + + +class ChameleonDynamicNTKScalingRotaryEmbedding(ChameleonRotaryEmbedding): + """ChameleonRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla""" + + def forward(self, x, position_ids): + # difference to the original RoPE: inv_freq is recomputed when the sequence length > original length + seq_len = torch.max(position_ids) + 1 + if seq_len > self.max_position_embeddings: + base = self.base * ( + (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1) + ) ** (self.dim / (self.dim - 2)) + inv_freq = 1.0 / ( + base + ** (torch.arange(0, self.dim, 2, dtype=torch.int64).to(device=x.device, dtype=torch.float) / self.dim) + ) + self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: this may break with compilation + + cos, sin = super().forward(x, position_ids) + return cos, sin + + +# Copied from transformers.models.llama.modeling_llama.rotate_half +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
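A toy, self-contained sketch of the rotation described above, applied to a single head_dim=4 vector at one position (no batching or broadcasting):

```python
import torch

def rotate_half(x):
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)

q = torch.tensor([1.0, 0.0, 0.0, 1.0])       # hypothetical query vector, head_dim=4
theta = torch.tensor(torch.pi / 2)           # rotation angle for this position/frequency
cos, sin = torch.cos(theta), torch.sin(theta)
q_rot = q * cos + rotate_half(q) * sin
print(q_rot)                                 # approximately [0., -1., 1., 0.]
```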
+ """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +# Copied from transformers.models.llama.modeling_llama.LlamaMLP with Llama->Chameleon +class ChameleonMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias) + self.act_fn = ACT2FN[config.hidden_act] + + # Ignore copy + def forward(self, x): + down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + return down_proj + + +class ChameleonLayerNorm(nn.LayerNorm): + """ + LayerNorm but computes stats only over the last dim because Chameleon applies gamma and beta + from each shard separately to each head, instead of reducing. We can apply each head's own + gamma/beta by repeat-interleaving weights from each shard, but the stats have to be computed + in the last dimension. This module applies gamma/beta manually to fulfill this requirement. + """ + + def __init__(self, hidden_size, *args, **kwargs): + super().__init__(hidden_size, *args, **kwargs) + self.normalized_shape = (hidden_size[-1],) + + def forward(self, hidden_states): + hidden_states = F.layer_norm(hidden_states, self.normalized_shape, None, None, eps=1e-5) + hidden_states = hidden_states * self.weight + self.bias + return hidden_states + + +# Copied from transformers.models.llama.modeling_llama.repeat_kv +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). 
The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +# Copied from transformers.models.llama.modeling_llama.eager_attention_forward +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + dropout: float = 0.0, + **kwargs: Unpack[TransformersKwargs], +): + key_states = repeat_kv(key, module.num_key_value_groups) + value_states = repeat_kv(value, module.num_key_value_groups) + + attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling + if attention_mask is not None: + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + attn_output = torch.matmul(attn_weights, value_states) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +class ChameleonAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: ChameleonConfig, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " + "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." + ) + + self.attention_dropout = config.attention_dropout + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.is_causal = True + self.model_parallel_size = config.model_parallel_size + self.scaling = self.head_dim**-0.5 + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." 
+ ) + + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias) + self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) + self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) + self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.attention_bias) + self.q_norm = ChameleonLayerNorm((self.num_heads, self.head_dim)) + self.k_norm = ChameleonLayerNorm((self.num_key_value_heads, self.head_dim)) + self._init_rope() + + # copied from transformers.models.llama.modeling_llama.LlamaAttention._init_rope with Llama->Chameleon + # TODO(joao): add me back asap :) + def _init_rope(self): + if self.config.rope_scaling is None: + self.rotary_emb = ChameleonRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + else: + scaling_type = self.config.rope_scaling["type"] + scaling_factor = self.config.rope_scaling["factor"] + if scaling_type == "linear": + self.rotary_emb = ChameleonLinearScalingRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + scaling_factor=scaling_factor, + base=self.rope_theta, + ) + elif scaling_type == "dynamic": + self.rotary_emb = ChameleonDynamicNTKScalingRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + scaling_factor=scaling_factor, + base=self.rope_theta, + ) + else: + raise ValueError(f"Unknown RoPE scaling type {scaling_type}") + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.reshape(-1, self.num_heads, self.head_dim) + query_states = self.q_norm(query_states) + + key_states = key_states.reshape(-1, self.num_key_value_heads, self.head_dim) + key_states = self.k_norm(key_states) + + query_states = query_states.reshape(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.reshape(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + cos, sin = self.rotary_emb(value_states, position_ids) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_values is not None: + # sin and cos are specific to RoPE models; position_ids needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + 
attention_mask, + dropout=0.0 if not self.training else self.attention_dropout, + scaling=self.scaling, + **kwargs, + ) + + attn_output = attn_output.reshape(bsz, q_len, -1).contiguous() + attn_output = self.o_proj(attn_output) + + return attn_output, attn_weights + + +# copied from transformers.models.llama.modeling_llama.LlamaDecoderLayer with Llama->Chameleon, LLAMA->CHAMELEON +class ChameleonDecoderLayer(GradientCheckpointingLayer): + def __init__(self, config: ChameleonConfig, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + + self.self_attn = ChameleonAttention(config=config, layer_idx=layer_idx) + + self.mlp = ChameleonMLP(config) + self.input_layernorm = ChameleonRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = ChameleonRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, + ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): + attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1, + query_sequence_length, key_sequence_length)` if default attention is used. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). 
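For orientation, a shape sketch of the `repeat_kv` expansion defined earlier in this file; the numbers below assume, purely for illustration, 8 key/value heads shared by 32 query heads (with the default Chameleon config, `num_key_value_heads == num_attention_heads`, so the expansion is a no-op):

```python
import torch

batch, kv_heads, seq_len, head_dim, n_rep = 1, 8, 16, 128, 4
kv = torch.randn(batch, kv_heads, seq_len, head_dim)
expanded = kv[:, :, None, :, :].expand(batch, kv_heads, n_rep, seq_len, head_dim)
expanded = expanded.reshape(batch, kv_heads * n_rep, seq_len, head_dim)
print(expanded.shape)   # torch.Size([1, 32, 16, 128])
```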
+ past_key_values (`Cache`, *optional*): cached past key and value projection states + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence + kwargs (`dict`, *optional*): + Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code + into the model + """ + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + return outputs + + +class ChameleonSwinDecoderLayer(GradientCheckpointingLayer): + def __init__(self, config: ChameleonConfig, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + + self.self_attn = ChameleonAttention(config=config, layer_idx=layer_idx) + + self.mlp = ChameleonMLP(config) + self.input_layernorm = ChameleonRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = ChameleonRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, + ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): + input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): + attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1, + query_sequence_length, key_sequence_length)` if default attention is used. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings + past_key_values (`Cache`, *optional*): cached past key and value projection states + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. 
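`ChameleonDecoderLayer` above normalizes the *input* of each sublayer (pre-norm), while `ChameleonSwinDecoderLayer`, whose forward follows, normalizes the sublayer *output* before the residual addition (used when `config.swin_norm=True`). A tiny sketch of the two orderings with stand-in sublayers:

```python
import torch

attn = mlp = lambda t: 2.0 * t                                   # stand-ins for the real sublayers
norm1 = norm2 = lambda t: t / (t.pow(2).mean().sqrt() + 1e-5)    # RMS-style stand-in

x = torch.ones(4)
# Pre-norm (ChameleonDecoderLayer): h = x + attn(norm1(x)); out = h + mlp(norm2(h))
h = x + attn(norm1(x))
pre_norm_out = h + mlp(norm2(h))
# Swin norm (ChameleonSwinDecoderLayer): h = x + norm1(attn(x)); out = h + norm2(mlp(h))
h = x + norm1(attn(x))
swin_norm_out = h + norm2(mlp(h))
print(pre_norm_out, swin_norm_out)
```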
+ """ + + residual = hidden_states + + # Self Attention + hidden_states, self_attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + hidden_states = self.input_layernorm(hidden_states) + hidden_states = residual + hidden_states + # Fully Connected + residual = hidden_states + hidden_states = self.mlp(hidden_states) + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = residual + hidden_states + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + return outputs + + +class ChameleonVQVAEVectorQuantizer(nn.Module): + """ + A module for vector quantization using learned embedding vectors. + + This module implements the quantization process similar to te one described in + the VQ-VAE (Vector Quantized Variational AutoEncoder) paper. It quantizes continuous + input vectors into discrete codebook vectors, which are learned during training. + Current implementation improves over previous ones by avoiding costly matrix multiplications + and allowing for post-hoc remapping of indices. + """ + + def __init__(self, config): + super().__init__() + self.num_embeddings = config.num_embeddings + self.embedding_dim = config.embed_dim + self.beta = getattr(config, "beta", 0.25) + + self.embedding = nn.Embedding(self.num_embeddings, self.embedding_dim) + + def forward(self, hidden_state: torch.Tensor): + hidden_state = hidden_state.permute(0, 2, 3, 1).contiguous() + hidden_state_flattened = hidden_state.view(-1, self.embedding_dim) + + # distances from z to embeddings e_j (z - e)^2 = z^2 + e^2 - 2 e * z + distances = ( + torch.sum(hidden_state_flattened**2, dim=1, keepdim=True) + + torch.sum(self.embedding.weight**2, dim=1) + - 2 * torch.einsum("bd,dn->bn", hidden_state_flattened, self.embedding.weight.transpose(0, 1)) + ) + + min_encoding_indices = torch.argmin(distances, dim=1) + hidden_state_quant = self.embedding(min_encoding_indices).view(hidden_state.shape) + + # compute loss for embedding + loss = torch.mean((hidden_state_quant.detach() - hidden_state) ** 2) + self.beta * torch.mean( + (hidden_state_quant - hidden_state.detach()) ** 2 + ) + + # preserve gradients + hidden_state_quant = hidden_state + (hidden_state_quant - hidden_state).detach() + + # reshape back to match original input shape + hidden_state_quant = hidden_state_quant.permute(0, 3, 1, 2).contiguous() + + return hidden_state_quant, loss, min_encoding_indices + + +class ChameleonVQVAEEncoderConvDownsample(nn.Module): + def __init__(self, in_channels): + super().__init__() + self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0) + + def forward(self, hidden_states): + # no asymmetric padding in torch conv, must do it ourselves + hidden_states = F.pad(hidden_states, pad=(0, 1, 0, 1), mode="constant", value=0) + hidden_states = self.conv(hidden_states) + return hidden_states + + +class ChameleonVQVAEEncoderResnetBlock(nn.Module): + def __init__( + self, + config, + in_channels, + out_channels=None, + conv_shortcut=False, + ): + super().__init__() + self.in_channels = in_channels + self.out_channels = in_channels if out_channels is None else out_channels + self.use_conv_shortcut = conv_shortcut + + self.norm1 = torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True) + self.conv1 = torch.nn.Conv2d(in_channels, 
out_channels, kernel_size=3, stride=1, padding=1) + self.norm2 = torch.nn.GroupNorm(num_groups=32, num_channels=out_channels, eps=1e-6, affine=True) + self.dropout = torch.nn.Dropout(config.dropout) + self.conv2 = torch.nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1) + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + self.conv_shortcut = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) + else: + self.nin_shortcut = torch.nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0) + + def forward(self, hidden_states): + residual = hidden_states + hidden_states = self.norm1(hidden_states) + hidden_states *= torch.sigmoid(hidden_states) + hidden_states = self.conv1(hidden_states) + + hidden_states = self.norm2(hidden_states) + hidden_states *= torch.sigmoid(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.conv2(hidden_states) + + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + residual = self.conv_shortcut(residual) + else: + residual = self.nin_shortcut(residual) + + return residual + hidden_states + + +class ChameleonVQVAEEncoderAttnBlock(nn.Module): + def __init__(self, in_channels): + super().__init__() + self.in_channels = in_channels + + self.norm = torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True) + self.q = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.k = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.v = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.proj_out = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) + + def forward(self, hidden_states): + residual = hidden_states + hidden_states = self.norm(hidden_states) + query_states = self.q(hidden_states) + key_states = self.k(hidden_states) + value_states = self.v(hidden_states) + + # compute attention + batch_size, channels, height, width = query_states.shape + query_states = query_states.reshape(batch_size, channels, height * width).permute(0, 2, 1) + key_states = key_states.reshape(batch_size, channels, height * width) + attn_weights = torch.bmm(query_states, key_states) + attn_weights = attn_weights * (int(channels) ** (-0.5)) + attn_weights = F.softmax(attn_weights, dim=2) + + # attend to values + value_states = value_states.reshape(batch_size, channels, height * width) + attn_weights = attn_weights.permute(0, 2, 1) + attn_output = torch.bmm(value_states, attn_weights).reshape(batch_size, channels, height, width) + + attn_output = self.proj_out(attn_output) + return residual + attn_output + + +class ChameleonVQVAEEncoder(nn.Module): + def __init__(self, config): + super().__init__() + + self.num_resolutions = len(config.channel_multiplier) + self.num_res_blocks = config.num_res_blocks + base_channels = config.base_channels + resolution = config.resolution + in_channels = config.in_channels + double_latent = config.double_latent + latent_channels = config.latent_channels + channel_multiplier = config.channel_multiplier + + self.conv_in = torch.nn.Conv2d(in_channels, base_channels, kernel_size=3, stride=1, padding=1) + + curr_res = resolution + in_channel_multiplier = (1,) + tuple(channel_multiplier) + self.in_channel_multiplier = in_channel_multiplier + self.down = nn.ModuleList() + for i_level in range(self.num_resolutions): + block = nn.ModuleList() + attn = nn.ModuleList() + block_in = base_channels * 
in_channel_multiplier[i_level] + block_out = base_channels * channel_multiplier[i_level] + for i_block in range(self.num_res_blocks): + block.append( + ChameleonVQVAEEncoderResnetBlock( + config=config, + in_channels=block_in, + out_channels=block_out, + ) + ) + block_in = block_out + if ( + config.attn_resolutions is not None + and curr_res in config.attn_resolutions + and config.attn_type == "vanilla" + ): + attn.append(ChameleonVQVAEEncoderAttnBlock(block_in)) + + down = nn.Module() + down.block = block + down.attn = attn + if i_level != self.num_resolutions - 1: + down.downsample = ChameleonVQVAEEncoderConvDownsample(block_in) + curr_res = curr_res // 2 + self.down.append(down) + + self.mid = nn.Module() + self.mid.block_1 = ChameleonVQVAEEncoderResnetBlock( + config=config, + in_channels=block_in, + out_channels=block_in, + ) + self.mid.attn_1 = ChameleonVQVAEEncoderAttnBlock(block_in) if config.attn_type == "vanilla" else nn.Identity() + self.mid.block_2 = ChameleonVQVAEEncoderResnetBlock( + config=config, + in_channels=block_in, + out_channels=block_in, + ) + + self.norm_out = torch.nn.GroupNorm(num_groups=32, num_channels=block_in, eps=1e-6, affine=True) + self.conv_out = torch.nn.Conv2d( + block_in, + 2 * latent_channels if double_latent else latent_channels, + kernel_size=3, + stride=1, + padding=1, + ) + + def forward(self, pixel_values: torch.LongTensor): + # downsampling + hidden_states = [self.conv_in(pixel_values)] + for i_level in range(self.num_resolutions): + for i_block in range(self.num_res_blocks): + hidden_state = self.down[i_level].block[i_block]( + hidden_states[-1], + ) + if len(self.down[i_level].attn) > 0: + hidden_state = self.down[i_level].attn[i_block](hidden_state) + hidden_states.append(hidden_state) + if i_level != self.num_resolutions - 1: + hidden_states.append(self.down[i_level].downsample(hidden_states[-1])) + + # middle + last_hidden_state = hidden_states[-1] + last_hidden_state = self.mid.block_1(last_hidden_state) + last_hidden_state = self.mid.attn_1(last_hidden_state) + last_hidden_state = self.mid.block_2(last_hidden_state) + + # end + last_hidden_state = self.norm_out(last_hidden_state) + last_hidden_state *= torch.sigmoid(last_hidden_state) + last_hidden_state = self.conv_out(last_hidden_state) + return last_hidden_state + + +class ChameleonImageVocabularyMapping: + """ + A class for mapping discrete image tokens from VQGAN to BPE tokens. 
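`ChameleonVQVAEVectorQuantizer` above picks, for every spatial position, the nearest codebook embedding via the expanded identity `(z - e)^2 = z^2 + e^2 - 2*z*e`, which avoids materializing all pairwise differences; a toy sketch with small sizes:

```python
import torch

codebook = torch.randn(8, 4)      # 8 codes, embed_dim=4 (toy sizes)
z = torch.randn(5, 4)             # 5 flattened spatial positions
distances = (z.pow(2).sum(1, keepdim=True)
             + codebook.pow(2).sum(1)
             - 2 * z @ codebook.t())
indices = distances.argmin(dim=1)             # nearest codebook entry per position
quantized = codebook[indices]
print(indices.shape, quantized.shape)         # torch.Size([5]) torch.Size([5, 4])
```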
+ """ + + def __init__(self, vocab_map): + self.vocab_map = vocab_map + self.image_token_id = vocab_map.get("") + + @cached_property + def val2name(self): + return {v: k for k, v in self.vocab_map.items()} + + @cached_property + def image_tokens(self): + return sorted([val for name, val in self.vocab_map.items() if name.startswith("IMGIMG")]) + + @cached_property + def bpe2img(self): + img_tkn_chr_mapping = {chr(ord("A") + i): str(i) for i in range(10)} + + def remap(old_name: str) -> str: + return "".join(img_tkn_chr_mapping.get(c, c) for c in old_name[len("IMGIMG") : -1]) + + return {tok: int(remap(self.val2name[tok])) for tok in self.image_tokens} + + @cached_property + def img2bpe(self): + return {v: k for k, v in self.bpe2img.items()} + + @cached_property + def bpe2img_search_tensors(self): + return torch.tensor(sorted(self.bpe2img.keys())), torch.tensor(sorted(self.bpe2img.values())) + + @cached_property + def img2bpe_mapping_tensor(self): + mapping = torch.zeros(max(self.img2bpe.keys()) + 1, dtype=torch.int) + for k, v in self.img2bpe.items(): + mapping[k] = v + return mapping + + def convert_img2bpe(self, img_batch: torch.Tensor) -> torch.Tensor: + device = img_batch.device + img_tokens = self.img2bpe_mapping_tensor[img_batch.to("cpu")] + return img_tokens.to(device) + + +@auto_docstring +class ChameleonPreTrainedModel(PreTrainedModel): + config: ChameleonConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["ChameleonDecoderLayer", "ChameleonSwinDecoderLayer"] + _skip_keys_device_placement = ["past_key_values", "causal_mask"] + _supports_flash_attn = True + _supports_sdpa = True + + _can_compile_fullgraph = True + _supports_param_buffer_assignment = False + _supports_flex_attn = True + _supports_attention_backend = True + + +@auto_docstring( + custom_intro=""" + The VQ-VAE model used in Chameleon for encoding/decoding images into discrete tokens. + This model follows the "Make-a-scene: Scene-based text-to-image generation with human priors" paper from + [ Oran Gafni, Adam Polyak, Oron Ashual, Shelly Sheynin, Devi Parikh, and Yaniv + Taigman](https://huggingface.co/papers/2203.13131). 
+ """ +) +class ChameleonVQVAE(ChameleonPreTrainedModel): + config: ChameleonVQVAEConfig + _no_split_modules = [ + "ChameleonVQVAEVectorQuantizer", + "ChameleonVQVAEEncoderAttnBlock", + "ChameleonVQVAEEncoderResnetBlock", + ] + + def __init__(self, config: ChameleonVQVAEConfig): + super().__init__(config) + + self.encoder = ChameleonVQVAEEncoder(config) + self.quantize = ChameleonVQVAEVectorQuantizer(config) + self.quant_conv = torch.nn.Conv2d(config.latent_channels, config.embed_dim, 1) + self.post_quant_conv = torch.nn.Conv2d(config.embed_dim, config.latent_channels, 1) + self.eval() # Chameleon's VQ model is frozen + + def encode(self, pixel_values: torch.LongTensor): + hidden_states = self.encoder(pixel_values) + hidden_states = self.quant_conv(hidden_states) + quant, emb_loss, indices = self.quantize(hidden_states) + return quant, emb_loss, indices + + +@auto_docstring +class ChameleonModel(ChameleonPreTrainedModel): + def __init__(self, config: ChameleonConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.vocabulary_mapping = ChameleonImageVocabularyMapping(config.vocabulary_map) + decoder_layer = ChameleonDecoderLayer if not self.config.swin_norm else ChameleonSwinDecoderLayer + self.layers = nn.ModuleList( + [decoder_layer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.norm = ChameleonRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.vqmodel = ChameleonVQVAE._from_config(config.vq_config) + self.gradient_checkpointing = False + + # Initialize weights and apply final processing + self.post_init() + + def get_image_tokens(self, pixel_values: torch.FloatTensor): + """ + Tokenizes images into discrete tokens with VQGAN module. Converts + obtained image tokens into BPE tokens and wraps with "boi" and "eoi" + special tokens. + + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)): + The tensors corresponding to the input images. + """ + batch_size = pixel_values.shape[0] + _, _, image_toks = self.vqmodel.encode(pixel_values) + bpe_toks = self.vocabulary_mapping.convert_img2bpe(image_toks) + bpe_toks = bpe_toks.view(batch_size, -1) + return bpe_toks + + def get_image_features(self, pixel_values: torch.FloatTensor): + """ + Tokenizes images into discrete tokens with VQGAN module and embeds + them with text embeddings layer + + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)): + The tensors corresponding to the input images. + """ + image_tokens = self.get_image_tokens(pixel_values) + vision_embeddings = self.get_input_embeddings()(image_tokens) + return vision_embeddings + + def get_placeholder_mask( + self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor + ): + """ + Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is + equal to the length of multimodal features. If the lengths are different, an error is raised. 
+ """ + if input_ids is None: + special_image_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.vocabulary_mapping.image_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + special_image_mask = special_image_mask.all(-1) + else: + special_image_mask = input_ids == self.vocabulary_mapping.image_token_id + + n_image_tokens = special_image_mask.sum() + special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + n_image_features = image_features.shape[0] * image_features.shape[1] + if inputs_embeds[special_image_mask].numel() != image_features.numel(): + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" + ) + return special_image_mask + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[FlashAttentionKwargs], + ) -> Union[tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.gradient_checkpointing and self.training and use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`." 
+ ) + use_cache = False + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if pixel_values is not None: + image_embeds = self.get_image_features(pixel_values) + special_image_mask = self.get_placeholder_mask( + input_ids, inputs_embeds=inputs_embeds, image_features=image_embeds + ) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_embeds) + + # torch.jit.trace() doesn't support cache objects in the output + if use_cache and past_key_values is None and not torch.jit.is_tracing(): + past_key_values = DynamicCache(config=self.config) + + if cache_position is None: + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + cache_position = torch.arange( + past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device + ) + + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + causal_mask = create_causal_mask( + config=self.config, + input_embeds=inputs_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + past_key_values=past_key_values, + position_ids=position_ids, + ) + + # embed positions + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + + for decoder_layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + layer_outputs = decoder_layer( + hidden_states, + attention_mask=causal_mask, + position_ids=position_ids, + past_key_values=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if not return_dict: + return tuple( + v for v in [hidden_states, past_key_values, all_hidden_states, all_self_attns] if v is not None + ) + + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=past_key_values, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +@auto_docstring( + custom_intro=""" + Chameleon Model with a head on top used for outputting logits for next token prediction. 
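
For reference, a standalone illustration of the placeholder merge that `ChameleonModel.forward` above performs with `masked_scatter`: positions equal to the image-token id are overwritten by image embeddings, in order. All ids and shapes below are made up for readability.

```python
import torch

image_token_id, hidden = 9, 8                       # made-up values
input_ids = torch.tensor([[5, 9, 9, 7]])            # two image placeholders
inputs_embeds = torch.zeros(1, 4, hidden)           # text embeddings (zeros for clarity)
image_features = torch.ones(2, hidden)              # one embedding row per placeholder

mask = (input_ids == image_token_id).unsqueeze(-1).expand_as(inputs_embeds)
merged = inputs_embeds.masked_scatter(mask, image_features)
print(merged[0, :, 0])  # tensor([0., 1., 1., 0.]) -> image rows landed on the placeholders
```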
+ """ +) +class ChameleonForConditionalGeneration(ChameleonPreTrainedModel, GenerationMixin): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.model = ChameleonModel(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_image_tokens(self, pixel_values): + return self.model.get_image_tokens(pixel_values) + + def get_image_features(self, pixel_values): + return self.model.get_image_features(pixel_values) + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> Union[tuple, CausalLMOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Example: + + ```python + >>> from transformers import ChameleonProcessor, ChameleonForConditionalGeneration + >>> import torch + >>> import requests + >>> from PIL import Image + + >>> model = ChameleonForConditionalGeneration.from_pretrained("facebook/chameleon-7b", dtype=torch.bfloat16) + >>> processor = ChameleonProcessor.from_pretrained("facebook/chameleon-7b") + + >>> prompt = "I used to know a lot about constellations when I was younger, but as I grew older, I forgot most of what I knew. These are the only two constellations that I really remember now.I would like for you to tell me about 3 more constellations and give me a little bit of history about the constellation." 
+ >>> image = Image.open(requests.get("https://nineplanets.org/wp-content/uploads/2020/12/the-big-dipper-1.jpg", stream=True).raw) + >>> image_2 = Image.open(requests.get("https://www.kxan.com/wp-content/uploads/sites/40/2020/10/ORION.jpg", stream=True).raw) + + >>> inputs = processor(images=[image, image_2], text=prompt, return_tensors="pt").to(model.device, torch.bfloat16) + + >>> generated_ids = model.generate(**inputs, max_new_tokens=100, do_sample=False) + >>> processor.batch_decode(generated_ids, skip_special_tokens=True)[0] + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + pixel_values=pixel_values, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs[0] + logits = self.lm_head(hidden_states) + + # Disallow image tokens which does not include special begin-image and end-image tokens + image_tokens = self.model.vocabulary_mapping.image_tokens + logits[:, :, image_tokens] = torch.finfo(logits.dtype).min + + loss = None + if labels is not None: + loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs) + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + pixel_values=None, + past_key_values=None, + attention_mask=None, + inputs_embeds=None, + cache_position=None, + position_ids=None, + use_cache=True, + **kwargs, + ): + # Overwritten -- in specific circumstances we don't want to forward image inputs to the model + + model_inputs = super().prepare_inputs_for_generation( + input_ids, + pixel_values=pixel_values, + past_key_values=past_key_values, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + cache_position=cache_position, + position_ids=position_ids, + use_cache=use_cache, + **kwargs, + ) + + if cache_position[0] != 0: + # If we're in cached decoding stage, pixel values should be `None` because input ids do not contain special image token anymore + # Otherwise we need pixel values to be passed to model + model_inputs["pixel_values"] = None + + return model_inputs + + +__all__ = ["ChameleonForConditionalGeneration", "ChameleonModel", "ChameleonPreTrainedModel", "ChameleonVQVAE"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/chameleon/processing_chameleon.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/chameleon/processing_chameleon.py new file mode 100644 index 0000000000000000000000000000000000000000..d481a62b6fc6608bd3088e4766b91ff843680ea0 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/chameleon/processing_chameleon.py @@ -0,0 +1,196 @@ +# coding=utf-8 +# Copyright 2024 Meta Inc. and The HuggingFace Inc. team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Processor class for Chameleon. +""" + +from typing import Optional, Union + +import numpy as np + +from ...feature_extraction_utils import BatchFeature +from ...image_utils import ImageInput +from ...processing_utils import ( + MultiModalData, + ProcessingKwargs, + ProcessorMixin, + TextKwargs, + Unpack, +) +from ...tokenization_utils_base import PreTokenizedInput, TextInput + + +class ChameleonTextKwargs(TextKwargs, total=False): + return_for_text_completion: bool + + +class ChameleonProcessorKwargs(ProcessingKwargs, total=False): + text_kwargs: ChameleonTextKwargs + _defaults = { + "text_kwargs": { + "padding": False, + "return_for_text_completion": False, + "return_mm_token_type_ids": False, + }, + "common_kwargs": { + "return_tensors": "pt", + }, + } + + +class ChameleonProcessor(ProcessorMixin): + r""" + Constructs a Chameleon processor which wraps a Chameleon image processor and a Chameleon tokenizer into a single + processor. + + [`ChameleonProcessor`] offers all the functionalities of [`ChameleonImageProcessor`] and [`LlamaTokenizerFast`]. + See the [`~ChameleonProcessor.__call__`] and [`~ChameleonProcessor.decode`] for more information. + + Args: + image_processor ([`ChameleonImageProcessor`]): + The image processor is a required input. + tokenizer ([`LlamaTokenizerFast`]): + The tokenizer is a required input. + image_seq_length (`int`, *optional*, defaults to 1024): + Sequence length of one image embedding. + image_token (`str`, *optional*, defaults to `""`): + The special token used to indicate image in the text. 
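
A rough, self-contained sketch of the placeholder expansion this processor performs in `__call__` below: each image placeholder in the text is replaced by a begin-image token, `image_seq_length` copies of the image token, and an end-image token. The token strings and the short sequence length here are invented for readability; the real processor uses the tokenizer's special tokens and `image_seq_length=1024`.

```python
# Invented token strings and a short sequence length, for illustration only.
image_token, boi, eoi = "<image>", "<boi>", "<eoi>"
image_seq_length = 4

prompt = f"Describe this picture: {image_token}"
expanded = prompt.replace(image_token, boi + image_token * image_seq_length + eoi)
print(expanded)
# Describe this picture: <boi><image><image><image><image><eoi>
```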
+ """ + + attributes = ["image_processor", "tokenizer"] + tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast") + image_processor_class = "ChameleonImageProcessor" + + def __init__(self, image_processor, tokenizer, image_seq_length: int = 1024, image_token: str = ""): + self.image_seq_length = image_seq_length + self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token + self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token) + self.image_start_token = ( + tokenizer.boi_token if hasattr(tokenizer, "boi_token") else "" + ) # fixed tokens for start and end, so can hardcode + self.image_end_token = tokenizer.eoi_token if hasattr(tokenizer, "eoi_token") else "" + self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token) + self.image_start_token_id = tokenizer.convert_tokens_to_ids(self.image_start_token) + self.image_end_token_id = tokenizer.convert_tokens_to_ids(self.image_end_token) + self.image_ids = [self.image_token_id, self.image_start_token_id, self.image_end_token_id] + + super().__init__(image_processor, tokenizer) + + def __call__( + self, + images: Optional[ImageInput] = None, + text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None, + audio=None, + videos=None, + **kwargs: Unpack[ChameleonProcessorKwargs], + ) -> BatchFeature: + """ + Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` + and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode + the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to + CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring + of the above two methods for more information. + + Args: + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. Both channels-first and channels-last formats are supported. + text (`str`, `list[str]`, `list[list[str]]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + """ + + if isinstance(text, str): + text = [text] + elif not isinstance(text, list) and not isinstance(text[0], str): + raise TypeError("Invalid input text. 
Please provide a string, or a list of strings") + if text is None and images is None: + raise ValueError("You must provide either text or images") + + output_kwargs = self._merge_kwargs( + ChameleonProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + return_for_text_completion = output_kwargs["text_kwargs"].pop("return_for_text_completion", False) + + # Replace the image token with the expanded image token sequence + prompt_strings = [] + one_img_tokens = self.image_start_token + (self.image_token * self.image_seq_length) + self.image_end_token + for sample in text: + sample = sample.replace(self.image_token, one_img_tokens) + if not return_for_text_completion: + sample += self.tokenizer.sep_token # special Chameleon treatment to add sep for chat mode + prompt_strings.append(sample) + + image_inputs = {} + if images is not None: + image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"]) + + return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) + return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", False) + text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"], return_tensors=None) + self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image"]) + + if return_mm_token_type_ids: + array_ids = np.array(text_inputs["input_ids"]) + mm_token_type_ids = np.zeros_like(text_inputs["input_ids"]) + mm_token_type_ids[np.isin(array_ids, self.image_ids)] = 1 + text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist() + + return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors) + + def _get_num_multimodal_tokens(self, image_sizes=None, **kwargs): + """ + Computes the number of placeholder tokens needed for multimodal inputs with the given sizes. + + Args: + image_sizes (`list[list[int]]`, *optional*): + The input sizes formatted as (height, width) per each image. + + Returns: + `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided + input modalities, along with other useful data. + """ + + vision_data = {} + if image_sizes is not None: + # add 2 for BOI and EOI tokens + num_image_tokens = [self.image_seq_length + 2] * len(image_sizes) + num_image_patches = [1] * len(image_sizes) + + vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches}) + + return MultiModalData(**vision_data) + + +__all__ = ["ChameleonProcessor"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/chinese_clip/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/chinese_clip/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9c4476b9080ff514fe273fde33d96b5c0edaba2b --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/chinese_clip/__init__.py @@ -0,0 +1,31 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_chinese_clip import * + from .feature_extraction_chinese_clip import * + from .image_processing_chinese_clip import * + from .image_processing_chinese_clip_fast import * + from .modeling_chinese_clip import * + from .processing_chinese_clip import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/chinese_clip/configuration_chinese_clip.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/chinese_clip/configuration_chinese_clip.py new file mode 100644 index 0000000000000000000000000000000000000000..c628107048b9b7651edf1bd3ac28ccffc6e29dbc --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/chinese_clip/configuration_chinese_clip.py @@ -0,0 +1,423 @@ +# coding=utf-8 +# Copyright 2022 The OFA-Sys Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Chinese-CLIP model configuration""" + +from collections import OrderedDict +from collections.abc import Mapping +from typing import TYPE_CHECKING, Any, Optional + + +if TYPE_CHECKING: + from ...processing_utils import ProcessorMixin + from ...utils import TensorType + +from ...configuration_utils import PretrainedConfig +from ...onnx import OnnxConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class ChineseCLIPTextConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`ChineseCLIPModel`]. It is used to instantiate a + Chinese CLIP model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the Chinese CLIP + [OFA-Sys/chinese-clip-vit-base-patch16](https: + //huggingface.co/OFA-Sys/chinese-clip-vit-base-patch16) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the CHINESE_CLIP model. Defines the number of different tokens that can be represented + by the `inputs_ids` passed when calling [`ChineseCLIPModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. 
+ intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling [`ChineseCLIPModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + initializer_factor (`float`, *optional*, defaults to 1.0): + A factor for initializing all weight matrices (should be kept to 1, used internally for initialization + testing). + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + pad_token_id (`int`, *optional*, defaults to 0): + Padding token id. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For + positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to + [Self-Attention with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155). + For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models + with Better Relative Position Embeddings (Huang et al.)](https://huggingface.co/papers/2009.13658). + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. 
+ + Example: + + ```python + >>> from transformers import ChineseCLIPTextConfig, ChineseCLIPTextModel + + >>> # Initializing a ChineseCLIPTextConfig with OFA-Sys/chinese-clip-vit-base-patch16 style configuration + >>> configuration = ChineseCLIPTextConfig() + + >>> # Initializing a ChineseCLIPTextModel (with random weights) from the OFA-Sys/chinese-clip-vit-base-patch16 style configuration + >>> model = ChineseCLIPTextModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "chinese_clip_text_model" + base_config_key = "text_config" + + def __init__( + self, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + initializer_factor=1.0, + layer_norm_eps=1e-12, + pad_token_id=0, + position_embedding_type="absolute", + use_cache=True, + **kwargs, + ): + super().__init__(pad_token_id=pad_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.initializer_factor = initializer_factor + self.layer_norm_eps = layer_norm_eps + self.position_embedding_type = position_embedding_type + self.use_cache = use_cache + + +class ChineseCLIPVisionConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`ChineseCLIPModel`]. It is used to instantiate an + ChineseCLIP model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the ChineseCLIP + [OFA-Sys/chinese-clip-vit-base-patch16](https://huggingface.co/OFA-Sys/chinese-clip-vit-base-patch16) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + projection_dim (`int`, *optional*, defaults to 512): + Dimensionality of text and vision projection layers. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 32): + The size (resolution) of each patch. + hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. 
If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported. + layer_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the layer normalization layers. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + initializer_factor (`float`, *optional*, defaults to 1.0): + A factor for initializing all weight matrices (should be kept to 1, used internally for initialization + testing). + Example: + ```python + >>> from transformers import ChineseCLIPVisionConfig, ChineseCLIPVisionModel + + >>> # Initializing a ChineseCLIPVisionConfig with OFA-Sys/chinese-clip-vit-base-patch16 style configuration + >>> configuration = ChineseCLIPVisionConfig() + + >>> # Initializing a ChineseCLIPVisionModel (with random weights) from the OFA-Sys/chinese-clip-vit-base-patch16 style configuration + >>> model = ChineseCLIPVisionModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "chinese_clip_vision_model" + base_config_key = "vision_config" + + def __init__( + self, + hidden_size=768, + intermediate_size=3072, + projection_dim=512, + num_hidden_layers=12, + num_attention_heads=12, + num_channels=3, + image_size=224, + patch_size=32, + hidden_act="quick_gelu", + layer_norm_eps=1e-5, + attention_dropout=0.0, + initializer_range=0.02, + initializer_factor=1.0, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.projection_dim = projection_dim + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_channels = num_channels + self.patch_size = patch_size + self.image_size = image_size + self.initializer_range = initializer_range + self.initializer_factor = initializer_factor + self.attention_dropout = attention_dropout + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + + +class ChineseCLIPConfig(PretrainedConfig): + r""" + [`ChineseCLIPConfig`] is the configuration class to store the configuration of a [`ChineseCLIPModel`]. It is used + to instantiate Chinese-CLIP model according to the specified arguments, defining the text model and vision model + configs. Instantiating a configuration with the defaults will yield a similar configuration to that of the + Chinese-CLIP [OFA-Sys/chinese-clip-vit-base-patch16](https://huggingface.co/OFA-Sys/chinese-clip-vit-base-patch16) + architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + text_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`ChineseCLIPTextConfig`]. + vision_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`ChineseCLIPVisionConfig`]. + projection_dim (`int`, *optional*, defaults to 512): + Dimensionality of text and vision projection layers. + logit_scale_init_value (`float`, *optional*, defaults to 2.6592): + The initial value of the *logit_scale* parameter. Default is used as per the original ChineseCLIP + implementation. + kwargs (*optional*): + Dictionary of keyword arguments. 
+ + Example: + + ```python + >>> from transformers import ChineseCLIPConfig, ChineseCLIPModel + + >>> # Initializing a ChineseCLIPConfig with OFA-Sys/chinese-clip-vit-base-patch16 style configuration + >>> configuration = ChineseCLIPConfig() + + >>> # Initializing a ChineseCLIPModel (with random weights) from the OFA-Sys/chinese-clip-vit-base-patch16 style configuration + >>> model = ChineseCLIPModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + + >>> # We can also initialize a ChineseCLIPConfig from a ChineseCLIPTextConfig and a ChineseCLIPVisionConfig + + >>> # Initializing a ChineseCLIPTextConfig and ChineseCLIPVisionConfig configuration + >>> config_text = ChineseCLIPTextConfig() + >>> config_vision = ChineseCLIPVisionConfig() + + >>> config = ChineseCLIPConfig.from_text_vision_configs(config_text, config_vision) + ```""" + + model_type = "chinese_clip" + sub_configs = {"text_config": ChineseCLIPTextConfig, "vision_config": ChineseCLIPVisionConfig} + + def __init__( + self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs + ): + # If `_config_dict` exist, we use them for the backward compatibility. + # We pop out these 2 attributes before calling `super().__init__` to avoid them being saved (which causes a lot + # of confusion!). + text_config_dict = kwargs.pop("text_config_dict", None) + vision_config_dict = kwargs.pop("vision_config_dict", None) + + super().__init__(**kwargs) + + # Instead of simply assigning `[text|vision]_config_dict` to `[text|vision]_config`, we use the values in + # `[text|vision]_config_dict` to update the values in `[text|vision]_config`. The values should be same in most + # cases, but we don't want to break anything regarding `_config_dict` that existed before commit `8827e1b2`. + if text_config_dict is not None: + if text_config is None: + text_config = {} + + # This is the complete result when using `text_config_dict`. + _text_config_dict = ChineseCLIPTextConfig(**text_config_dict).to_dict() + + # Give a warning if the values exist in both `_text_config_dict` and `text_config` but being different. + for key, value in _text_config_dict.items(): + if key in text_config and value != text_config[key] and key != "transformers_version": + # If specified in `text_config_dict` + if key in text_config_dict: + message = ( + f"`{key}` is found in both `text_config_dict` and `text_config` but with different values. " + f'The value `text_config_dict["{key}"]` will be used instead.' + ) + # If inferred from default argument values (just to be super careful) + else: + message = ( + f"`text_config_dict` is provided which will be used to initialize `ChineseCLIPTextConfig`. " + f'The value `text_config["{key}"]` will be overridden.' + ) + logger.info(message) + + # Update all values in `text_config` with the ones in `_text_config_dict`. + text_config.update(_text_config_dict) + + if vision_config_dict is not None: + if vision_config is None: + vision_config = {} + + # This is the complete result when using `vision_config_dict`. + _vision_config_dict = ChineseCLIPVisionConfig(**vision_config_dict).to_dict() + # convert keys to string instead of integer + if "id2label" in _vision_config_dict: + _vision_config_dict["id2label"] = { + str(key): value for key, value in _vision_config_dict["id2label"].items() + } + + # Give a warning if the values exist in both `_vision_config_dict` and `vision_config` but being different. 
+ for key, value in _vision_config_dict.items(): + if key in vision_config and value != vision_config[key] and key != "transformers_version": + # If specified in `vision_config_dict` + if key in vision_config_dict: + message = ( + f"`{key}` is found in both `vision_config_dict` and `vision_config` but with different " + f'values. The value `vision_config_dict["{key}"]` will be used instead.' + ) + # If inferred from default argument values (just to be super careful) + else: + message = ( + f"`vision_config_dict` is provided which will be used to initialize " + f'`ChineseCLIPVisionConfig`. The value `vision_config["{key}"]` will be overridden.' + ) + logger.info(message) + + # Update all values in `vision_config` with the ones in `_vision_config_dict`. + vision_config.update(_vision_config_dict) + + if text_config is None: + text_config = {} + logger.info("`text_config` is `None`. Initializing the `ChineseCLIPTextConfig` with default values.") + + if vision_config is None: + vision_config = {} + logger.info("`vision_config` is `None`. initializing the `ChineseCLIPVisionConfig` with default values.") + + self.text_config = ChineseCLIPTextConfig(**text_config) + self.vision_config = ChineseCLIPVisionConfig(**vision_config) + + self.projection_dim = projection_dim + self.logit_scale_init_value = logit_scale_init_value + self.initializer_factor = 1.0 + self.initializer_range = 0.02 + + +class ChineseCLIPOnnxConfig(OnnxConfig): + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + return OrderedDict( + [ + ("input_ids", {0: "batch", 1: "sequence"}), + ("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}), + ("attention_mask", {0: "batch", 1: "sequence"}), + ] + ) + + @property + def outputs(self) -> Mapping[str, Mapping[int, str]]: + return OrderedDict( + [ + ("logits_per_image", {0: "batch"}), + ("logits_per_text", {0: "batch"}), + ("text_embeds", {0: "batch"}), + ("image_embeds", {0: "batch"}), + ] + ) + + @property + def atol_for_validation(self) -> float: + return 1e-4 + + def generate_dummy_inputs( + self, + processor: "ProcessorMixin", + batch_size: int = -1, + seq_length: int = -1, + framework: Optional["TensorType"] = None, + ) -> Mapping[str, Any]: + text_input_dict = super().generate_dummy_inputs( + processor.tokenizer, batch_size=batch_size, seq_length=seq_length, framework=framework + ) + image_input_dict = super().generate_dummy_inputs( + processor.image_processor, batch_size=batch_size, framework=framework + ) + return {**text_input_dict, **image_input_dict} + + @property + def default_onnx_opset(self) -> int: + return 14 + + +__all__ = ["ChineseCLIPConfig", "ChineseCLIPOnnxConfig", "ChineseCLIPTextConfig", "ChineseCLIPVisionConfig"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/chinese_clip/feature_extraction_chinese_clip.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/chinese_clip/feature_extraction_chinese_clip.py new file mode 100644 index 0000000000000000000000000000000000000000..c4895bb06b510cfeb64294759c31bcc8d0e3d098 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/chinese_clip/feature_extraction_chinese_clip.py @@ -0,0 +1,38 @@ +# coding=utf-8 +# Copyright 2021 The OFA-Sys Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Feature extractor class for Chinese-CLIP.""" + +import warnings + +from ...utils import logging +from ...utils.import_utils import requires +from .image_processing_chinese_clip import ChineseCLIPImageProcessor + + +logger = logging.get_logger(__name__) + + +@requires(backends=("vision",)) +class ChineseCLIPFeatureExtractor(ChineseCLIPImageProcessor): + def __init__(self, *args, **kwargs) -> None: + warnings.warn( + "The class ChineseCLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers." + " Please use ChineseCLIPImageProcessor instead.", + FutureWarning, + ) + super().__init__(*args, **kwargs) + + +__all__ = ["ChineseCLIPFeatureExtractor"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/chinese_clip/image_processing_chinese_clip.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/chinese_clip/image_processing_chinese_clip.py new file mode 100644 index 0000000000000000000000000000000000000000..c55805f28913dca050f48ee24f8fa7050f974513 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/chinese_clip/image_processing_chinese_clip.py @@ -0,0 +1,314 @@ +# coding=utf-8 +# Copyright 2022 The OFA-Sys Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Image processor class for Chinese-CLIP.""" + +from typing import Optional, Union + +import numpy as np + +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ...image_transforms import ( + convert_to_rgb, + get_resize_output_image_size, + resize, + to_channel_dimension_format, +) +from ...image_utils import ( + OPENAI_CLIP_MEAN, + OPENAI_CLIP_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + infer_channel_dimension_format, + is_scaled_image, + make_flat_list_of_images, + to_numpy_array, + valid_images, + validate_preprocess_arguments, +) +from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging + + +if is_vision_available(): + import PIL + + +from ...utils.import_utils import requires + + +logger = logging.get_logger(__name__) + + +@requires(backends=("vision",)) +class ChineseCLIPImageProcessor(BaseImageProcessor): + r""" + Constructs a Chinese-CLIP image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by + `do_resize` in the `preprocess` method. + size (`dict[str, int]` *optional*, defaults to `{"shortest_edge": 224}`): + Size of the image after resizing. 
The shortest edge of the image is resized to size["shortest_edge"], with + the longest edge resized to keep the input aspect ratio. Can be overridden by `size` in the `preprocess` + method. + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): + Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method. + do_center_crop (`bool`, *optional*, defaults to `True`): + Whether to center crop the image to the specified `crop_size`. Can be overridden by `do_center_crop` in the + `preprocess` method. + crop_size (`dict[str, int]` *optional*, defaults to 224): + Size of the output image after applying `center_crop`. Can be overridden by `crop_size` in the `preprocess` + method. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in + the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess` + method. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method. + image_mean (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. + Can be overridden by the `image_std` parameter in the `preprocess` method. + do_convert_rgb (`bool`, *optional*, defaults to `True`): + Whether to convert the image to RGB. 
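
The pipeline described above (resize to the shortest edge, center crop, rescale, normalize) can be exercised directly. A hedged usage sketch follows, driven with a random dummy image rather than a real photo; it assumes `transformers` is installed with the vision extras, and the output shape follows from the 224-pixel resize and 224x224 center crop defaults.

```python
# Minimal sketch: run the default Chinese-CLIP image pipeline on a dummy image.
import numpy as np
from transformers import ChineseCLIPImageProcessor

processor = ChineseCLIPImageProcessor()            # shortest_edge=224, center crop 224x224
dummy = np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8)

batch = processor(images=dummy, return_tensors="np")
print(batch["pixel_values"].shape)                 # (1, 3, 224, 224)
```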
+ """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: Optional[dict[str, int]] = None, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_center_crop: bool = True, + crop_size: Optional[dict[str, int]] = None, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + do_convert_rgb: bool = True, + **kwargs, + ) -> None: + super().__init__(**kwargs) + size = size if size is not None else {"shortest_edge": 224} + size = get_size_dict(size, default_to_square=False) + crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224} + crop_size = get_size_dict(crop_size) + + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_center_crop = do_center_crop + self.crop_size = crop_size + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN + self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD + self.do_convert_rgb = do_convert_rgb + + def resize( + self, + image: np.ndarray, + size: dict[str, int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize an image. The shortest edge of the image is resized to size["shortest_edge"], with the longest edge + resized to keep the input aspect ratio. + + Args: + image (`np.ndarray`): + Image to resize. + size (`dict[str, int]`): + Size of the output image. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + Resampling filter to use when resiizing the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred from the input + image. + """ + size = get_size_dict(size, default_to_square=False) + output_size = get_resize_output_image_size( + image, size=(size["height"], size["width"]), default_to_square=False, input_data_format=input_data_format + ) + return resize( + image, + size=output_size, + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + **kwargs, + ) + + @filter_out_non_signature_kwargs() + def preprocess( + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Optional[dict[str, int]] = None, + resample: Optional[PILImageResampling] = None, + do_center_crop: Optional[bool] = None, + crop_size: Optional[int] = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + do_convert_rgb: Optional[bool] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> PIL.Image.Image: + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. 
Expects a single or batch of images with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_rescale=False`. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`dict[str, int]`, *optional*, defaults to `self.size`): + Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with + the longest edge resized to keep the input aspect ratio. + resample (`int`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only + has an effect if `do_resize` is set to `True`. + do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`): + Whether to center crop the image. + crop_size (`dict[str, int]`, *optional*, defaults to `self.crop_size`): + Size of the center crop. Only has an effect if `do_center_crop` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`): + Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`. + image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to + `True`. + do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): + Whether to convert the image to RGB. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. 
+ """ + + do_resize = do_resize if do_resize is not None else self.do_resize + size = size if size is not None else self.size + size = get_size_dict(size, default_to_square=False) + resample = resample if resample is not None else self.resample + do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop + crop_size = crop_size if crop_size is not None else self.crop_size + crop_size = get_size_dict(crop_size) + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + + images = make_flat_list_of_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + validate_preprocess_arguments( + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_center_crop=do_center_crop, + crop_size=crop_size, + do_resize=do_resize, + size=size, + resample=resample, + ) + if do_convert_rgb: + images = [convert_to_rgb(image) for image in images] + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if do_rescale and is_scaled_image(images[0]): + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. + input_data_format = infer_channel_dimension_format(images[0]) + + all_images = [] + for image in images: + if do_resize: + image = self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format) + + if do_center_crop: + image = self.center_crop(image=image, size=crop_size, input_data_format=input_data_format) + + if do_rescale: + image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) + + if do_normalize: + image = self.normalize( + image=image, mean=image_mean, std=image_std, input_data_format=input_data_format + ) + + all_images.append(image) + images = [ + to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) + for image in all_images + ] + + data = {"pixel_values": images} + return BatchFeature(data=data, tensor_type=return_tensors) + + +__all__ = ["ChineseCLIPImageProcessor"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/chinese_clip/image_processing_chinese_clip_fast.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/chinese_clip/image_processing_chinese_clip_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..6ced07e0c29e523ce3be8775d16a8e1858887604 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/chinese_clip/image_processing_chinese_clip_fast.py @@ -0,0 +1,37 @@ +# coding=utf-8 +# Copyright 2025 The OFA-Sys Team Authors and The HuggingFace Team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Fast Image processor class for Chinese-CLIP.""" + +from ...image_processing_utils_fast import BaseImageProcessorFast +from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, PILImageResampling +from ...utils import auto_docstring + + +@auto_docstring +class ChineseCLIPImageProcessorFast(BaseImageProcessorFast): + resample = PILImageResampling.BICUBIC + image_mean = OPENAI_CLIP_MEAN + image_std = OPENAI_CLIP_STD + size = {"shortest_edge": 224} + default_to_square = False + crop_size = {"height": 224, "width": 224} + do_resize = True + do_center_crop = True + do_rescale = True + do_normalize = True + do_convert_rgb = True + + +__all__ = ["ChineseCLIPImageProcessorFast"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/chinese_clip/modeling_chinese_clip.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/chinese_clip/modeling_chinese_clip.py new file mode 100644 index 0000000000000000000000000000000000000000..a0b461ab3ed37445c0360b82cd20f9d76a7e48bb --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/chinese_clip/modeling_chinese_clip.py @@ -0,0 +1,1213 @@ +# coding=utf-8 +# Copyright 2022 The OFA-Sys Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""PyTorch Chinese-CLIP model.""" + +from dataclasses import dataclass +from typing import Any, Callable, Optional, Union + +import torch +from torch import nn + +from ...activations import ACT2FN +from ...cache_utils import Cache +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPooling, + BaseModelOutputWithPoolingAndCrossAttentions, +) +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import ModelOutput, auto_docstring, can_return_tuple, filter_out_non_signature_kwargs, logging, torch_int +from .configuration_chinese_clip import ChineseCLIPConfig, ChineseCLIPTextConfig, ChineseCLIPVisionConfig + + +logger = logging.get_logger(__name__) + + +# https://sachinruk.github.io/blog/pytorch/pytorch%20lightning/loss%20function/gpu/2021/03/07/CLIP.html +# Copied from transformers.models.clip.modeling_clip.contrastive_loss +def contrastive_loss(logits: torch.Tensor) -> torch.Tensor: + return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device)) + + +def chinese_clip_loss(similarity: torch.Tensor) -> torch.Tensor: + caption_loss = contrastive_loss(similarity) + image_loss = contrastive_loss(similarity.t()) + return (caption_loss + image_loss) / 2.0 + + +@dataclass +@auto_docstring +class ChineseCLIPOutput(ModelOutput): + r""" + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`): + Contrastive loss for image-text similarity. + logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`): + The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text + similarity scores. + logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`): + The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image + similarity scores. + text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`): + The text embeddings obtained by applying the projection layer to the pooled output of + [`ChineseCLIPTextModel`]. + image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`): + The image embeddings obtained by applying the projection layer to the pooled output of + [`ChineseCLIPVisionModel`]. + text_model_output (`BaseModelOutputWithPoolingAndCrossAttentions`): + The output of the [`ChineseCLIPTextModel`]. + vision_model_output (`BaseModelOutputWithPoolingAndCrossAttentions`): + The output of the [`ChineseCLIPVisionModel`]. 
+ """ + + loss: Optional[torch.FloatTensor] = None + logits_per_image: Optional[torch.FloatTensor] = None + logits_per_text: Optional[torch.FloatTensor] = None + text_embeds: Optional[torch.FloatTensor] = None + image_embeds: Optional[torch.FloatTensor] = None + text_model_output: BaseModelOutputWithPoolingAndCrossAttentions = None + vision_model_output: BaseModelOutputWithPoolingAndCrossAttentions = None + + def to_tuple(self) -> tuple[Any]: + return tuple( + self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple() + for k in self.keys() + ) + + +# Copied from transformers.models.align.modeling_align.AlignTextEmbeddings with Align->ChineseCLIP +class ChineseCLIPTextEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) + self.register_buffer( + "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False + ) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + ) -> torch.Tensor: + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, :seq_length] + + # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs + # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves + # issue #5664 + if token_type_ids is None: + if hasattr(self, "token_type_ids"): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +# Copied from transformers.models.clip.modeling_clip.CLIPVisionEmbeddings with CLIP->ChineseCLIP +class 
ChineseCLIPVisionEmbeddings(nn.Module): + def __init__(self, config: ChineseCLIPVisionConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.class_embedding = nn.Parameter(torch.randn(self.embed_dim)) + + self.patch_embedding = nn.Conv2d( + in_channels=config.num_channels, + out_channels=self.embed_dim, + kernel_size=self.patch_size, + stride=self.patch_size, + bias=False, + ) + + self.num_patches = (self.image_size // self.patch_size) ** 2 + self.num_positions = self.num_patches + 1 + self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) + self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False) + + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution + images. This method is also adapted to support torch.jit tracing. + + Adapted from: + - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and + - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 + """ + + num_patches = embeddings.shape[1] - 1 + position_embedding = self.position_embedding.weight.unsqueeze(0) + num_positions = position_embedding.shape[1] - 1 + + # always interpolate when tracing to ensure the exported model works for dynamic input shapes + if not torch.jit.is_tracing() and num_patches == num_positions and height == width: + return self.position_embedding(self.position_ids) + + class_pos_embed = position_embedding[:, :1] + patch_pos_embed = position_embedding[:, 1:] + + dim = embeddings.shape[-1] + + new_height = height // self.patch_size + new_width = width // self.patch_size + + sqrt_num_positions = torch_int(num_positions**0.5) + patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed, + size=(new_height, new_width), + mode="bicubic", + align_corners=False, + ) + + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + + return torch.cat((class_pos_embed, patch_pos_embed), dim=1) + + def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor: + batch_size, _, height, width = pixel_values.shape + if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size): + raise ValueError( + f"Input image size ({height}*{width}) doesn't match model ({self.image_size}*{self.image_size})." 
+ ) + target_dtype = self.patch_embedding.weight.dtype + patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid] + patch_embeds = patch_embeds.flatten(2).transpose(1, 2) + + class_embeds = self.class_embedding.expand(batch_size, 1, -1) + embeddings = torch.cat([class_embeds, patch_embeds], dim=1) + if interpolate_pos_encoding: + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) + else: + embeddings = embeddings + self.position_embedding(self.position_ids) + return embeddings + + +# Copied from transformers.models.align.modeling_align.eager_attention_forward +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + dropout: float = 0.0, + head_mask: Optional[torch.Tensor] = None, + **kwargs, +): + attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling + if attention_mask is not None: + causal_mask = attention_mask[:, :, :, : key.shape[-2]] + attn_weights = attn_weights + causal_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + + if head_mask is not None: + attn_weights = attn_weights * head_mask.view(1, -1, 1, 1) + + attn_output = torch.matmul(attn_weights, value) + attn_output = attn_output.transpose(1, 2).contiguous() + return attn_output, attn_weights + + +# Copied from transformers.models.align.modeling_align.AlignTextSelfAttention with Align->ChineseCLIP +class ChineseCLIPTextSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + self.config = config + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.attention_dropout = config.attention_probs_dropout_prob + self.scaling = self.attention_head_size**-0.5 + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + **kwargs, + ) -> tuple[torch.Tensor]: + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, self.attention_head_size) + + query_states = self.query(hidden_states).view(hidden_shape).transpose(1, 2) + key_states = self.key(hidden_states).view(hidden_shape).transpose(1, 2) + value_states = self.value(hidden_states).view(hidden_shape).transpose(1, 2) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=0.0 if not self.training else 
self.attention_dropout, + scaling=self.scaling, + head_mask=head_mask, + **kwargs, + ) + + attn_output = attn_output.reshape(*input_shape, -1).contiguous() + outputs = (attn_output, attn_weights) if output_attentions else (attn_output,) + return outputs + + +# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->ChineseCLIPText +class ChineseCLIPTextSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +# Copied from transformers.models.align.modeling_align.AlignTextAttention with Align->ChineseCLIP +class ChineseCLIPTextAttention(nn.Module): + def __init__(self, config): + super().__init__() + self.self = ChineseCLIPTextSelfAttention(config) + self.output = ChineseCLIPTextSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + **kwargs, + ) -> tuple[torch.Tensor]: + self_outputs = self.self( + hidden_states, + attention_mask=attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + **kwargs, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class ChineseCLIPVisionAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." 
+ ) + self.scale = self.head_dim**-0.5 + self.dropout = config.attention_dropout + + self.k_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.v_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.q_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.out_proj = nn.Linear(self.embed_dim, self.embed_dim) + + def forward( + self, hidden_states: torch.Tensor, output_attentions: Optional[bool] = False, **kwargs + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, self.head_dim) + + query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2) * self.scale + key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2) + value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + None, + dropout=0.0 if not self.training else self.dropout, + scaling=1.0, + **kwargs, + ) + + attn_output = attn_output.reshape(*input_shape, -1).contiguous() + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights + + +# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->ChineseCLIPText +class ChineseCLIPTextIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->ChineseCLIPText +class ChineseCLIPTextOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->ChineseCLIPVision +class ChineseCLIPVisionMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) + self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +# Copied from transformers.models.align.modeling_align.AlignTextLayer with Align->ChineseCLIP +class ChineseCLIPTextLayer(GradientCheckpointingLayer): + def __init__(self, config): + super().__init__() + 
self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = ChineseCLIPTextAttention(config) + self.intermediate = ChineseCLIPTextIntermediate(config) + self.output = ChineseCLIPTextOutput(config) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + **kwargs, + ) -> tuple[torch.Tensor]: + self_attention_outputs = self.attention( + hidden_states, + attention_mask=attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + **kwargs, + ) + attention_output = self_attention_outputs[0] + + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + outputs = (layer_output,) + outputs + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class ChineseCLIPVisionLayer(GradientCheckpointingLayer): + def __init__(self, config: ChineseCLIPConfig): + super().__init__() + self.embed_dim = config.hidden_size + self.self_attn = ChineseCLIPVisionAttention(config) + self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) + self.mlp = ChineseCLIPVisionMLP(config) + self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + output_attentions: Optional[bool] = False, + ) -> tuple[torch.FloatTensor]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + output_attentions=output_attentions, + ) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +# Copied from transformers.models.bert.modeling_bert.BertPooler with Bert->ChineseCLIPText +class ChineseCLIPTextPooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. 
+ first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +@auto_docstring +class ChineseCLIPPreTrainedModel(PreTrainedModel): + config: ChineseCLIPConfig + base_model_prefix = "chinese_clip" + supports_gradient_checkpointing = True + + def _init_weights(self, module): + """Initialize the weights""" + factor = self.config.initializer_factor + if isinstance(module, ChineseCLIPVisionEmbeddings): + factor = self.config.initializer_factor + nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor) + nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor) + nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor) + elif isinstance(module, ChineseCLIPTextEmbeddings): + nn.init.normal_(module.word_embeddings.weight, mean=0.0, std=self.config.initializer_range) + nn.init.normal_(module.position_embeddings.weight, mean=0.0, std=self.config.initializer_range) + nn.init.normal_(module.token_type_embeddings.weight, mean=0.0, std=self.config.initializer_range) + for embedding in [module.word_embeddings, module.position_embeddings, module.token_type_embeddings]: + if embedding.padding_idx is not None: + embedding.weight.data[embedding.padding_idx].zero_() + elif isinstance(module, ChineseCLIPVisionAttention): + factor = self.config.initializer_factor + in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor + out_proj_std = (module.embed_dim**-0.5) * factor + nn.init.normal_(module.q_proj.weight, std=in_proj_std) + nn.init.normal_(module.k_proj.weight, std=in_proj_std) + nn.init.normal_(module.v_proj.weight, std=in_proj_std) + nn.init.normal_(module.out_proj.weight, std=out_proj_std) + elif isinstance(module, ChineseCLIPVisionMLP): + factor = self.config.initializer_factor + in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor + fc_std = (2 * module.config.hidden_size) ** -0.5 * factor + nn.init.normal_(module.fc1.weight, std=fc_std) + nn.init.normal_(module.fc2.weight, std=in_proj_std) + elif isinstance(module, ChineseCLIPModel): + nn.init.normal_( + module.text_projection.weight, + std=module.text_embed_dim**-0.5 * self.config.initializer_factor, + ) + nn.init.normal_( + module.visual_projection.weight, + std=module.vision_embed_dim**-0.5 * self.config.initializer_factor, + ) + + if isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + + +# Copied from transformers.models.align.modeling_align.AlignTextEncoder with Align->ChineseCLIP +class ChineseCLIPTextEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList([ChineseCLIPTextLayer(config) for i in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + @can_return_tuple + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + return_dict: Optional[bool] = True, + **kwargs, + ) -> Union[tuple[torch.Tensor], BaseModelOutput]: + all_hidden_states = () if 
output_hidden_states else None + all_self_attentions = () if output_attentions else None + + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + + layer_outputs = layer_module( + hidden_states=hidden_states, + attention_mask=attention_mask, + head_mask=layer_head_mask, + output_attentions=output_attentions, + **kwargs, + ) + + hidden_states = layer_outputs[0] + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class ChineseCLIPVisionEncoder(nn.Module): + """ + Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a + [`ChineseCLIPVisionEncoderLayer`]. + + Args: + config: ChineseCLIPConfig + """ + + def __init__(self, config: ChineseCLIPConfig): + super().__init__() + self.config = config + self.layers = nn.ModuleList([ChineseCLIPVisionLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + @can_return_tuple + def forward( + self, + inputs_embeds, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, BaseModelOutput]: + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + hidden_states = inputs_embeds + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + layer_outputs = encoder_layer( + hidden_states, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +class ChineseCLIPVisionTransformer(nn.Module): + def __init__(self, config: ChineseCLIPVisionConfig): + super().__init__() + self.config = config + embed_dim = config.hidden_size + + self.embeddings = ChineseCLIPVisionEmbeddings(config) + self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + self.encoder = ChineseCLIPVisionEncoder(config) + self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + + @can_return_tuple + @auto_docstring + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + return_dict: Optional[bool] = None, + ) -> Union[tuple, BaseModelOutputWithPooling]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) + hidden_states = self.pre_layrnorm(hidden_states) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + ) + + last_hidden_state = encoder_outputs[0] + pooled_output = last_hidden_state[:, 0, :] + pooled_output = self.post_layernorm(pooled_output) + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +@auto_docstring( + custom_intro=""" + The text model from CHINESE_CLIP without any head or projection on top. + """ +) +class ChineseCLIPTextModel(ChineseCLIPPreTrainedModel): + """ + + The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of + cross-attention is added between the self-attention layers, following the architecture described in [Attention is + all you need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, + Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. 
+ + To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration set + to `True`. To be used in a Seq2Seq model, the model needs to be initialized with both the `is_decoder` argument and + `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass. + """ + + config: ChineseCLIPTextConfig + _no_split_modules = ["ChineseCLIPTextEmbeddings"] + + def __init__(self, config, add_pooling_layer=True): + r""" + add_pooling_layer (bool, *optional*, defaults to `True`): + Whether to add a pooling layer + """ + super().__init__(config) + self.config = config + + self.embeddings = ChineseCLIPTextEmbeddings(config) + self.encoder = ChineseCLIPTextEncoder(config) + + self.pooler = ChineseCLIPTextPooler(config) if add_pooling_layer else None + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPooling]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + if attention_mask is None: + attention_mask = torch.ones(((batch_size, seq_length)), device=device) + + if token_type_ids is None: + if hasattr(self.embeddings, "token_type_ids"): + buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length,
to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + ) + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + return BaseModelOutputWithPooling( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +@auto_docstring( + custom_intro=""" + The vision model from CHINESE_CLIP without any head or projection on top. + """ +) +class ChineseCLIPVisionModel(ChineseCLIPPreTrainedModel): + config: ChineseCLIPVisionConfig + main_input_name = "pixel_values" + _no_split_modules = ["ChineseCLIPVisionEmbeddings", "ChineseCLIPVisionAttention"] + + def __init__(self, config: ChineseCLIPVisionConfig): + super().__init__(config) + self.vision_model = ChineseCLIPVisionTransformer(config) + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> nn.Module: + return self.vision_model.embeddings.patch_embedding + + @auto_docstring + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + return_dict: Optional[bool] = None, + ) -> Union[tuple, BaseModelOutputWithPooling]: + r""" + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import CLIPProcessor, ChineseCLIPVisionModel + + >>> model = ChineseCLIPVisionModel.from_pretrained("OFA-Sys/chinese-clip-vit-base-patch16") + >>> processor = CLIPProcessor.from_pretrained("OFA-Sys/chinese-clip-vit-base-patch16") + + >>> url = "https://clip-cn-beijing.oss-cn-beijing.aliyuncs.com/pokemon.jpeg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + >>> pooled_output = outputs.pooler_output # pooled CLS states + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + return self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, + return_dict=return_dict, + ) + + +@auto_docstring +class ChineseCLIPModel(ChineseCLIPPreTrainedModel): + config: ChineseCLIPConfig + + def __init__(self, config: ChineseCLIPConfig): + super().__init__(config) + + if not isinstance(config.text_config, ChineseCLIPTextConfig): + raise TypeError( + "config.text_config is expected to be of type ChineseCLIPTextConfig but is of type" + f" {type(config.text_config)}." + ) + + if not isinstance(config.vision_config, ChineseCLIPVisionConfig): + raise TypeError( + "config.vision_config is expected to be of type ChineseCLIPVisionConfig but is of type" + f" {type(config.vision_config)}." 
+ ) + + text_config = config.text_config + vision_config = config.vision_config + # The module using it is not a PreTrainedModel subclass so we need this + vision_config._attn_implementation = config._attn_implementation + + self.projection_dim = config.projection_dim + self.text_embed_dim = text_config.hidden_size + self.vision_embed_dim = vision_config.hidden_size + + self.text_model = ChineseCLIPTextModel(text_config, add_pooling_layer=False) + self.vision_model = ChineseCLIPVisionTransformer(vision_config) + + self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False) + self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False) + self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value)) + + # Initialize weights and apply final processing + self.post_init() + + @filter_out_non_signature_kwargs() + @auto_docstring + def get_text_features( + self, + input_ids: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + ) -> torch.FloatTensor: + r""" + Returns: + text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by + applying the projection layer to the final [CLS] hidden state of Text-Transformer. + + Examples: + + ```python + >>> import torch + >>> from transformers import AutoTokenizer, ChineseCLIPModel + + >>> model = ChineseCLIPModel.from_pretrained("OFA-Sys/chinese-clip-vit-base-patch16") + >>> tokenizer = AutoTokenizer.from_pretrained("OFA-Sys/chinese-clip-vit-base-patch16") + + >>> inputs = tokenizer(["杰尼龟", "妙蛙种子", "小火龙", "皮卡丘"], padding=True, return_tensors="pt") + >>> with torch.inference_mode(): + ... text_features = model.get_text_features(**inputs) + >>> text_features = text_features / text_features.norm(p=2, dim=-1, keepdim=True) + ```""" + text_outputs: BaseModelOutputWithPooling = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + ) + + pooled_output = text_outputs.pooler_output + text_features = self.text_projection(pooled_output) + + return text_features + + @filter_out_non_signature_kwargs() + @auto_docstring + def get_image_features( + self, + pixel_values: torch.FloatTensor, + interpolate_pos_encoding: bool = False, + ) -> torch.FloatTensor: + r""" + Returns: + image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by + applying the projection layer to the final [CLS] hidden state of Vision-Transformer. + + Examples: + + ```python + >>> import torch + >>> from transformers import AutoProcessor, ChineseCLIPModel + >>> from transformers.image_utils import load_image + + >>> model = ChineseCLIPModel.from_pretrained("OFA-Sys/chinese-clip-vit-base-patch16") + >>> processor = AutoProcessor.from_pretrained("OFA-Sys/chinese-clip-vit-base-patch16") + + >>> url = "https://clip-cn-beijing.oss-cn-beijing.aliyuncs.com/pokemon.jpeg" + >>> image = load_image(url) + + >>> inputs = processor(images=image, return_tensors="pt") + + >>> with torch.inference_mode(): + ...
image_features = model.get_image_features(**inputs) + >>> image_features = image_features / image_features.norm(p=2, dim=-1, keepdim=True) + ```""" + vision_outputs: BaseModelOutputWithPooling = self.vision_model( + pixel_values=pixel_values, + interpolate_pos_encoding=interpolate_pos_encoding, + ) + + pooled_output = vision_outputs.pooler_output + image_features = self.visual_projection(pooled_output) + + return image_features + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + return_loss: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + return_dict: Optional[bool] = None, + ) -> Union[tuple, ChineseCLIPOutput]: + r""" + return_loss (`bool`, *optional*): + Whether or not to return the contrastive loss. + + Examples: + + ```python + >>> import torch + >>> from transformers import AutoProcessor, ChineseCLIPModel + >>> from transformers.image_utils import load_image + + >>> model = ChineseCLIPModel.from_pretrained("OFA-Sys/chinese-clip-vit-base-patch16") + >>> processor = AutoProcessor.from_pretrained("OFA-Sys/chinese-clip-vit-base-patch16") + + >>> url = "https://clip-cn-beijing.oss-cn-beijing.aliyuncs.com/pokemon.jpeg" + >>> image = load_image(url) + + >>> inputs = processor(text=["杰尼龟", "妙蛙种子", "小火龙", "皮卡丘"], images=image, return_tensors="pt", padding=True) + + >>> with torch.inference_mode(): + ... outputs = model(**inputs) + >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score + >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities + ```""" + # Use CHINESE_CLIP model's config for some fields (if specified) instead of those of vision & text components. 
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, + return_dict=True, + ) + + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + ) + + image_embeds = vision_outputs[1] + image_embeds = self.visual_projection(image_embeds) + + text_embeds = text_outputs[0][:, 0, :] + text_embeds = self.text_projection(text_embeds) + + # normalized features + image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True) + text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True) + + # cosine similarity as logits + logit_scale = self.logit_scale.exp() + logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale + logits_per_image = logits_per_text.t() + + loss = None + if return_loss: + loss = chinese_clip_loss(logits_per_text) + + return ChineseCLIPOutput( + loss=loss, + logits_per_image=logits_per_image, + logits_per_text=logits_per_text, + text_embeds=text_embeds, + image_embeds=image_embeds, + text_model_output=text_outputs, + vision_model_output=vision_outputs, + ) + + +__all__ = ["ChineseCLIPModel", "ChineseCLIPPreTrainedModel", "ChineseCLIPTextModel", "ChineseCLIPVisionModel"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/chinese_clip/processing_chinese_clip.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/chinese_clip/processing_chinese_clip.py new file mode 100644 index 0000000000000000000000000000000000000000..c6fb9f9df247555b2946cf471c650db2974450df --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/chinese_clip/processing_chinese_clip.py @@ -0,0 +1,67 @@ +# coding=utf-8 +# Copyright 2022 The OFA-Sys Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Image/Text processor class for Chinese-CLIP +""" + +import warnings + +from ...processing_utils import ProcessorMixin + + +class ChineseCLIPProcessor(ProcessorMixin): + r""" + Constructs a Chinese-CLIP processor which wraps a Chinese-CLIP image processor and a Chinese-CLIP tokenizer into a + single processor. + + [`ChineseCLIPProcessor`] offers all the functionalities of [`ChineseCLIPImageProcessor`] and [`BertTokenizerFast`]. + See the [`~ChineseCLIPProcessor.__call__`] and [`~ChineseCLIPProcessor.decode`] for more information. + + Args: + image_processor ([`ChineseCLIPImageProcessor`], *optional*): + The image processor is a required input. 
+ tokenizer ([`BertTokenizerFast`], *optional*): + The tokenizer is a required input. + """ + + attributes = ["image_processor", "tokenizer"] + image_processor_class = ("ChineseCLIPImageProcessor", "ChineseCLIPImageProcessorFast") + tokenizer_class = ("BertTokenizer", "BertTokenizerFast") + + def __init__(self, image_processor=None, tokenizer=None, **kwargs): + feature_extractor = None + if "feature_extractor" in kwargs: + warnings.warn( + "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`" + " instead.", + FutureWarning, + ) + feature_extractor = kwargs.pop("feature_extractor") + + image_processor = image_processor if image_processor is not None else feature_extractor + + super().__init__(image_processor, tokenizer) + self.current_processor = self.image_processor + + @property + def feature_extractor_class(self): + warnings.warn( + "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.", + FutureWarning, + ) + return self.image_processor_class + + +__all__ = ["ChineseCLIPProcessor"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/clip/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/clip/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..18a4db32e9943d78adb459ee9bffeb2222ce4107 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/clip/__init__.py @@ -0,0 +1,35 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_clip import * + from .feature_extraction_clip import * + from .image_processing_clip import * + from .image_processing_clip_fast import * + from .modeling_clip import * + from .modeling_flax_clip import * + from .modeling_tf_clip import * + from .processing_clip import * + from .tokenization_clip import * + from .tokenization_clip_fast import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/clip/configuration_clip.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/clip/configuration_clip.py new file mode 100644 index 0000000000000000000000000000000000000000..e343715e29ee9db64a1c85530e5d7d5e53bbee14 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/clip/configuration_clip.py @@ -0,0 +1,411 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""CLIP model configuration""" + +from collections import OrderedDict +from collections.abc import Mapping +from typing import TYPE_CHECKING, Any, Optional + + +if TYPE_CHECKING: + from ...processing_utils import ProcessorMixin + from ...utils import TensorType + +from ...configuration_utils import PretrainedConfig +from ...onnx import OnnxConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class CLIPTextConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`CLIPTextModel`]. It is used to instantiate a CLIP + text encoder according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the text encoder of the CLIP + [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 49408): + Vocabulary size of the CLIP text model. Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling [`CLIPModel`]. + hidden_size (`int`, *optional*, defaults to 512): + Dimensionality of the encoder layers and the pooler layer. + intermediate_size (`int`, *optional*, defaults to 2048): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + projection_dim (`int`, *optional*, defaults to 512): + Dimensionality of text and vision projection layers. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer encoder. + max_position_embeddings (`int`, *optional*, defaults to 77): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported. + layer_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the layer normalization layers. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + initializer_factor (`float`, *optional*, defaults to 1.0): + A factor for initializing all weight matrices (should be kept to 1, used internally for initialization + testing). + pad_token_id (`int`, *optional*, defaults to 1): + Padding token id. + bos_token_id (`int`, *optional*, defaults to 49406): + Beginning of stream token id. 
+ eos_token_id (`int`, *optional*, defaults to 49407): + End of stream token id. + + Example: + + ```python + >>> from transformers import CLIPTextConfig, CLIPTextModel + + >>> # Initializing a CLIPTextConfig with openai/clip-vit-base-patch32 style configuration + >>> configuration = CLIPTextConfig() + + >>> # Initializing a CLIPTextModel (with random weights) from the openai/clip-vit-base-patch32 style configuration + >>> model = CLIPTextModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "clip_text_model" + base_config_key = "text_config" + + def __init__( + self, + vocab_size=49408, + hidden_size=512, + intermediate_size=2048, + projection_dim=512, + num_hidden_layers=12, + num_attention_heads=8, + max_position_embeddings=77, + hidden_act="quick_gelu", + layer_norm_eps=1e-5, + attention_dropout=0.0, + initializer_range=0.02, + initializer_factor=1.0, + # This differs from `CLIPTokenizer`'s default and from openai/clip + # See https://github.com/huggingface/transformers/pull/24773#issuecomment-1632287538 + pad_token_id=1, + bos_token_id=49406, + eos_token_id=49407, + **kwargs, + ): + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.projection_dim = projection_dim + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.max_position_embeddings = max_position_embeddings + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.initializer_factor = initializer_factor + self.attention_dropout = attention_dropout + + +class CLIPVisionConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`CLIPVisionModel`]. It is used to instantiate a + CLIP vision encoder according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the vision encoder of the CLIP + [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + projection_dim (`int`, *optional*, defaults to 512): + Dimensionality of text and vision projection layers. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 32): + The size (resolution) of each patch. + hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. 
If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported. + layer_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the layer normalization layers. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + initializer_factor (`float`, *optional*, defaults to 1.0): + A factor for initializing all weight matrices (should be kept to 1, used internally for initialization + testing). + + Example: + + ```python + >>> from transformers import CLIPVisionConfig, CLIPVisionModel + + >>> # Initializing a CLIPVisionConfig with openai/clip-vit-base-patch32 style configuration + >>> configuration = CLIPVisionConfig() + + >>> # Initializing a CLIPVisionModel (with random weights) from the openai/clip-vit-base-patch32 style configuration + >>> model = CLIPVisionModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "clip_vision_model" + base_config_key = "vision_config" + + def __init__( + self, + hidden_size=768, + intermediate_size=3072, + projection_dim=512, + num_hidden_layers=12, + num_attention_heads=12, + num_channels=3, + image_size=224, + patch_size=32, + hidden_act="quick_gelu", + layer_norm_eps=1e-5, + attention_dropout=0.0, + initializer_range=0.02, + initializer_factor=1.0, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.projection_dim = projection_dim + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_channels = num_channels + self.patch_size = patch_size + self.image_size = image_size + self.initializer_range = initializer_range + self.initializer_factor = initializer_factor + self.attention_dropout = attention_dropout + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + + +class CLIPConfig(PretrainedConfig): + r""" + [`CLIPConfig`] is the configuration class to store the configuration of a [`CLIPModel`]. It is used to instantiate + a CLIP model according to the specified arguments, defining the text model and vision model configs. Instantiating + a configuration with the defaults will yield a similar configuration to that of the CLIP + [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + text_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`CLIPTextConfig`]. + vision_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`CLIPVisionConfig`]. + projection_dim (`int`, *optional*, defaults to 512): + Dimensionality of text and vision projection layers. + logit_scale_init_value (`float`, *optional*, defaults to 2.6592): + The initial value of the *logit_scale* parameter. Default is used as per the original CLIP implementation. + kwargs (*optional*): + Dictionary of keyword arguments. 
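    A side note on the default above (illustrative only, not part of the upstream docstring): `logit_scale_init_value=2.6592` is, to good approximation, `ln(1 / 0.07)`, i.e. the log of the inverse softmax temperature used by the original CLIP implementation:

    ```python
    import math

    # 2.6592 is approximately ln(1 / 0.07); exp(2.6592) is approximately 14.28,
    # i.e. an initial temperature of about 0.07, as in the original CLIP.
    print(math.log(1 / 0.07))  # 2.6592600369327783
    print(math.exp(2.6592))    # ~14.28
    ```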
+ + Example: + + ```python + >>> from transformers import CLIPConfig, CLIPModel + + >>> # Initializing a CLIPConfig with openai/clip-vit-base-patch32 style configuration + >>> configuration = CLIPConfig() + + >>> # Initializing a CLIPModel (with random weights) from the openai/clip-vit-base-patch32 style configuration + >>> model = CLIPModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + + >>> # We can also initialize a CLIPConfig from a CLIPTextConfig and a CLIPVisionConfig + >>> from transformers import CLIPTextConfig, CLIPVisionConfig + + >>> # Initializing a CLIPText and CLIPVision configuration + >>> config_text = CLIPTextConfig() + >>> config_vision = CLIPVisionConfig() + + >>> config = CLIPConfig.from_text_vision_configs(config_text, config_vision) + ```""" + + model_type = "clip" + sub_configs = {"text_config": CLIPTextConfig, "vision_config": CLIPVisionConfig} + + def __init__( + self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs + ): + # If `_config_dict` exist, we use them for the backward compatibility. + # We pop out these 2 attributes before calling `super().__init__` to avoid them being saved (which causes a lot + # of confusion!). + text_config_dict = kwargs.pop("text_config_dict", None) + vision_config_dict = kwargs.pop("vision_config_dict", None) + + super().__init__(**kwargs) + + # Instead of simply assigning `[text|vision]_config_dict` to `[text|vision]_config`, we use the values in + # `[text|vision]_config_dict` to update the values in `[text|vision]_config`. The values should be same in most + # cases, but we don't want to break anything regarding `_config_dict` that existed before commit `8827e1b2`. + if text_config_dict is not None: + if text_config is None: + text_config = {} + + # This is the complete result when using `text_config_dict`. + _text_config_dict = CLIPTextConfig(**text_config_dict).to_dict() + + # Give a warning if the values exist in both `_text_config_dict` and `text_config` but being different. + for key, value in _text_config_dict.items(): + if key in text_config and value != text_config[key] and key != "transformers_version": + # If specified in `text_config_dict` + if key in text_config_dict: + message = ( + f"`{key}` is found in both `text_config_dict` and `text_config` but with different values. " + f'The value `text_config_dict["{key}"]` will be used instead.' + ) + # If inferred from default argument values (just to be super careful) + else: + message = ( + f"`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The " + f'value `text_config["{key}"]` will be overridden.' + ) + logger.info(message) + + # Update all values in `text_config` with the ones in `_text_config_dict`. + text_config.update(_text_config_dict) + + if vision_config_dict is not None: + if vision_config is None: + vision_config = {} + + # This is the complete result when using `vision_config_dict`. + _vision_config_dict = CLIPVisionConfig(**vision_config_dict).to_dict() + # convert keys to string instead of integer + if "id2label" in _vision_config_dict: + _vision_config_dict["id2label"] = { + str(key): value for key, value in _vision_config_dict["id2label"].items() + } + + # Give a warning if the values exist in both `_vision_config_dict` and `vision_config` but being different. 
+ for key, value in _vision_config_dict.items(): + if key in vision_config and value != vision_config[key] and key != "transformers_version": + # If specified in `vision_config_dict` + if key in vision_config_dict: + message = ( + f"`{key}` is found in both `vision_config_dict` and `vision_config` but with different " + f'values. The value `vision_config_dict["{key}"]` will be used instead.' + ) + # If inferred from default argument values (just to be super careful) + else: + message = ( + f"`vision_config_dict` is provided which will be used to initialize `CLIPVisionConfig`. " + f'The value `vision_config["{key}"]` will be overridden.' + ) + logger.info(message) + + # Update all values in `vision_config` with the ones in `_vision_config_dict`. + vision_config.update(_vision_config_dict) + + if text_config is None: + text_config = {} + logger.info("`text_config` is `None`. Initializing the `CLIPTextConfig` with default values.") + + if vision_config is None: + vision_config = {} + logger.info("`vision_config` is `None`. initializing the `CLIPVisionConfig` with default values.") + + self.text_config = CLIPTextConfig(**text_config) + self.vision_config = CLIPVisionConfig(**vision_config) + + self.projection_dim = projection_dim + self.logit_scale_init_value = logit_scale_init_value + self.initializer_factor = 1.0 + + +class CLIPOnnxConfig(OnnxConfig): + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + return OrderedDict( + [ + ("input_ids", {0: "batch", 1: "sequence"}), + ("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}), + ("attention_mask", {0: "batch", 1: "sequence"}), + ] + ) + + @property + def outputs(self) -> Mapping[str, Mapping[int, str]]: + return OrderedDict( + [ + ("logits_per_image", {0: "batch"}), + ("logits_per_text", {0: "batch"}), + ("text_embeds", {0: "batch"}), + ("image_embeds", {0: "batch"}), + ] + ) + + @property + def atol_for_validation(self) -> float: + return 1e-4 + + def generate_dummy_inputs( + self, + processor: "ProcessorMixin", + batch_size: int = -1, + seq_length: int = -1, + framework: Optional["TensorType"] = None, + ) -> Mapping[str, Any]: + text_input_dict = super().generate_dummy_inputs( + processor.tokenizer, batch_size=batch_size, seq_length=seq_length, framework=framework + ) + image_input_dict = super().generate_dummy_inputs( + processor.image_processor, batch_size=batch_size, framework=framework + ) + return {**text_input_dict, **image_input_dict} + + @property + def default_onnx_opset(self) -> int: + return 14 + + +__all__ = ["CLIPConfig", "CLIPOnnxConfig", "CLIPTextConfig", "CLIPVisionConfig"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/clip/feature_extraction_clip.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/clip/feature_extraction_clip.py new file mode 100644 index 0000000000000000000000000000000000000000..bed6b59eaa4595a2b7600660e34842ac92bf1784 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/clip/feature_extraction_clip.py @@ -0,0 +1,38 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
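As the comments in `CLIPConfig.__init__` above describe, values passed via the legacy `text_config_dict`/`vision_config_dict` kwargs override the corresponding entries in `text_config`/`vision_config`, and an informational message is logged about the override. A minimal sketch of that precedence (assumes `transformers` is installed):

```python
from transformers import CLIPConfig

# The legacy `text_config_dict` wins over `text_config` for overlapping keys;
# an informational log message reports the override.
config = CLIPConfig(
    text_config={"hidden_size": 512},
    text_config_dict={"hidden_size": 256},
)
print(config.text_config.hidden_size)  # 256
```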
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Feature extractor class for CLIP.""" + +import warnings + +from ...utils import logging +from ...utils.import_utils import requires +from .image_processing_clip import CLIPImageProcessor + + +logger = logging.get_logger(__name__) + + +@requires(backends=("vision",)) +class CLIPFeatureExtractor(CLIPImageProcessor): + def __init__(self, *args, **kwargs) -> None: + warnings.warn( + "The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please" + " use CLIPImageProcessor instead.", + FutureWarning, + ) + super().__init__(*args, **kwargs) + + +__all__ = ["CLIPFeatureExtractor"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/clip/image_processing_clip.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/clip/image_processing_clip.py new file mode 100644 index 0000000000000000000000000000000000000000..ea17e4a65ff49fe6e0ce1a9ef8ef1db84ec36fca --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/clip/image_processing_clip.py @@ -0,0 +1,351 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Image processor class for CLIP.""" + +from typing import Optional, Union + +import numpy as np + +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ...image_transforms import ( + convert_to_rgb, + get_resize_output_image_size, + resize, + to_channel_dimension_format, +) +from ...image_utils import ( + OPENAI_CLIP_MEAN, + OPENAI_CLIP_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + infer_channel_dimension_format, + is_scaled_image, + make_flat_list_of_images, + to_numpy_array, + valid_images, + validate_kwargs, + validate_preprocess_arguments, +) +from ...utils import TensorType, is_vision_available, logging +from ...utils.import_utils import requires + + +logger = logging.get_logger(__name__) + + +if is_vision_available(): + import PIL + + +@requires(backends=("vision",)) +class CLIPImageProcessor(BaseImageProcessor): + r""" + Constructs a CLIP image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by + `do_resize` in the `preprocess` method. + size (`dict[str, int]` *optional*, defaults to `{"shortest_edge": 224}`): + Size of the image after resizing. The shortest edge of the image is resized to size["shortest_edge"], with + the longest edge resized to keep the input aspect ratio. 
Can be overridden by `size` in the `preprocess` + method. + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): + Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method. + do_center_crop (`bool`, *optional*, defaults to `True`): + Whether to center crop the image to the specified `crop_size`. Can be overridden by `do_center_crop` in the + `preprocess` method. + crop_size (`dict[str, int]` *optional*, defaults to 224): + Size of the output image after applying `center_crop`. Can be overridden by `crop_size` in the `preprocess` + method. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in + the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess` + method. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method. + image_mean (`float` or `list[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `list[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. + Can be overridden by the `image_std` parameter in the `preprocess` method. + do_convert_rgb (`bool`, *optional*, defaults to `True`): + Whether to convert the image to RGB. 
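    A short usage sketch for the processor described above (assumes `transformers` with the vision extra, `Pillow`, and `torch` for `return_tensors="pt"`; the random image is purely illustrative):

    ```python
    import numpy as np
    from PIL import Image
    from transformers import CLIPImageProcessor

    processor = CLIPImageProcessor()  # shortest_edge=224 resize, 224x224 center crop, CLIP mean/std
    image = Image.fromarray(np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8))

    batch = processor(images=image, return_tensors="pt")
    print(batch["pixel_values"].shape)  # torch.Size([1, 3, 224, 224])
    ```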
+ """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: Optional[dict[str, int]] = None, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_center_crop: bool = True, + crop_size: Optional[dict[str, int]] = None, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + do_convert_rgb: bool = True, + **kwargs, + ) -> None: + super().__init__(**kwargs) + size = size if size is not None else {"shortest_edge": 224} + size = get_size_dict(size, default_to_square=False) + crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224} + crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size") + + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_center_crop = do_center_crop + self.crop_size = crop_size + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN + self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD + self.do_convert_rgb = do_convert_rgb + self._valid_processor_keys = [ + "images", + "do_resize", + "size", + "resample", + "do_center_crop", + "crop_size", + "do_rescale", + "rescale_factor", + "do_normalize", + "image_mean", + "image_std", + "do_convert_rgb", + "return_tensors", + "data_format", + "input_data_format", + ] + + # for backwards compatibility of KOSMOS-2 + if "use_square_size" in kwargs and kwargs["use_square_size"]: + self.size = {"height": size["shortest_edge"], "width": size["shortest_edge"]} + # Let's remove `use_square_size` (as it is removed from #27690), so the future Kosmos-2 image processors + # won't have this attr. being saved. (otherwise, it will enter this if branch while there is no more + # `shortest_edge` key. + delattr(self, "use_square_size") + + def resize( + self, + image: np.ndarray, + size: dict[str, int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize an image. The shortest edge of the image is resized to size["shortest_edge"], with the longest edge + resized to keep the input aspect ratio. + + Args: + image (`np.ndarray`): + Image to resize. + size (`dict[str, int]`): + Size of the output image. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + Resampling filter to use when resiizing the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. 
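        As a concrete illustration of the `shortest_edge` behaviour (a sketch; exact rounding is delegated to `get_resize_output_image_size`): a 480x640 image resized with `{"shortest_edge": 224}` keeps its aspect ratio, so the short side becomes 224 and the long side becomes roughly `640 * 224 / 480 = 298`:

        ```python
        import numpy as np
        from transformers import CLIPImageProcessor

        processor = CLIPImageProcessor()
        image = np.zeros((480, 640, 3), dtype=np.uint8)  # (height, width, channels)

        resized = processor.resize(image, size={"shortest_edge": 224})
        print(resized.shape)  # (224, 298, 3) -- aspect ratio preserved, no cropping yet
        ```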
+ """ + default_to_square = True + if "shortest_edge" in size: + size = size["shortest_edge"] + default_to_square = False + elif "height" in size and "width" in size: + size = (size["height"], size["width"]) + else: + raise ValueError("Size must contain either 'shortest_edge' or 'height' and 'width'.") + + output_size = get_resize_output_image_size( + image, + size=size, + default_to_square=default_to_square, + input_data_format=input_data_format, + ) + return resize( + image, + size=output_size, + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + **kwargs, + ) + + def preprocess( + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Optional[dict[str, int]] = None, + resample: Optional[PILImageResampling] = None, + do_center_crop: Optional[bool] = None, + crop_size: Optional[int] = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + do_convert_rgb: Optional[bool] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> PIL.Image.Image: + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_rescale=False`. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`dict[str, int]`, *optional*, defaults to `self.size`): + Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with + the longest edge resized to keep the input aspect ratio. + resample (`int`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only + has an effect if `do_resize` is set to `True`. + do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`): + Whether to center crop the image. + crop_size (`dict[str, int]`, *optional*, defaults to `self.crop_size`): + Size of the center crop. Only has an effect if `do_center_crop` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`): + Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`. + image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to + `True`. + do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): + Whether to convert the image to RGB. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. 
+ - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + """ + do_resize = do_resize if do_resize is not None else self.do_resize + size = size if size is not None else self.size + size = get_size_dict(size, param_name="size", default_to_square=False) + resample = resample if resample is not None else self.resample + do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop + crop_size = crop_size if crop_size is not None else self.crop_size + crop_size = get_size_dict(crop_size, param_name="crop_size", default_to_square=True) + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + + validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_processor_keys) + + images = self.fetch_images(images) + images = make_flat_list_of_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + validate_preprocess_arguments( + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_center_crop=do_center_crop, + crop_size=crop_size, + do_resize=do_resize, + size=size, + resample=resample, + ) + + if do_convert_rgb: + images = [convert_to_rgb(image) for image in images] + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if do_rescale and is_scaled_image(images[0]): + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. 
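+            # e.g. a (height, width, 3) array is inferred as channels_last; a (3, height, width) array as channels_first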
+ input_data_format = infer_channel_dimension_format(images[0]) + + all_images = [] + for image in images: + if do_resize: + image = self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format) + + if do_center_crop: + image = self.center_crop(image=image, size=crop_size, input_data_format=input_data_format) + + if do_rescale: + image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) + + if do_normalize: + image = self.normalize( + image=image, mean=image_mean, std=image_std, input_data_format=input_data_format + ) + + all_images.append(image) + images = [ + to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) + for image in all_images + ] + + data = {"pixel_values": images} + return BatchFeature(data=data, tensor_type=return_tensors) + + +__all__ = ["CLIPImageProcessor"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/clip/image_processing_clip_fast.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/clip/image_processing_clip_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..accd41475dd76344ec25e251270361cbd8a63d14 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/clip/image_processing_clip_fast.py @@ -0,0 +1,39 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Fast Image processor class for CLIP.""" + +from ...image_processing_utils_fast import BaseImageProcessorFast +from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, PILImageResampling +from ...utils import auto_docstring + + +@auto_docstring +class CLIPImageProcessorFast(BaseImageProcessorFast): + # To be checked against the slow image processor + # None values left after checking can be removed + resample = PILImageResampling.BICUBIC + image_mean = OPENAI_CLIP_MEAN + image_std = OPENAI_CLIP_STD + size = {"shortest_edge": 224} + default_to_square = False + crop_size = {"height": 224, "width": 224} + do_resize = True + do_center_crop = True + do_rescale = True + do_normalize = True + do_convert_rgb = True + + +__all__ = ["CLIPImageProcessorFast"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/clip/modeling_clip.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/clip/modeling_clip.py new file mode 100644 index 0000000000000000000000000000000000000000..196381f33bbdf9526dba01dc45a8f409e9c0d248 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/clip/modeling_clip.py @@ -0,0 +1,1240 @@ +# coding=utf-8 +# Copyright 2021 The OpenAI Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch CLIP model.""" + +from dataclasses import dataclass +from typing import Any, Callable, Optional, Union + +import torch +from torch import nn + +from ...activations import ACT2FN +from ...modeling_attn_mask_utils import _create_4d_causal_attention_mask, _prepare_4d_attention_mask +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...utils import ModelOutput, auto_docstring, can_return_tuple, filter_out_non_signature_kwargs, logging, torch_int +from .configuration_clip import CLIPConfig, CLIPTextConfig, CLIPVisionConfig + + +logger = logging.get_logger(__name__) + + +# contrastive loss function, adapted from +# https://sachinruk.github.io/blog/2021-03-07-clip.html +def contrastive_loss(logits: torch.Tensor) -> torch.Tensor: + return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device)) + + +def clip_loss(similarity: torch.Tensor) -> torch.Tensor: + caption_loss = contrastive_loss(similarity) + image_loss = contrastive_loss(similarity.t()) + return (caption_loss + image_loss) / 2.0 + + +def _get_vector_norm(tensor: torch.Tensor) -> torch.Tensor: + """ + This method is equivalent to tensor.norm(p=2, dim=-1, keepdim=True) and used to make + model `executorch` exportable. See issue https://github.com/pytorch/executorch/issues/3566 + """ + square_tensor = torch.pow(tensor, 2) + sum_tensor = torch.sum(square_tensor, dim=-1, keepdim=True) + normed_tensor = torch.pow(sum_tensor, 0.5) + return normed_tensor + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states. + """ +) +class CLIPVisionModelOutput(ModelOutput): + r""" + image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): + The image embeddings obtained by applying the projection layer to the pooler_output. + """ + + image_embeds: Optional[torch.FloatTensor] = None + last_hidden_state: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None + attentions: Optional[tuple[torch.FloatTensor, ...]] = None + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for text model's outputs that also contains a pooling of the last hidden states. + """ +) +class CLIPTextModelOutput(ModelOutput): + r""" + text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): + The text embeddings obtained by applying the projection layer to the pooler_output. 
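A small numeric sketch of the contrastive objective defined at the top of this file: for a batch of matched image/text pairs the similarity matrix should be largest on the diagonal, and `clip_loss` averages the cross-entropy over the text-to-image and image-to-text directions. The two helpers are restated here so the snippet is standalone; the similarity values are illustrative only (assumes `torch`):

```python
import torch
import torch.nn as nn

def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
    return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device))

def clip_loss(similarity: torch.Tensor) -> torch.Tensor:
    return (contrastive_loss(similarity) + contrastive_loss(similarity.t())) / 2.0

# Two matched pairs: strong diagonal similarity gives a small symmetric loss.
similarity = torch.tensor([[5.0, 0.1], [0.2, 4.0]])
print(clip_loss(similarity))  # tensor(0.0144), approximately
```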
+ """ + + text_embeds: Optional[torch.FloatTensor] = None + last_hidden_state: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None + attentions: Optional[tuple[torch.FloatTensor, ...]] = None + + +@dataclass +@auto_docstring +class CLIPOutput(ModelOutput): + r""" + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`): + Contrastive loss for image-text similarity. + logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`): + The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text + similarity scores. + logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`): + The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image + similarity scores. + text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`): + The text embeddings obtained by applying the projection layer to the pooled output of [`CLIPTextModel`]. + image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`): + The image embeddings obtained by applying the projection layer to the pooled output of [`CLIPVisionModel`]. + text_model_output (`BaseModelOutputWithPooling`): + The output of the [`CLIPTextModel`]. + vision_model_output (`BaseModelOutputWithPooling`): + The output of the [`CLIPVisionModel`]. + """ + + loss: Optional[torch.FloatTensor] = None + logits_per_image: Optional[torch.FloatTensor] = None + logits_per_text: Optional[torch.FloatTensor] = None + text_embeds: Optional[torch.FloatTensor] = None + image_embeds: Optional[torch.FloatTensor] = None + text_model_output: BaseModelOutputWithPooling = None + vision_model_output: BaseModelOutputWithPooling = None + + def to_tuple(self) -> tuple[Any]: + return tuple( + self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple() + for k in self.keys() + ) + + +class CLIPVisionEmbeddings(nn.Module): + def __init__(self, config: CLIPVisionConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.class_embedding = nn.Parameter(torch.randn(self.embed_dim)) + + self.patch_embedding = nn.Conv2d( + in_channels=config.num_channels, + out_channels=self.embed_dim, + kernel_size=self.patch_size, + stride=self.patch_size, + bias=False, + ) + + self.num_patches = (self.image_size // self.patch_size) ** 2 + self.num_positions = self.num_patches + 1 + self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) + self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False) + + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution + images. This method is also adapted to support torch.jit tracing. 
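        For the default vision config (`image_size=224`, `patch_size=32`) the embedding module above yields `(224 // 32) ** 2 = 49` patch tokens plus one class token, i.e. 50 positions, and interpolation is only needed when the input resolution differs from 224. A minimal shape check (assumes `torch` and `transformers` are installed):

        ```python
        import torch
        from transformers import CLIPVisionConfig
        from transformers.models.clip.modeling_clip import CLIPVisionEmbeddings

        config = CLIPVisionConfig()  # image_size=224, patch_size=32, hidden_size=768
        embeddings = CLIPVisionEmbeddings(config)

        pixel_values = torch.randn(1, 3, 224, 224)
        print(embeddings(pixel_values).shape)  # torch.Size([1, 50, 768]) -- 49 patches + 1 class token
        ```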
+ + Adapted from: + - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and + - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 + """ + + num_patches = embeddings.shape[1] - 1 + position_embedding = self.position_embedding.weight.unsqueeze(0) + num_positions = position_embedding.shape[1] - 1 + + # always interpolate when tracing to ensure the exported model works for dynamic input shapes + if not torch.jit.is_tracing() and num_patches == num_positions and height == width: + return self.position_embedding(self.position_ids) + + class_pos_embed = position_embedding[:, :1] + patch_pos_embed = position_embedding[:, 1:] + + dim = embeddings.shape[-1] + + new_height = height // self.patch_size + new_width = width // self.patch_size + + sqrt_num_positions = torch_int(num_positions**0.5) + patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed, + size=(new_height, new_width), + mode="bicubic", + align_corners=False, + ) + + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + + return torch.cat((class_pos_embed, patch_pos_embed), dim=1) + + def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor: + batch_size, _, height, width = pixel_values.shape + if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size): + raise ValueError( + f"Input image size ({height}*{width}) doesn't match model ({self.image_size}*{self.image_size})." + ) + target_dtype = self.patch_embedding.weight.dtype + patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid] + patch_embeds = patch_embeds.flatten(2).transpose(1, 2) + + class_embeds = self.class_embedding.expand(batch_size, 1, -1) + embeddings = torch.cat([class_embeds, patch_embeds], dim=1) + if interpolate_pos_encoding: + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) + else: + embeddings = embeddings + self.position_embedding(self.position_ids) + return embeddings + + +class CLIPTextEmbeddings(nn.Module): + def __init__(self, config: CLIPTextConfig): + super().__init__() + embed_dim = config.hidden_size + + self.token_embedding = nn.Embedding(config.vocab_size, embed_dim) + self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + ) -> torch.Tensor: + seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2] + max_position_embedding = self.position_embedding.weight.shape[0] + + if seq_length > max_position_embedding: + raise ValueError( + f"Sequence length must be less than max_position_embeddings (got `sequence length`: " + f"{seq_length} and max_position_embeddings: {max_position_embedding}" + ) + + if position_ids is None: + position_ids = self.position_ids[:, :seq_length] + + if inputs_embeds is None: + inputs_embeds = 
self.token_embedding(input_ids) + + position_embeddings = self.position_embedding(position_ids) + embeddings = inputs_embeds + position_embeddings + + return embeddings + + +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + dropout: float = 0.0, + output_attentions: bool = True, + **kwargs, +): + attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling + if attention_mask is not None: + attn_weights = attn_weights + attention_mask + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + + attn_output = torch.matmul(attn_weights, value) + attn_output = attn_output.transpose(1, 2).contiguous() + if not output_attentions: + attn_weights = None + return attn_output, attn_weights + + +class CLIPAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: Union[CLIPVisionConfig, CLIPTextConfig]): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." + ) + self.scale = self.head_dim**-0.5 + self.dropout = config.attention_dropout + self.is_causal = False + + self.k_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.v_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.q_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.out_proj = nn.Linear(self.embed_dim, self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + causal_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + """Input shape: Batch x Time x Channel""" + + batch_size, seq_length, embed_dim = hidden_states.shape + + queries = self.q_proj(hidden_states) + keys = self.k_proj(hidden_states) + values = self.v_proj(hidden_states) + + queries = queries.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2) + keys = keys.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2) + values = values.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2) + # CLIP text model uses both `causal_attention_mask` and `attention_mask` + # in case FA2 kernel is called, `is_causal` should be inferred from `causal_attention_mask` + if self.config._attn_implementation == "flash_attention_2": + self.is_causal = causal_attention_mask is not None + else: + if attention_mask is not None and causal_attention_mask is not None: + attention_mask = attention_mask + causal_attention_mask + elif causal_attention_mask is not None: + attention_mask = causal_attention_mask + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + queries, + keys, + values, + attention_mask, + is_causal=self.is_causal, + scaling=self.scale, + dropout=0.0 if not self.training else self.dropout, + output_attentions=output_attentions, + ) + + attn_output = 
attn_output.reshape(batch_size, seq_length, embed_dim).contiguous() + attn_output = self.out_proj(attn_output) + + if not output_attentions: + attn_weights = None + return attn_output, attn_weights + + +class CLIPMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) + self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +class CLIPEncoderLayer(GradientCheckpointingLayer): + def __init__(self, config: Union[CLIPVisionConfig, CLIPTextConfig]): + super().__init__() + self.embed_dim = config.hidden_size + self.self_attn = CLIPAttention(config) + self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) + self.mlp = CLIPMLP(config) + self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + causal_attention_mask: torch.Tensor, + output_attentions: Optional[bool] = False, + ) -> tuple[torch.FloatTensor]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + `(config.encoder_attention_heads,)`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. 
+ """ + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + causal_attention_mask=causal_attention_mask, + output_attentions=output_attentions, + ) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +@auto_docstring +class CLIPPreTrainedModel(PreTrainedModel): + config: CLIPConfig + base_model_prefix = "clip" + supports_gradient_checkpointing = True + _supports_sdpa = True + _supports_flash_attn = True + _supports_flex_attn = True + _supports_attention_backend = True + + def _init_weights(self, module): + """Initialize the weights""" + factor = self.config.initializer_factor + if isinstance(module, CLIPTextEmbeddings): + module.token_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02) + module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02) + elif isinstance(module, CLIPVisionEmbeddings): + factor = self.config.initializer_factor + nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor) + nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor) + nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor) + elif isinstance(module, CLIPAttention): + factor = self.config.initializer_factor + in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor + out_proj_std = (module.embed_dim**-0.5) * factor + nn.init.normal_(module.q_proj.weight, std=in_proj_std) + nn.init.normal_(module.k_proj.weight, std=in_proj_std) + nn.init.normal_(module.v_proj.weight, std=in_proj_std) + nn.init.normal_(module.out_proj.weight, std=out_proj_std) + elif isinstance(module, CLIPMLP): + factor = self.config.initializer_factor + in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor + fc_std = (2 * module.config.hidden_size) ** -0.5 * factor + nn.init.normal_(module.fc1.weight, std=fc_std) + nn.init.normal_(module.fc2.weight, std=in_proj_std) + elif isinstance(module, CLIPModel): + nn.init.normal_( + module.text_projection.weight, + std=module.text_embed_dim**-0.5 * self.config.initializer_factor, + ) + nn.init.normal_( + module.visual_projection.weight, + std=module.vision_embed_dim**-0.5 * self.config.initializer_factor, + ) + elif isinstance(module, CLIPVisionModelWithProjection): + nn.init.normal_( + module.visual_projection.weight, + std=self.config.hidden_size**-0.5 * self.config.initializer_factor, + ) + elif isinstance(module, CLIPTextModelWithProjection): + nn.init.normal_( + module.text_projection.weight, + std=self.config.hidden_size**-0.5 * self.config.initializer_factor, + ) + elif isinstance(module, CLIPForImageClassification): + nn.init.normal_( + module.classifier.weight, + std=self.config.vision_config.hidden_size**-0.5 * self.config.initializer_factor, + ) + + if isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + +class CLIPEncoder(nn.Module): + """ + Transformer encoder consisting of `config.num_hidden_layers` self attention layers. 
Each layer is a + [`CLIPEncoderLayer`]. + + Args: + config: CLIPConfig + """ + + def __init__(self, config: CLIPConfig): + super().__init__() + self.config = config + self.layers = nn.ModuleList([CLIPEncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + inputs_embeds, + attention_mask: Optional[torch.Tensor] = None, + causal_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + ) -> BaseModelOutput: + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Causal mask for the text model. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
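        Because the encoder simply stacks `config.num_hidden_layers` identical layers, requesting `output_hidden_states=True` returns the input embeddings plus one tensor per layer (13 entries for the default 12-layer text config). A small sketch (assumes `torch` and `transformers` are installed):

        ```python
        import torch
        from transformers import CLIPTextConfig
        from transformers.models.clip.modeling_clip import CLIPEncoder

        config = CLIPTextConfig()  # num_hidden_layers=12, hidden_size=512
        encoder = CLIPEncoder(config)

        inputs_embeds = torch.randn(1, 77, config.hidden_size)
        outputs = encoder(inputs_embeds=inputs_embeds, output_hidden_states=True)
        print(len(outputs.hidden_states))       # 13 -- embeddings + one per layer
        print(outputs.last_hidden_state.shape)  # torch.Size([1, 77, 512])
        ```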
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + hidden_states = inputs_embeds + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + causal_attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=encoder_states, + attentions=all_attentions, + ) + + +class CLIPTextTransformer(nn.Module): + def __init__(self, config: CLIPTextConfig): + super().__init__() + self.config = config + embed_dim = config.hidden_size + self.embeddings = CLIPTextEmbeddings(config) + self.encoder = CLIPEncoder(config) + self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + + # For `pooled_output` computation + self.eos_token_id = config.eos_token_id + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + ) -> BaseModelOutputWithPooling: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + if input_ids is None: + raise ValueError("You have to specify input_ids") + + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + + hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids) + + # CLIP's text model uses causal mask, prepare it here. + # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324 + causal_attention_mask = _create_4d_causal_attention_mask( + input_shape, hidden_states.dtype, device=hidden_states.device + ) + + # expand attention_mask + if attention_mask is not None and self.config._attn_implementation != "flash_attention_2": + # [batch_size, seq_len] -> [batch_size, 1, tgt_seq_len, src_seq_len] + attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype) + + encoder_outputs: BaseModelOutput = self.encoder( + inputs_embeds=hidden_states, + attention_mask=attention_mask, + causal_attention_mask=causal_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + + last_hidden_state = encoder_outputs.last_hidden_state + last_hidden_state = self.final_layer_norm(last_hidden_state) + + if self.eos_token_id == 2: + # The `eos_token_id` was incorrect before PR #24773: Let's keep what have been done here. 
+ # A CLIP model with such `eos_token_id` in the config can't work correctly with extra new tokens added + # ------------------------------------------------------------ + # text_embeds.shape = [batch_size, sequence_length, transformer.width] + # take features from the eot embedding (eot_token is the highest number in each sequence) + # casting to torch.int for onnx compatibility: argmax doesn't support int64 inputs with opset 14 + pooled_output = last_hidden_state[ + torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device), + input_ids.to(dtype=torch.int, device=last_hidden_state.device).argmax(dim=-1), + ] + else: + # The config gets updated `eos_token_id` from PR #24773 (so the use of extra new tokens is possible) + pooled_output = last_hidden_state[ + torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device), + # We need to get the first position of `eos_token_id` value (`pad_token_ids` might equal to `eos_token_id`) + # Note: we assume each sequence (along batch dim.) contains an `eos_token_id` (e.g. prepared by the tokenizer) + (input_ids.to(dtype=torch.int, device=last_hidden_state.device) == self.eos_token_id) + .int() + .argmax(dim=-1), + ] + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +@auto_docstring( + custom_intro=""" + The text model from CLIP without any head or projection on top. + """ +) +class CLIPTextModel(CLIPPreTrainedModel): + config: CLIPTextConfig + + _no_split_modules = ["CLIPTextEmbeddings", "CLIPEncoderLayer"] + _supports_flash_attn = False # mask creation only accounts for sdpa/eager + + def __init__(self, config: CLIPTextConfig): + super().__init__(config) + self.text_model = CLIPTextTransformer(config) + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> nn.Module: + return self.text_model.embeddings.token_embedding + + def set_input_embeddings(self, value): + self.text_model.embeddings.token_embedding = value + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + ) -> BaseModelOutputWithPooling: + r""" + Examples: + + ```python + >>> from transformers import AutoTokenizer, CLIPTextModel + + >>> model = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32") + >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32") + + >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + >>> pooled_output = outputs.pooler_output # pooled (EOS token) states + ```""" + + return self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + + +class CLIPVisionTransformer(nn.Module): + def __init__(self, config: CLIPVisionConfig): + super().__init__() + self.config = config + embed_dim = config.hidden_size + + self.embeddings = CLIPVisionEmbeddings(config) + self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + self.encoder = CLIPEncoder(config) + self.post_layernorm = nn.LayerNorm(embed_dim, 
eps=config.layer_norm_eps) + + @auto_docstring + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: Optional[bool] = False, + ) -> BaseModelOutputWithPooling: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) + hidden_states = self.pre_layrnorm(hidden_states) + + encoder_outputs: BaseModelOutput = self.encoder( + inputs_embeds=hidden_states, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + + last_hidden_state = encoder_outputs.last_hidden_state + pooled_output = last_hidden_state[:, 0, :] + pooled_output = self.post_layernorm(pooled_output) + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +@auto_docstring( + custom_intro=""" + The vision model from CLIP without any head or projection on top. + """ +) +class CLIPVisionModel(CLIPPreTrainedModel): + config: CLIPVisionConfig + main_input_name = "pixel_values" + _no_split_modules = ["CLIPEncoderLayer"] + + def __init__(self, config: CLIPVisionConfig): + super().__init__(config) + self.vision_model = CLIPVisionTransformer(config) + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> nn.Module: + return self.vision_model.embeddings.patch_embedding + + @can_return_tuple + @auto_docstring + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + ) -> BaseModelOutputWithPooling: + r""" + Example: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, CLIPVisionModel + + >>> model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32") + >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + >>> pooled_output = outputs.pooler_output # pooled CLS states + ```""" + + return self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, + ) + + +@auto_docstring +class CLIPModel(CLIPPreTrainedModel): + config: CLIPConfig + _no_split_modules = ["CLIPTextEmbeddings", "CLIPEncoderLayer", "CLIPVisionEmbeddings"] + _supports_flash_attn = False # mask creation only accounts for sdpa/eager + + def __init__(self, config: CLIPConfig): + super().__init__(config) + + if not isinstance(config.text_config, CLIPTextConfig): + raise TypeError( + "config.text_config is expected to be of type CLIPTextConfig but is of type" + f" {type(config.text_config)}." 
+ ) + + if not isinstance(config.vision_config, CLIPVisionConfig): + raise TypeError( + "config.vision_config is expected to be of type CLIPVisionConfig but is of type" + f" {type(config.vision_config)}." + ) + + text_config = config.text_config + vision_config = config.vision_config + + self.projection_dim = config.projection_dim + self.text_embed_dim = text_config.hidden_size + self.vision_embed_dim = vision_config.hidden_size + + text_model = CLIPTextModel._from_config(text_config) + self.text_model = text_model.text_model + + vision_model = CLIPVisionModel._from_config(vision_config) + self.vision_model = vision_model.vision_model + + self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False) + self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False) + self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value)) + + # Initialize weights and apply final processing + self.post_init() + + @filter_out_non_signature_kwargs() + @auto_docstring + def get_text_features( + self, + input_ids: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + ) -> torch.FloatTensor: + r""" + Returns: + text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by + applying the projection layer to the pooled output of [`CLIPTextModel`]. + + Examples: + + ```python + >>> import torch + >>> from transformers import AutoTokenizer, CLIPModel + + >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32") + >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32") + + >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") + + >>> with torch.inference_mode(): + ... text_features = model.get_text_features(**inputs) + ```""" + text_outputs: BaseModelOutputWithPooling = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + ) + pooled_output = text_outputs.pooler_output + text_features = self.text_projection(pooled_output) + + return text_features + + @filter_out_non_signature_kwargs() + @auto_docstring + def get_image_features( + self, + pixel_values: torch.FloatTensor, + interpolate_pos_encoding: bool = False, + ) -> torch.FloatTensor: + r""" + Returns: + image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by + applying the projection layer to the pooled output of [`CLIPVisionModel`]. + + Examples: + + ```python + >>> import torch + >>> from transformers import AutoProcessor, CLIPModel + >>> from transformers.image_utils import load_image + + >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32") + >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = load_image(url) + + >>> inputs = processor(images=image, return_tensors="pt") + + >>> with torch.inference_mode(): + ... 
image_features = model.get_image_features(**inputs) + ```""" + vision_outputs: BaseModelOutputWithPooling = self.vision_model( + pixel_values=pixel_values, + interpolate_pos_encoding=interpolate_pos_encoding, + ) + pooled_output = vision_outputs.pooler_output + image_features = self.visual_projection(pooled_output) + + return image_features + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + return_loss: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + ) -> CLIPOutput: + r""" + return_loss (`bool`, *optional*): + Whether or not to return the contrastive loss. + + Examples: + + ```python + >>> import torch + >>> from transformers import AutoProcessor, CLIPModel + >>> from transformers.image_utils import load_image + + >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32") + >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = load_image(url) + + >>> inputs = processor( + ... text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True + ... ) + + >>> with torch.inference_mode(): + ... outputs = model(**inputs) + >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score + >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities + ```""" + # Use CLIP model's config for some fields (if specified) instead of those of vision & text components. 
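+        # The remainder of the forward pass follows the standard CLIP recipe: encode both
+        # modalities, project the pooled outputs into the shared embedding space, L2-normalize
+        # each embedding, and scale the text-to-image cosine similarities by the exponentiated
+        # learned logit scale before optionally deriving the symmetric contrastive loss.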
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + vision_outputs: BaseModelOutputWithPooling = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, + ) + + text_outputs: BaseModelOutputWithPooling = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + + image_embeds = vision_outputs.pooler_output + image_embeds = self.visual_projection(image_embeds) + + text_embeds = text_outputs.pooler_output + text_embeds = self.text_projection(text_embeds) + + # normalized features + image_embeds = image_embeds / _get_vector_norm(image_embeds) + text_embeds = text_embeds / _get_vector_norm(text_embeds) + + # cosine similarity as logits + logits_per_text = torch.matmul(text_embeds, image_embeds.t().to(text_embeds.device)) + logits_per_text = logits_per_text * self.logit_scale.exp().to(text_embeds.device) + + logits_per_image = logits_per_text.t() + + loss = None + if return_loss: + loss = clip_loss(logits_per_text) + + return CLIPOutput( + loss=loss, + logits_per_image=logits_per_image, + logits_per_text=logits_per_text, + text_embeds=text_embeds, + image_embeds=image_embeds, + text_model_output=text_outputs, + vision_model_output=vision_outputs, + ) + + +@auto_docstring +class CLIPTextModelWithProjection(CLIPPreTrainedModel): + config: CLIPTextConfig + + _supports_flash_attn = False + _no_split_modules = ["CLIPTextEmbeddings", "CLIPEncoderLayer"] + + def __init__(self, config: CLIPTextConfig): + super().__init__(config) + + text_model = CLIPTextModel._from_config(config) + self.text_model = text_model.text_model + + self.text_projection = nn.Linear(config.hidden_size, config.projection_dim, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> nn.Module: + return self.text_model.embeddings.token_embedding + + def set_input_embeddings(self, value): + self.text_model.embeddings.token_embedding = value + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + ) -> CLIPTextModelOutput: + r""" + Examples: + + ```python + >>> import torch + >>> from transformers import AutoTokenizer, CLIPTextModelWithProjection + + >>> model = CLIPTextModelWithProjection.from_pretrained("openai/clip-vit-base-patch32") + >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32") + + >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") + + >>> with torch.inference_mode(): + ... 
outputs = model(**inputs) + >>> text_embeds = outputs.text_embeds + ```""" + + text_outputs: BaseModelOutputWithPooling = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + pooled_output = text_outputs.pooler_output + text_embeds = self.text_projection(pooled_output) + + return CLIPTextModelOutput( + text_embeds=text_embeds, + last_hidden_state=text_outputs.last_hidden_state, + hidden_states=text_outputs.hidden_states, + attentions=text_outputs.attentions, + ) + + +@auto_docstring +class CLIPVisionModelWithProjection(CLIPPreTrainedModel): + config: CLIPVisionConfig + main_input_name = "pixel_values" + + def __init__(self, config: CLIPVisionConfig): + super().__init__(config) + + vision_model = CLIPVisionModel._from_config(config) + self.vision_model = vision_model.vision_model + + self.visual_projection = nn.Linear(config.hidden_size, config.projection_dim, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> nn.Module: + return self.vision_model.embeddings.patch_embedding + + @can_return_tuple + @auto_docstring + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + ) -> CLIPVisionModelOutput: + r""" + Examples: + + ```python + >>> import torch + >>> from transformers import AutoProcessor, CLIPVisionModelWithProjection + >>> from transformers.image_utils import load_image + + >>> model = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-base-patch32") + >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = load_image(url) + + >>> inputs = processor(images=image, return_tensors="pt") + + >>> with torch.inference_mode(): + ... outputs = model(**inputs) + >>> image_embeds = outputs.image_embeds + ```""" + + vision_outputs: BaseModelOutputWithPooling = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, + ) + pooled_output = vision_outputs.pooler_output + image_embeds = self.visual_projection(pooled_output) + + return CLIPVisionModelOutput( + image_embeds=image_embeds, + last_hidden_state=vision_outputs.last_hidden_state, + hidden_states=vision_outputs.hidden_states, + attentions=vision_outputs.attentions, + ) + + +@auto_docstring( + custom_intro=""" + CLIP vision encoder with an image classification head on top (a linear layer on top of the pooled final hidden states of + the patch tokens) e.g. for ImageNet. 
+ """ +) +class CLIPForImageClassification(CLIPPreTrainedModel): + main_input_name = "pixel_values" + + def __init__(self, config: CLIPConfig) -> None: + super().__init__(config) + + self.num_labels = config.num_labels + vision_model = CLIPVisionModel._from_config(config.vision_config) + self.vision_model = vision_model.vision_model + + # Classifier head + self.classifier = ( + nn.Linear(config.vision_config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity() + ) + + # Initialize weights and apply final processing + self.post_init() + + @can_return_tuple + @auto_docstring + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + ) -> ImageClassifierOutput: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the image classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + outputs: BaseModelOutputWithPooling = self.vision_model( + pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + + sequence_output = outputs.last_hidden_state + + # average pool the patch tokens + sequence_output = torch.mean(sequence_output[:, 1:, :], dim=1) + # apply classifier + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss = self.loss_function(labels, logits, self.config) + + return ImageClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +__all__ = [ + "CLIPModel", + "CLIPPreTrainedModel", + "CLIPTextModel", + "CLIPTextModelWithProjection", + "CLIPVisionModel", + "CLIPVisionModelWithProjection", + "CLIPForImageClassification", +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/clip/modeling_flax_clip.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/clip/modeling_flax_clip.py new file mode 100644 index 0000000000000000000000000000000000000000..0394974d06477064c2a151e499a847002106fe45 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/clip/modeling_flax_clip.py @@ -0,0 +1,1306 @@ +# coding=utf-8 +# Copyright 2021 The OpenAI Team Authors, The Google Flax Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Any, Optional, Union + +import flax +import flax.linen as nn +import jax +import jax.numpy as jnp +from flax.core.frozen_dict import FrozenDict, freeze, unfreeze +from flax.linen import combine_masks, make_causal_mask +from flax.linen.attention import dot_product_attention_weights +from flax.traverse_util import flatten_dict, unflatten_dict +from jax import lax + +from ...modeling_flax_outputs import FlaxBaseModelOutput, FlaxBaseModelOutputWithPooling +from ...modeling_flax_utils import ( + ACT2FN, + FlaxPreTrainedModel, + append_replace_return_docstrings, + overwrite_call_docstring, +) +from ...utils import ModelOutput, add_start_docstrings, logging +from .configuration_clip import CLIPConfig, CLIPTextConfig, CLIPVisionConfig + + +logger = logging.get_logger(__name__) + +CLIP_START_DOCSTRING = r""" + + This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading, saving and converting weights from PyTorch models) + + This model is also a + [flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html) subclass. Use it as + a regular Flax linen Module and refer to the Flax documentation for all matter related to general usage and + behavior. + + Finally, this model supports inherent JAX features such as: + + - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit) + - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation) + - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap) + - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap) + + Parameters: + config ([`CLIPConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights. + dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`): + The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and + `jax.numpy.bfloat16` (on TPUs). + + This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If + specified all the computation will be performed with the given `dtype`. + + **Note that this only specifies the dtype of the computation and does not influence the dtype of model + parameters.** + + If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and + [`~FlaxPreTrainedModel.to_bf16`]. +""" + +CLIP_TEXT_INPUTS_DOCSTRING = r""" + Args: + input_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. 
+ + [What are attention masks?](../glossary#attention-mask) + position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + +CLIP_VISION_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`numpy.ndarray` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using + [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + +CLIP_INPUTS_DOCSTRING = r""" + Args: + input_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + pixel_values (`numpy.ndarray` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using + [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@flax.struct.dataclass +class FlaxCLIPTextModelOutput(ModelOutput): + """ + Base class for text model's outputs that also contains a pooling of the last hidden states. 
+ + Args: + text_embeds (`jnp.ndarray` of shape `(batch_size, output_dim`): + The text embeddings obtained by applying the projection layer to the pooled output of + [`FlaxCLIPTextModel`]. + last_hidden_state (`jnp.ndarray` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + text_embeds: jnp.ndarray = None + last_hidden_state: jnp.ndarray = None + hidden_states: Optional[tuple[jnp.ndarray, ...]] = None + attentions: Optional[tuple[jnp.ndarray, ...]] = None + + +@flax.struct.dataclass +class FlaxCLIPOutput(ModelOutput): + """ + Args: + logits_per_image:(`jnp.ndarray` of shape `(image_batch_size, text_batch_size)`): + The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text + similarity scores. + logits_per_text:(`jnp.ndarray` of shape `(text_batch_size, image_batch_size)`): + The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image + similarity scores. + text_embeds(`jnp.ndarray` of shape `(batch_size, output_dim`): + The text embeddings obtained by applying the projection layer to the pooled output of + [`FlaxCLIPTextModel`]. + image_embeds(`jnp.ndarray` of shape `(batch_size, output_dim`): + The image embeddings obtained by applying the projection layer to the pooled output of + [`FlaxCLIPVisionModel`]. + text_model_output(`FlaxBaseModelOutputWithPooling`): + The output of the [`FlaxCLIPTextModel`]. + vision_model_output(`FlaxBaseModelOutputWithPooling`): + The output of the [`FlaxCLIPVisionModel`]. 
+ """ + + logits_per_image: jnp.ndarray = None + logits_per_text: jnp.ndarray = None + text_embeds: jnp.ndarray = None + image_embeds: jnp.ndarray = None + text_model_output: FlaxBaseModelOutputWithPooling = None + vision_model_output: FlaxBaseModelOutputWithPooling = None + + def to_tuple(self) -> tuple[Any]: + return tuple( + self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple() + for k in self.keys() + ) + + +class FlaxCLIPVisionEmbeddings(nn.Module): + config: CLIPVisionConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + embed_dim = self.config.hidden_size + image_size = self.config.image_size + patch_size = self.config.patch_size + + self.class_embedding = self.param("class_embedding", jax.nn.initializers.normal(stddev=0.02), (embed_dim,)) + + self.patch_embedding = nn.Conv( + embed_dim, + kernel_size=(patch_size, patch_size), + strides=(patch_size, patch_size), + padding="VALID", + use_bias=False, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(), + ) + + self.num_patches = (image_size // patch_size) ** 2 + num_positions = self.num_patches + 1 + self.position_embedding = nn.Embed(num_positions, embed_dim, embedding_init=jax.nn.initializers.normal()) + self.position_ids = jnp.expand_dims(jnp.arange(0, num_positions, dtype="i4"), axis=0) + + def __call__(self, pixel_values): + patch_embeds = self.patch_embedding(pixel_values) + batch_size, height, width, channels = patch_embeds.shape + patch_embeds = jnp.reshape(patch_embeds, (batch_size, height * width, channels)) + + class_embeds = jnp.expand_dims(self.class_embedding, axis=(0, 1)) + class_embeds = jnp.tile(class_embeds, (batch_size, 1, 1)) + embeddings = jnp.concatenate([class_embeds, patch_embeds], axis=1) + embeddings = embeddings + self.position_embedding(self.position_ids) + return embeddings + + +class FlaxCLIPTextEmbeddings(nn.Module): + config: CLIPTextConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + embed_dim = self.config.hidden_size + + self.token_embedding = nn.Embed(self.config.vocab_size, embed_dim, embedding_init=jax.nn.initializers.normal()) + self.position_embedding = nn.Embed( + self.config.max_position_embeddings, embed_dim, embedding_init=jax.nn.initializers.normal() + ) + self.position_ids = jnp.expand_dims( + jnp.arange(0, self.config.max_position_embeddings, dtype="i4"), axis=(0, 1) + ) + + def __call__(self, input_ids, position_ids): + input_embeds = self.token_embedding(input_ids.astype("i4")) + position_embeds = self.position_embedding(position_ids.astype("i4")) + + embeddings = input_embeds + position_embeds + return embeddings + + +class FlaxCLIPAttention(nn.Module): + config: Union[CLIPTextConfig, CLIPVisionConfig] + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.embed_dim = self.config.hidden_size + self.num_heads = self.config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." 
+ ) + self.scale = self.head_dim**-0.5 + self.dropout = self.config.attention_dropout + + self.k_proj = nn.Dense(self.embed_dim, dtype=self.dtype, kernel_init=jax.nn.initializers.normal(0.01)) + self.v_proj = nn.Dense(self.embed_dim, dtype=self.dtype, kernel_init=jax.nn.initializers.normal(0.01)) + self.q_proj = nn.Dense(self.embed_dim, dtype=self.dtype, kernel_init=jax.nn.initializers.normal(0.01)) + self.out_proj = nn.Dense(self.embed_dim, dtype=self.dtype, kernel_init=jax.nn.initializers.normal(0.01)) + + self.causal = isinstance(self.config, CLIPTextConfig) + if self.causal: + self.causal_mask = make_causal_mask(jnp.ones((1, self.config.max_position_embeddings), dtype="i4")) + + def _split_heads(self, hidden_states): + return hidden_states.reshape(hidden_states.shape[:2] + (self.num_heads, self.head_dim)) + + def _merge_heads(self, hidden_states): + return hidden_states.reshape(hidden_states.shape[:2] + (self.embed_dim,)) + + def __call__( + self, + hidden_states, + attention_mask=None, + deterministic: bool = True, + output_attentions: bool = False, + ): + query = self.q_proj(hidden_states) + key = self.k_proj(hidden_states) + value = self.v_proj(hidden_states) + + query = self._split_heads(query) + key = self._split_heads(key) + value = self._split_heads(value) + + causal_attention_mask = None + if self.causal: + query_length, key_length = query.shape[1], key.shape[1] + causal_attention_mask = self.causal_mask[:, :, key_length - query_length : key_length, :key_length] + + if attention_mask is not None and causal_attention_mask is not None: + attention_mask = jnp.expand_dims(attention_mask, axis=(-3, -2)) + attention_mask = combine_masks(attention_mask, causal_attention_mask, dtype="i4") + elif causal_attention_mask is not None: + attention_mask = causal_attention_mask + elif attention_mask is not None: + attention_mask = jnp.expand_dims(attention_mask, axis=(-3, -2)) + + if attention_mask is not None: + attention_bias = lax.select( + attention_mask > 0, + jnp.full(attention_mask.shape, 0.0).astype(self.dtype), + jnp.full(attention_mask.shape, jnp.finfo(self.dtype).min).astype(self.dtype), + ) + else: + attention_bias = None + + dropout_rng = None + if not deterministic and self.dropout > 0.0: + dropout_rng = self.make_rng("dropout") + + attn_weights = dot_product_attention_weights( + query, + key, + bias=attention_bias, + dropout_rng=dropout_rng, + dropout_rate=self.dropout, + deterministic=deterministic, + dtype=self.dtype, + precision=None, + ) + + attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value) + attn_output = self._merge_heads(attn_output) + attn_output = self.out_proj(attn_output) + + outputs = (attn_output, attn_weights) if output_attentions else (attn_output,) + return outputs + + +class FlaxCLIPMLP(nn.Module): + config: Union[CLIPTextConfig, CLIPVisionConfig] + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.activation_fn = ACT2FN[self.config.hidden_act] + self.fc1 = nn.Dense( + self.config.intermediate_size, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(0.01), + ) + self.fc2 = nn.Dense(self.config.hidden_size, dtype=self.dtype, kernel_init=jax.nn.initializers.normal(0.01)) + + def __call__(self, hidden_states): + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +class FlaxCLIPEncoderLayer(nn.Module): + config: Union[CLIPTextConfig, CLIPVisionConfig] + dtype: jnp.dtype = jnp.float32 + + def setup(self): + 
self.self_attn = FlaxCLIPAttention(self.config, dtype=self.dtype) + self.layer_norm1 = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype) + self.mlp = FlaxCLIPMLP(self.config, dtype=self.dtype) + self.layer_norm2 = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype) + + def __call__( + self, + hidden_states, + attention_mask, + deterministic: bool = True, + output_attentions: bool = False, + ): + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + attn_outputs = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + deterministic=deterministic, + output_attentions=output_attentions, + ) + hidden_states = attn_outputs[0] + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += attn_outputs[1:] + + return outputs + + +class FlaxCLIPLayerCollection(nn.Module): + config: Union[CLIPTextConfig, CLIPVisionConfig] + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.layers = [ + FlaxCLIPEncoderLayer(self.config, name=str(i), dtype=self.dtype) + for i in range(self.config.num_hidden_layers) + ] + + def __call__( + self, + hidden_states, + attention_mask=None, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + all_attentions = () if output_attentions else None + all_hidden_states = () if output_hidden_states else None + + for layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + layer_outputs = layer( + hidden_states, attention_mask, deterministic=deterministic, output_attentions=output_attentions + ) + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions += (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states += (hidden_states,) + + outputs = (hidden_states,) + + if not return_dict: + return tuple(v for v in outputs if v is not None) + + return FlaxBaseModelOutput( + last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions + ) + + +class FlaxCLIPEncoder(nn.Module): + config: Union[CLIPTextConfig, CLIPVisionConfig] + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.layers = FlaxCLIPLayerCollection(self.config, dtype=self.dtype) + + def __call__( + self, + inputs_embeds, + attention_mask=None, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + return self.layers( + hidden_states=inputs_embeds, + attention_mask=attention_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + +class FlaxCLIPTextTransformer(nn.Module): + config: CLIPTextConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.embeddings = FlaxCLIPTextEmbeddings(self.config, dtype=self.dtype) + self.encoder = FlaxCLIPEncoder(self.config, dtype=self.dtype) + self.final_layer_norm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype) + + # For `pooled_output` computation + self.eos_token_id = self.config.eos_token_id + + def __call__( + self, + input_ids, + attention_mask, + position_ids, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, 
+ ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + attention_mask=attention_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + last_hidden_state = self.final_layer_norm(last_hidden_state) + + if self.eos_token_id == 2: + # The `eos_token_id` was incorrect before PR #24773: Let's keep what have been done here. + # A CLIP model with such `eos_token_id` in the config can't work correctly with extra new tokens added + # ------------------------------------------------------------ + # text_embeds.shape = [batch_size, sequence_length, transformer.width] + # take features from the EOS embedding (eos_token_id is the highest number in each sequence) + pooled_output = last_hidden_state[jnp.arange(last_hidden_state.shape[0]), input_ids.argmax(axis=-1)] + else: + # (no need to cast from bool to int after comparing to `eos_token_id`) + pooled_output = last_hidden_state[ + jnp.arange(last_hidden_state.shape[0]), (input_ids == self.eos_token_id).argmax(axis=-1) + ] + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return FlaxBaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +class FlaxCLIPVisionTransformer(nn.Module): + config: CLIPVisionConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.embeddings = FlaxCLIPVisionEmbeddings(self.config, dtype=self.dtype) + self.pre_layrnorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype) + self.encoder = FlaxCLIPEncoder(self.config, dtype=self.dtype) + self.post_layernorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype) + + def __call__( + self, + pixel_values=None, + deterministic: bool = True, + output_attentions=None, + output_hidden_states=None, + return_dict: bool = True, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + hidden_states = self.embeddings(pixel_values) + hidden_states = self.pre_layrnorm(hidden_states) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + pooled_output = last_hidden_state[:, 0, :] + pooled_output = self.post_layernorm(pooled_output) + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return FlaxBaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +class 
FlaxCLIPTextPreTrainedModel(FlaxPreTrainedModel): + config_class = CLIPTextConfig + module_class: nn.Module = None + + def __init__( + self, + config: CLIPTextConfig, + input_shape=(1, 1), + seed: int = 0, + dtype: jnp.dtype = jnp.float32, + _do_init: bool = True, + **kwargs, + ): + module = self.module_class(config=config, dtype=dtype, **kwargs) + super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) + + def init_weights(self, rng: jax.random.PRNGKey, input_shape: tuple, params: FrozenDict = None) -> FrozenDict: + # init input tensor + input_ids = jnp.zeros(input_shape, dtype="i4") + position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_shape) + attention_mask = jnp.ones_like(input_ids) + + params_rng, dropout_rng = jax.random.split(rng) + rngs = {"params": params_rng, "dropout": dropout_rng} + + random_params = self.module.init(rngs, input_ids, attention_mask, position_ids)["params"] + + if params is not None: + random_params = flatten_dict(unfreeze(random_params)) + params = flatten_dict(unfreeze(params)) + for missing_key in self._missing_keys: + params[missing_key] = random_params[missing_key] + self._missing_keys = set() + return freeze(unflatten_dict(params)) + else: + return random_params + + def __call__( + self, + input_ids, + attention_mask=None, + position_ids=None, + params: Optional[dict] = None, + dropout_rng: jax.random.PRNGKey = None, + train: bool = False, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + if position_ids is None: + position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape) + + if attention_mask is None: + attention_mask = jnp.ones_like(input_ids) + + # Handle any PRNG if needed + rngs = {} + if dropout_rng is not None: + rngs["dropout"] = dropout_rng + + return self.module.apply( + {"params": params or self.params}, + jnp.array(input_ids, dtype="i4"), + jnp.array(attention_mask, dtype="i4"), + jnp.array(position_ids, dtype="i4"), + not train, + output_attentions, + output_hidden_states, + return_dict, + rngs=rngs, + ) + + +class FlaxCLIPVisionPreTrainedModel(FlaxPreTrainedModel): + config_class = CLIPVisionConfig + main_input_name = "pixel_values" + module_class: nn.Module = None + + def __init__( + self, + config: CLIPVisionConfig, + input_shape: Optional[tuple] = None, + seed: int = 0, + dtype: jnp.dtype = jnp.float32, + _do_init: bool = True, + **kwargs, + ): + if input_shape is None: + input_shape = (1, config.image_size, config.image_size, 3) + module = self.module_class(config=config, dtype=dtype, **kwargs) + super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) + + def init_weights(self, rng: jax.random.PRNGKey, input_shape: tuple, params: FrozenDict = None) -> FrozenDict: + # init input tensor + pixel_values = jax.random.normal(rng, input_shape) + + params_rng, dropout_rng = jax.random.split(rng) + rngs = {"params": params_rng, "dropout": dropout_rng} + + random_params = self.module.init(rngs, pixel_values)["params"] + + if params is not None: + random_params = 
flatten_dict(unfreeze(random_params)) + params = flatten_dict(unfreeze(params)) + for missing_key in self._missing_keys: + params[missing_key] = random_params[missing_key] + self._missing_keys = set() + return freeze(unflatten_dict(params)) + else: + return random_params + + def __call__( + self, + pixel_values, + params: Optional[dict] = None, + dropout_rng: jax.random.PRNGKey = None, + train: bool = False, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + pixel_values = jnp.transpose(pixel_values, (0, 2, 3, 1)) + + # Handle any PRNG if needed + rngs = {} + if dropout_rng is not None: + rngs["dropout"] = dropout_rng + + return self.module.apply( + {"params": params or self.params}, + jnp.array(pixel_values, dtype=jnp.float32), + not train, + output_attentions, + output_hidden_states, + return_dict, + rngs=rngs, + ) + + +class FlaxCLIPPreTrainedModel(FlaxPreTrainedModel): + config_class = CLIPConfig + module_class: nn.Module = None + + def __init__( + self, + config: CLIPConfig, + input_shape: Optional[tuple] = None, + seed: int = 0, + dtype: jnp.dtype = jnp.float32, + _do_init: bool = True, + **kwargs, + ): + if input_shape is None: + input_shape = ((1, 1), (1, config.vision_config.image_size, config.vision_config.image_size, 3)) + module = self.module_class(config=config, dtype=dtype, **kwargs) + super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) + + def init_weights(self, rng: jax.random.PRNGKey, input_shape: tuple, params: FrozenDict = None) -> FrozenDict: + # init input tensor + input_ids = jnp.zeros(input_shape[0], dtype="i4") + position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_shape[0]) + attention_mask = jnp.ones_like(input_ids) + + pixel_values = jax.random.normal(rng, input_shape[1]) + + params_rng, dropout_rng = jax.random.split(rng) + rngs = {"params": params_rng, "dropout": dropout_rng} + + random_params = self.module.init(rngs, input_ids, pixel_values, attention_mask, position_ids)["params"] + + if params is not None: + random_params = flatten_dict(unfreeze(random_params)) + params = flatten_dict(unfreeze(params)) + for missing_key in self._missing_keys: + params[missing_key] = random_params[missing_key] + self._missing_keys = set() + return freeze(unflatten_dict(params)) + else: + return random_params + + def __call__( + self, + input_ids, + pixel_values, + attention_mask=None, + position_ids=None, + params: Optional[dict] = None, + dropout_rng: jax.random.PRNGKey = None, + train: bool = False, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + if position_ids is None: + position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape) + + if attention_mask 
is None: + attention_mask = jnp.ones_like(input_ids) + + pixel_values = jnp.transpose(pixel_values, (0, 2, 3, 1)) + + # Handle any PRNG if needed + rngs = {} + if dropout_rng is not None: + rngs["dropout"] = dropout_rng + + return self.module.apply( + {"params": params or self.params}, + jnp.array(input_ids, dtype="i4"), + jnp.array(pixel_values, dtype=jnp.float32), + jnp.array(attention_mask, dtype="i4"), + jnp.array(position_ids, dtype="i4"), + not train, + output_attentions, + output_hidden_states, + return_dict, + rngs=rngs, + ) + + def get_text_features( + self, + input_ids, + attention_mask=None, + position_ids=None, + params: Optional[dict] = None, + dropout_rng: jax.random.PRNGKey = None, + train=False, + ): + r""" + Args: + input_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + + Returns: + text_features (`jnp.ndarray` of shape `(batch_size, output_dim`): The text embeddings obtained by applying + the projection layer to the pooled output of [`FlaxCLIPTextModel`]. + + Examples: + + ```python + >>> from transformers import AutoTokenizer, FlaxCLIPModel + + >>> model = FlaxCLIPModel.from_pretrained("openai/clip-vit-base-patch32") + >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32") + + >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="np") + >>> text_features = model.get_text_features(**inputs) + ```""" + if position_ids is None: + position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape) + + if attention_mask is None: + attention_mask = jnp.ones_like(input_ids) + + # Handle any PRNG if needed + rngs = {} + if dropout_rng is not None: + rngs["dropout"] = dropout_rng + + def _get_features(module, input_ids, attention_mask, position_ids, deterministic): + text_outputs = module.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + deterministic=deterministic, + ) + pooled_output = text_outputs[1] + text_features = module.text_projection(pooled_output) + return text_features + + return self.module.apply( + {"params": params or self.params}, + jnp.array(input_ids, dtype="i4"), + jnp.array(attention_mask, dtype="i4"), + jnp.array(position_ids, dtype="i4"), + not train, + method=_get_features, + rngs=rngs, + ) + + def get_image_features( + self, pixel_values, params: Optional[dict] = None, dropout_rng: jax.random.PRNGKey = None, train=False + ): + r""" + Args: + pixel_values (`numpy.ndarray` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained + using [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details. 
+ + Returns: + image_features (`jnp.ndarray` of shape `(batch_size, output_dim`): The image embeddings obtained by + applying the projection layer to the pooled output of [`FlaxCLIPVisionModel`] + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, FlaxCLIPModel + + >>> model = FlaxCLIPModel.from_pretrained("openai/clip-vit-base-patch32") + >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="np") + + >>> image_features = model.get_image_features(**inputs) + ```""" + pixel_values = jnp.transpose(pixel_values, (0, 2, 3, 1)) + + # Handle any PRNG if needed + rngs = {} + if dropout_rng is not None: + rngs["dropout"] = dropout_rng + + def _get_features(module, pixel_values, deterministic): + vision_outputs = module.vision_model(pixel_values=pixel_values, deterministic=deterministic) + pooled_output = vision_outputs[1] # pooled_output + image_features = module.visual_projection(pooled_output) + return image_features + + return self.module.apply( + {"params": params or self.params}, + jnp.array(pixel_values, dtype=jnp.float32), + not train, + method=_get_features, + rngs=rngs, + ) + + +class FlaxCLIPTextModule(nn.Module): + config: CLIPTextConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.text_model = FlaxCLIPTextTransformer(self.config, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask, + position_ids, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + return self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + +class FlaxCLIPTextModel(FlaxCLIPTextPreTrainedModel): + module_class = FlaxCLIPTextModule + + +FLAX_CLIP_TEXT_MODEL_DOCSTRING = """ + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, FlaxCLIPTextModel + + >>> model = FlaxCLIPTextModel.from_pretrained("openai/clip-vit-base-patch32") + >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32") + + >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="np") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + >>> pooler_output = outputs.pooler_output # pooled (EOS token) states + ``` +""" + +overwrite_call_docstring(FlaxCLIPTextModel, CLIP_TEXT_INPUTS_DOCSTRING + FLAX_CLIP_TEXT_MODEL_DOCSTRING) +append_replace_return_docstrings( + FlaxCLIPTextModel, output_type=FlaxBaseModelOutputWithPooling, config_class=CLIPTextConfig +) + + +class FlaxCLIPTextModelWithProjectionModule(nn.Module): + config: CLIPTextConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.text_model = FlaxCLIPTextTransformer(self.config, dtype=self.dtype) + self.text_projection = nn.Dense(self.config.projection_dim, use_bias=False, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask, + position_ids, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + text_outputs = self.text_model( + input_ids=input_ids, + 
attention_mask=attention_mask, + position_ids=position_ids, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = text_outputs[1] + text_embeds = self.text_projection(pooled_output) + + if not return_dict: + return (text_embeds, text_outputs[0]) + text_outputs[2:] + + return FlaxCLIPTextModelOutput( + text_embeds=text_embeds, + last_hidden_state=text_outputs.last_hidden_state, + hidden_states=text_outputs.hidden_states, + attentions=text_outputs.attentions, + ) + + +class FlaxCLIPTextModelWithProjection(FlaxCLIPTextPreTrainedModel): + module_class = FlaxCLIPTextModelWithProjectionModule + + +FLAX_CLIP_TEXT_MODEL_WITH_PROJECTION_DOCSTRING = """ + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, FlaxCLIPTextModelWithProjection + + >>> model = FlaxCLIPTextModelWithProjection.from_pretrained("openai/clip-vit-base-patch32") + >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32") + + >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="np") + + >>> outputs = model(**inputs) + >>> text_embeds = outputs.text_embeds + ``` +""" + +overwrite_call_docstring( + FlaxCLIPTextModelWithProjection, CLIP_TEXT_INPUTS_DOCSTRING + FLAX_CLIP_TEXT_MODEL_WITH_PROJECTION_DOCSTRING +) +append_replace_return_docstrings( + FlaxCLIPTextModelWithProjection, output_type=FlaxCLIPTextModelOutput, config_class=CLIPTextConfig +) + + +class FlaxCLIPVisionModule(nn.Module): + config: CLIPVisionConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.vision_model = FlaxCLIPVisionTransformer(self.config, dtype=self.dtype) + + def __call__( + self, + pixel_values, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + return self.vision_model( + pixel_values=pixel_values, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + +class FlaxCLIPVisionModel(FlaxCLIPVisionPreTrainedModel): + module_class = FlaxCLIPVisionModule + + +FLAX_CLIP_VISION_MODEL_DOCSTRING = """ + Returns: + + Example: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, FlaxCLIPVisionModel + + >>> model = FlaxCLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32") + >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="np") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + >>> pooler_output = outputs.pooler_output # pooled CLS states + ``` +""" + +overwrite_call_docstring(FlaxCLIPVisionModel, CLIP_VISION_INPUTS_DOCSTRING + FLAX_CLIP_VISION_MODEL_DOCSTRING) +append_replace_return_docstrings( + FlaxCLIPVisionModel, output_type=FlaxBaseModelOutputWithPooling, config_class=CLIPVisionConfig +) + + +class FlaxCLIPModule(nn.Module): + config: CLIPConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + text_config = self.config.text_config + vision_config = self.config.vision_config + + self.projection_dim = self.config.projection_dim + self.text_embed_dim = text_config.hidden_size + self.vision_embed_dim = vision_config.hidden_size + + self.text_model = 
FlaxCLIPTextTransformer(text_config, dtype=self.dtype) + self.vision_model = FlaxCLIPVisionTransformer(vision_config, dtype=self.dtype) + + self.visual_projection = nn.Dense( + self.projection_dim, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(0.02), + use_bias=False, + ) + self.text_projection = nn.Dense( + self.projection_dim, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(0.02), + use_bias=False, + ) + + self.logit_scale = self.param( + "logit_scale", lambda _, shape: jnp.ones(shape) * self.config.logit_scale_init_value, [] + ) + + def __call__( + self, + input_ids=None, + pixel_values=None, + attention_mask=None, + position_ids=None, + deterministic: bool = True, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + return_dict = return_dict if return_dict is not None else self.config.return_dict + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + image_embeds = vision_outputs[1] + image_embeds = self.visual_projection(image_embeds) + + text_embeds = text_outputs[1] + text_embeds = self.text_projection(text_embeds) + + # normalized features + image_embeds = image_embeds / jnp.linalg.norm(image_embeds, axis=-1, keepdims=True) + text_embeds = text_embeds / jnp.linalg.norm(text_embeds, axis=-1, keepdims=True) + + # cosine similarity as logits + logit_scale = jnp.exp(self.logit_scale) + logits_per_text = jnp.matmul(text_embeds, image_embeds.T) * logit_scale + logits_per_image = logits_per_text.T + + if not return_dict: + return (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs) + + return FlaxCLIPOutput( + logits_per_image=logits_per_image, + logits_per_text=logits_per_text, + text_embeds=text_embeds, + image_embeds=image_embeds, + text_model_output=text_outputs, + vision_model_output=vision_outputs, + ) + + +@add_start_docstrings(CLIP_START_DOCSTRING) +class FlaxCLIPModel(FlaxCLIPPreTrainedModel): + module_class = FlaxCLIPModule + + +FLAX_CLIP_MODEL_DOCSTRING = """ + Returns: + + Example: + + ```python + >>> import jax + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, FlaxCLIPModel + + >>> model = FlaxCLIPModel.from_pretrained("openai/clip-vit-base-patch32") + >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor( + ... text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="np", padding=True + ... 
) + + >>> outputs = model(**inputs) + >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score + >>> probs = jax.nn.softmax(logits_per_image, axis=1) # we can take the softmax to get the label probabilities + ``` +""" + +overwrite_call_docstring(FlaxCLIPModel, CLIP_INPUTS_DOCSTRING + FLAX_CLIP_MODEL_DOCSTRING) +append_replace_return_docstrings(FlaxCLIPModel, output_type=FlaxCLIPOutput, config_class=CLIPConfig) + + +__all__ = [ + "FlaxCLIPModel", + "FlaxCLIPPreTrainedModel", + "FlaxCLIPTextModel", + "FlaxCLIPTextPreTrainedModel", + "FlaxCLIPTextModelWithProjection", + "FlaxCLIPVisionModel", + "FlaxCLIPVisionPreTrainedModel", +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/clip/modeling_tf_clip.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/clip/modeling_tf_clip.py new file mode 100644 index 0000000000000000000000000000000000000000..ab2e38827998ad4c67059b42387dbe9c192c6a18 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/clip/modeling_tf_clip.py @@ -0,0 +1,1460 @@ +# coding=utf-8 +# Copyright 2021 The OpenAI Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""TF 2.0 CLIP model.""" + +from __future__ import annotations + +import math +from dataclasses import dataclass +from typing import Any + +import numpy as np +import tensorflow as tf + +from ...activations_tf import get_tf_activation +from ...modeling_tf_outputs import TFBaseModelOutput, TFBaseModelOutputWithPooling + +# Public API +from ...modeling_tf_utils import ( + TFModelInputType, + TFPreTrainedModel, + get_initializer, + keras, + keras_serializable, + unpack_inputs, +) +from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax +from ...utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from .configuration_clip import CLIPConfig, CLIPTextConfig, CLIPVisionConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "openai/clip-vit-base-patch32" + + +LARGE_NEGATIVE = -1e8 + + +# Copied from transformers.models.bart.modeling_tf_bart._expand_mask +def _expand_mask(mask: tf.Tensor, tgt_len: int | None = None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. 
+ """ + src_len = shape_list(mask)[1] + tgt_len = tgt_len if tgt_len is not None else src_len + one_cst = tf.constant(1.0) + mask = tf.cast(mask, dtype=one_cst.dtype) + expanded_mask = tf.tile(mask[:, None, None, :], (1, 1, tgt_len, 1)) + + return (one_cst - expanded_mask) * LARGE_NEGATIVE + + +# contrastive loss function, adapted from +# https://sachinruk.github.io/blog/pytorch/pytorch%20lightning/loss%20function/gpu/2021/03/07/CLIP.html +def contrastive_loss(logits: tf.Tensor) -> tf.Tensor: + return tf.math.reduce_mean( + keras.metrics.sparse_categorical_crossentropy( + y_true=tf.range(shape_list(logits)[0]), y_pred=logits, from_logits=True + ) + ) + + +def clip_loss(similarity: tf.Tensor) -> tf.Tensor: + caption_loss = contrastive_loss(similarity) + image_loss = contrastive_loss(tf.transpose(similarity)) + return (caption_loss + image_loss) / 2.0 + + +@dataclass +class TFCLIPOutput(ModelOutput): + """ + Args: + loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`): + Contrastive loss for image-text similarity. + logits_per_image:(`tf.Tensor` of shape `(image_batch_size, text_batch_size)`): + The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text + similarity scores. + logits_per_text:(`tf.Tensor` of shape `(text_batch_size, image_batch_size)`): + The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image + similarity scores. + text_embeds(`tf.Tensor` of shape `(batch_size, output_dim`): + The text embeddings obtained by applying the projection layer to the pooled output of [`TFCLIPTextModel`]. + image_embeds(`tf.Tensor` of shape `(batch_size, output_dim`): + The image embeddings obtained by applying the projection layer to the pooled output of + [`TFCLIPVisionModel`]. + text_model_output([`~modeling_tf_utils.TFBaseModelOutputWithPooling`]): + The output of the [`TFCLIPTextModel`]. + vision_model_output([`~modeling_tf_utils.TFBaseModelOutputWithPooling`]): + The output of the [`TFCLIPVisionModel`]. 
+ """ + + loss: tf.Tensor | None = None + logits_per_image: tf.Tensor | None = None + logits_per_text: tf.Tensor | None = None + text_embeds: tf.Tensor | None = None + image_embeds: tf.Tensor | None = None + text_model_output: TFBaseModelOutputWithPooling = None + vision_model_output: TFBaseModelOutputWithPooling = None + + def to_tuple(self) -> tuple[Any]: + return tuple( + self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple() + for k in self.keys() + ) + + +class TFCLIPVisionEmbeddings(keras.layers.Layer): + def __init__(self, config: CLIPVisionConfig, **kwargs): + super().__init__(**kwargs) + + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.num_patches = (self.image_size // self.patch_size) ** 2 + self.num_positions = self.num_patches + 1 + + self.config = config + + self.patch_embedding = keras.layers.Conv2D( + filters=self.embed_dim, + kernel_size=self.patch_size, + strides=self.patch_size, + padding="valid", + data_format="channels_last", + use_bias=False, + kernel_initializer=get_initializer(self.config.initializer_range * self.config.initializer_factor), + name="patch_embedding", + ) + + def build(self, input_shape: tf.TensorShape = None): + factor = self.config.initializer_factor + + self.class_embedding = self.add_weight( + shape=(self.embed_dim,), + initializer=get_initializer(self.embed_dim**-0.5 * factor), + trainable=True, + name="class_embedding", + ) + + with tf.name_scope("position_embedding"): + self.position_embedding = self.add_weight( + shape=(self.num_positions, self.embed_dim), + initializer=get_initializer(self.config.initializer_range * factor), + trainable=True, + name="embeddings", + ) + + if self.built: + return + self.built = True + if getattr(self, "patch_embedding", None) is not None: + with tf.name_scope(self.patch_embedding.name): + self.patch_embedding.build([None, None, None, self.config.num_channels]) + + def call(self, pixel_values: tf.Tensor) -> tf.Tensor: + """`pixel_values` is expected to be of NCHW format.""" + + batch_size, num_channels, height, width = shape_list(pixel_values) + + # When running on CPU, `tf.nn.conv2d` doesn't support `NCHW` format. + # So change the input format from `NCHW` to `NHWC`. + # shape = (batch_size, in_height, in_width, in_channels=num_channels) + pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1)) + + patch_embeds = self.patch_embedding(pixel_values) + + # Change the 2D spatial dimensions to a single temporal dimension. 
+ # shape = (batch_size, num_patches, out_channels=embed_dim) + patch_embeds = tf.reshape(tensor=patch_embeds, shape=(batch_size, self.num_patches, -1)) + + # add the [CLS] token to the embedded patch tokens + class_embeds = tf.broadcast_to(self.class_embedding, shape=(batch_size, 1, self.embed_dim)) + embeddings = tf.concat((class_embeds, patch_embeds), axis=1) + + embeddings = embeddings + self.position_embedding + + return embeddings + + +class TFCLIPTextEmbeddings(keras.layers.Layer): + def __init__(self, config: CLIPTextConfig, **kwargs): + super().__init__(**kwargs) + + self.embed_dim = config.hidden_size + + self.config = config + + def build(self, input_shape: tf.TensorShape = None): + with tf.name_scope("token_embedding"): + self.weight = self.add_weight( + shape=(self.config.vocab_size, self.embed_dim), + initializer=get_initializer(self.config.initializer_factor * self.config.initializer_range), + trainable=True, + name="weight", + ) + + with tf.name_scope("position_embedding"): + self.position_embedding = self.add_weight( + shape=(self.config.max_position_embeddings, self.embed_dim), + initializer=get_initializer(self.config.initializer_factor * self.config.initializer_range), + trainable=True, + name="embeddings", + ) + + super().build(input_shape) + + def call( + self, + input_ids: tf.Tensor | None = None, + position_ids: tf.Tensor | None = None, + inputs_embeds: tf.Tensor | None = None, + ) -> tf.Tensor: + """ + Applies embedding based on inputs tensor. + + Returns: + final_embeddings (`tf.Tensor`): output embedding tensor. + """ + if input_ids is None and inputs_embeds is None: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if inputs_embeds is None: + check_embeddings_within_bounds(input_ids, self.config.vocab_size) + inputs_embeds = tf.gather(params=self.weight, indices=input_ids) + + input_shape = shape_list(inputs_embeds)[:-1] + + if position_ids is None: + position_ids = tf.expand_dims(tf.range(start=0, limit=input_shape[-1]), axis=0) + + position_embeds = tf.gather(params=self.position_embedding, indices=position_ids) + position_embeds = tf.tile(input=position_embeds, multiples=(input_shape[0], 1, 1)) + final_embeddings = inputs_embeds + position_embeds + + return final_embeddings + + +class TFCLIPAttention(keras.layers.Layer): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: CLIPConfig, **kwargs): + super().__init__(**kwargs) + + self.embed_dim = config.hidden_size + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = self.embed_dim // self.num_attention_heads + if self.attention_head_size * self.num_attention_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_attention_heads})." 
+ ) + + factor = config.initializer_factor + in_proj_std = (self.embed_dim**-0.5) * ((2 * config.num_hidden_layers) ** -0.5) * factor + out_proj_std = (self.embed_dim**-0.5) * factor + + self.sqrt_att_head_size = math.sqrt(self.attention_head_size) + + self.q_proj = keras.layers.Dense( + units=self.embed_dim, kernel_initializer=get_initializer(in_proj_std), name="q_proj" + ) + self.k_proj = keras.layers.Dense( + units=self.embed_dim, kernel_initializer=get_initializer(in_proj_std), name="k_proj" + ) + self.v_proj = keras.layers.Dense( + units=self.embed_dim, kernel_initializer=get_initializer(in_proj_std), name="v_proj" + ) + + self.dropout = keras.layers.Dropout(rate=config.attention_dropout) + + self.out_proj = keras.layers.Dense( + units=self.embed_dim, kernel_initializer=get_initializer(out_proj_std), name="out_proj" + ) + + # copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention.transpose_for_scores + def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor: + # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] + tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size)) + + # Transpose the tensor from [batch_size, seq_length, num_attention_heads, attention_head_size] to [batch_size, num_attention_heads, seq_length, attention_head_size] + return tf.transpose(tensor, perm=[0, 2, 1, 3]) + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + causal_attention_mask: tf.Tensor, + output_attentions: bool, + training: bool = False, + ) -> tuple[tf.Tensor]: + """Input shape: Batch x Time x Channel""" + + batch_size = shape_list(hidden_states)[0] + mixed_query_layer = self.q_proj(inputs=hidden_states) + mixed_key_layer = self.k_proj(inputs=hidden_states) + mixed_value_layer = self.v_proj(inputs=hidden_states) + query_layer = self.transpose_for_scores(mixed_query_layer, batch_size) + key_layer = self.transpose_for_scores(mixed_key_layer, batch_size) + value_layer = self.transpose_for_scores(mixed_value_layer, batch_size) + + # Take the dot product between "query" and "key" to get the raw attention scores. + # (batch size, num_heads, seq_len_q, seq_len_k) + attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) + dk = tf.cast(self.sqrt_att_head_size, dtype=attention_scores.dtype) + attention_scores = tf.divide(attention_scores, dk) + + # apply the causal_attention_mask first + if causal_attention_mask is not None: + # Apply the causal attention mask (precomputed for all layers in TFCLIPModel call() function) + attention_scores = tf.add(attention_scores, causal_attention_mask) + + if attention_mask is not None: + # Apply the attention mask (precomputed for all layers in TFCLIPModel call() function) + attention_scores = tf.add(attention_scores, attention_mask) + + # Normalize the attention scores to probabilities. + _attention_probs = stable_softmax(logits=attention_scores, axis=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs = self.dropout(inputs=_attention_probs, training=training) + + attention_output = tf.matmul(attention_probs, value_layer) + attention_output = tf.transpose(attention_output, perm=[0, 2, 1, 3]) + + # (batch_size, seq_len_q, embed_dim) + attention_output = tf.reshape(tensor=attention_output, shape=(batch_size, -1, self.embed_dim)) + + attention_output = self.out_proj(attention_output, training=training) + # In TFBert, attention weights are returned after dropout. + # However, in CLIP, they are returned before dropout. + outputs = (attention_output, _attention_probs) if output_attentions else (attention_output,) + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "q_proj", None) is not None: + with tf.name_scope(self.q_proj.name): + self.q_proj.build([None, None, self.embed_dim]) + if getattr(self, "k_proj", None) is not None: + with tf.name_scope(self.k_proj.name): + self.k_proj.build([None, None, self.embed_dim]) + if getattr(self, "v_proj", None) is not None: + with tf.name_scope(self.v_proj.name): + self.v_proj.build([None, None, self.embed_dim]) + if getattr(self, "out_proj", None) is not None: + with tf.name_scope(self.out_proj.name): + self.out_proj.build([None, None, self.embed_dim]) + + +class TFCLIPMLP(keras.layers.Layer): + def __init__(self, config: CLIPConfig, **kwargs): + super().__init__(**kwargs) + + self.activation_fn = get_tf_activation(config.hidden_act) + + factor = config.initializer_factor + in_proj_std = (config.hidden_size**-0.5) * ((2 * config.num_hidden_layers) ** -0.5) * factor + fc_std = (2 * config.hidden_size) ** -0.5 * factor + + self.fc1 = keras.layers.Dense( + units=config.intermediate_size, kernel_initializer=get_initializer(fc_std), name="fc1" + ) + self.fc2 = keras.layers.Dense( + units=config.hidden_size, kernel_initializer=get_initializer(in_proj_std), name="fc2" + ) + self.config = config + + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + hidden_states = self.fc1(inputs=hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(inputs=hidden_states) + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "fc1", None) is not None: + with tf.name_scope(self.fc1.name): + self.fc1.build([None, None, self.config.hidden_size]) + if getattr(self, "fc2", None) is not None: + with tf.name_scope(self.fc2.name): + self.fc2.build([None, None, self.config.intermediate_size]) + + +class TFCLIPEncoderLayer(keras.layers.Layer): + def __init__(self, config: CLIPConfig, **kwargs): + super().__init__(**kwargs) + + self.embed_dim = config.hidden_size + self.self_attn = TFCLIPAttention(config, name="self_attn") + self.layer_norm1 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm1") + self.mlp = TFCLIPMLP(config, name="mlp") + self.layer_norm2 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm2") + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + causal_attention_mask: tf.Tensor, + output_attentions: bool, + training: bool = False, + ) -> tuple[tf.Tensor]: + """ + Args: + hidden_states (`tf.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`tf.Tensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. 
+ causal_attention_mask (`tf.Tensor`): causal attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + output_attentions (`bool`): + Whether or not to return the attentions tensors of all attention layers. See `outputs` under returned + tensors for more detail. + """ + residual = hidden_states + + hidden_states = self.layer_norm1(inputs=hidden_states) + attention_outputs = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + causal_attention_mask=causal_attention_mask, + output_attentions=output_attentions, + training=training, + ) + hidden_states = attention_outputs[0] + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.layer_norm2(inputs=hidden_states) + hidden_states = self.mlp(hidden_states=hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + attention_outputs[1:] # add attentions if we output them + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attn", None) is not None: + with tf.name_scope(self.self_attn.name): + self.self_attn.build(None) + if getattr(self, "layer_norm1", None) is not None: + with tf.name_scope(self.layer_norm1.name): + self.layer_norm1.build([None, None, self.embed_dim]) + if getattr(self, "mlp", None) is not None: + with tf.name_scope(self.mlp.name): + self.mlp.build(None) + if getattr(self, "layer_norm2", None) is not None: + with tf.name_scope(self.layer_norm2.name): + self.layer_norm2.build([None, None, self.embed_dim]) + + +class TFCLIPEncoder(keras.layers.Layer): + """ + Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a + [`TFCLIPEncoderLayer`]. 
+ + Args: + config: CLIPConfig + """ + + def __init__(self, config: CLIPConfig, **kwargs): + super().__init__(**kwargs) + + self.layers = [TFCLIPEncoderLayer(config, name=f"layers_._{i}") for i in range(config.num_hidden_layers)] + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + causal_attention_mask: tf.Tensor, + output_attentions: bool, + output_hidden_states: bool, + return_dict: bool, + training: bool = False, + ) -> TFBaseModelOutput | tuple[tf.Tensor]: + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + for i, layer_module in enumerate(self.layers): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_outputs = layer_module( + hidden_states=hidden_states, + attention_mask=attention_mask, + causal_attention_mask=causal_attention_mask, + output_attentions=output_attentions, + training=training, + ) + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + # Add last layer + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None) + + return TFBaseModelOutput( + last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layers", None) is not None: + for layer in self.layers: + with tf.name_scope(layer.name): + layer.build(None) + + +class TFCLIPTextTransformer(keras.layers.Layer): + def __init__(self, config: CLIPTextConfig, **kwargs): + super().__init__(**kwargs) + + self.embeddings = TFCLIPTextEmbeddings(config, name="embeddings") + self.encoder = TFCLIPEncoder(config, name="encoder") + self.final_layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="final_layer_norm") + + # For `pooled_output` computation + self.eos_token_id = config.eos_token_id + self.embed_dim = config.hidden_size + + def call( + self, + input_ids: TFModelInputType, + attention_mask: tf.Tensor, + position_ids: tf.Tensor, + output_attentions: bool, + output_hidden_states: bool, + return_dict: bool, + training: bool = False, + ) -> TFBaseModelOutputWithPooling | tuple[tf.Tensor]: + input_shape = shape_list(input_ids) + + embedding_output = self.embeddings(input_ids=input_ids, position_ids=position_ids) + + batch_size, seq_length = input_shape + # CLIP's text model uses causal mask, prepare it here. + # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324 + causal_attention_mask = self._build_causal_attention_mask(batch_size, seq_length, dtype=embedding_output.dtype) + + # check attention mask and invert + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _expand_mask(attention_mask) + + encoder_outputs = self.encoder( + hidden_states=embedding_output, + attention_mask=attention_mask, + causal_attention_mask=causal_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + sequence_output = encoder_outputs[0] + sequence_output = self.final_layer_norm(inputs=sequence_output) + + if self.eos_token_id == 2: + # The `eos_token_id` was incorrect before PR #24773: Let's keep what have been done here. 
+ # A CLIP model with such `eos_token_id` in the config can't work correctly with extra new tokens added + # ------------------------------------------------------------ + # text_embeds.shape = [batch_size, n_ctx, transformer.width] + # take features from the eot embedding (eot_token is the highest number in each sequence) + pooled_output = tf.gather_nd( + params=sequence_output, + indices=tf.stack( + values=(tf.range(input_shape[0], dtype=tf.int64), tf.math.argmax(input_ids, axis=-1)), axis=1 + ), + ) + else: + # The config gets updated `eos_token_id` from PR #24773 (so the use of extra new tokens is possible) + pooled_output = tf.gather_nd( + params=sequence_output, + indices=tf.stack( + values=( + tf.range(input_shape[0], dtype=tf.int64), + tf.math.argmax(tf.cast(input_ids == self.eos_token_id, dtype=tf.int8), axis=-1), + ), + axis=1, + ), + ) + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return TFBaseModelOutputWithPooling( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + def _build_causal_attention_mask(self, batch_size, seq_length, dtype=tf.float32): + # It is possible with an unspecified sequence length for seq_length to be + # a runtime value, which is unsupported by tf.constant. Per the TensorFlow + # docs, tf.fill can handle runtime dynamic shapes: + # https://www.tensorflow.org/api_docs/python/tf/fill + diag = tf.cast(tf.fill((seq_length,), 0.0), dtype) + + # set an additive 2D attention mask with all places being masked + to_mask = tf.cast(tf.fill((seq_length, seq_length), -10000.0), dtype) + + # set diagonal & lower triangular parts to 0 (i.e. the places not to be masked) + # TIP: think the 2D matrix as the space of (query_seq, key_seq) + to_mask = tf.linalg.band_part(to_mask, 0, -1) + # to_mask = tf.linalg.band_part(to_mask, -1, 0) + to_mask = tf.linalg.set_diag(to_mask, diagonal=diag) + + return tf.broadcast_to(input=to_mask, shape=(batch_size, 1, seq_length, seq_length)) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embeddings", None) is not None: + with tf.name_scope(self.embeddings.name): + self.embeddings.build(None) + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "final_layer_norm", None) is not None: + with tf.name_scope(self.final_layer_norm.name): + self.final_layer_norm.build([None, None, self.embed_dim]) + + +@keras_serializable +class TFCLIPTextMainLayer(keras.layers.Layer): + config_class = CLIPTextConfig + + def __init__(self, config: CLIPTextConfig, **kwargs): + super().__init__(**kwargs) + self.config = config + self.text_model = TFCLIPTextTransformer(config, name="text_model") + + def get_input_embeddings(self) -> keras.layers.Layer: + return self.text_model.embeddings + + def set_input_embeddings(self, value: tf.Variable): + self.text_model.embeddings.weight = value + self.text_model.embeddings.vocab_size = shape_list(value)[0] + + @unpack_inputs + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool = False, + ) -> TFBaseModelOutputWithPooling | tuple[tf.Tensor]: + if input_ids is None: + raise 
ValueError("You have to specify input_ids") + + input_shape = shape_list(input_ids) + + if attention_mask is None: + attention_mask = tf.fill(dims=input_shape, value=1) + + text_model_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + return text_model_outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "text_model", None) is not None: + with tf.name_scope(self.text_model.name): + self.text_model.build(None) + + +class TFCLIPVisionTransformer(keras.layers.Layer): + def __init__(self, config: CLIPVisionConfig, **kwargs): + super().__init__(**kwargs) + + self.embeddings = TFCLIPVisionEmbeddings(config, name="embeddings") + self.pre_layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="pre_layrnorm") + self.encoder = TFCLIPEncoder(config, name="encoder") + self.post_layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="post_layernorm") + self.embed_dim = config.hidden_size + + def call( + self, + pixel_values: TFModelInputType, + output_attentions: bool, + output_hidden_states: bool, + return_dict: bool, + training: bool = False, + ) -> TFBaseModelOutputWithPooling | tuple[tf.Tensor]: + embedding_output = self.embeddings(pixel_values=pixel_values) + embedding_output = self.pre_layernorm(inputs=embedding_output) + + encoder_outputs = self.encoder( + hidden_states=embedding_output, + attention_mask=None, + causal_attention_mask=None, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + sequence_output = encoder_outputs[0] + pooled_output = sequence_output[:, 0, :] + pooled_output = self.post_layernorm(inputs=pooled_output) + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return TFBaseModelOutputWithPooling( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embeddings", None) is not None: + with tf.name_scope(self.embeddings.name): + self.embeddings.build(None) + if getattr(self, "pre_layernorm", None) is not None: + with tf.name_scope(self.pre_layernorm.name): + self.pre_layernorm.build([None, None, self.embed_dim]) + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "post_layernorm", None) is not None: + with tf.name_scope(self.post_layernorm.name): + self.post_layernorm.build([None, self.embed_dim]) + + +@keras_serializable +class TFCLIPVisionMainLayer(keras.layers.Layer): + config_class = CLIPVisionConfig + + def __init__(self, config: CLIPVisionConfig, **kwargs): + super().__init__(**kwargs) + self.config = config + self.vision_model = TFCLIPVisionTransformer(config, name="vision_model") + + def get_input_embeddings(self) -> keras.layers.Layer: + return self.vision_model.embeddings + + @unpack_inputs + def call( + self, + pixel_values: TFModelInputType | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool = False, + ) -> TFBaseModelOutputWithPooling | 
tuple[tf.Tensor]: + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + vision_model_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + return vision_model_outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "vision_model", None) is not None: + with tf.name_scope(self.vision_model.name): + self.vision_model.build(None) + + +@keras_serializable +class TFCLIPMainLayer(keras.layers.Layer): + config_class = CLIPConfig + + def __init__(self, config: CLIPConfig, **kwargs): + super().__init__(**kwargs) + + if not isinstance(config.text_config, CLIPTextConfig): + raise TypeError( + "config.text_config is expected to be of type CLIPTextConfig but is of type" + f" {type(config.text_config)}." + ) + + if not isinstance(config.vision_config, CLIPVisionConfig): + raise TypeError( + "config.vision_config is expected to be of type CLIPVisionConfig but is of type" + f" {type(config.vision_config)}." + ) + + self.config = config + + text_config = config.text_config + vision_config = config.vision_config + + self.projection_dim = config.projection_dim + + self.text_model = TFCLIPTextTransformer(text_config, name="text_model") + self.vision_model = TFCLIPVisionTransformer(vision_config, name="vision_model") + + self.visual_projection = keras.layers.Dense( + units=self.projection_dim, + kernel_initializer=get_initializer(vision_config.hidden_size**-0.5 * self.config.initializer_factor), + use_bias=False, + name="visual_projection", + ) + + self.text_projection = keras.layers.Dense( + units=self.projection_dim, + kernel_initializer=get_initializer(text_config.hidden_size**-0.5 * self.config.initializer_factor), + use_bias=False, + name="text_projection", + ) + self.text_embed_dim = text_config.hidden_size + self.vision_embed_dim = vision_config.hidden_size + + def build(self, input_shape: tf.TensorShape = None): + self.logit_scale = self.add_weight( + shape=(1,), + initializer=keras.initializers.Constant(self.config.logit_scale_init_value), + trainable=True, + name="logit_scale", + ) + + if self.built: + return + self.built = True + if getattr(self, "text_model", None) is not None: + with tf.name_scope(self.text_model.name): + self.text_model.build(None) + if getattr(self, "vision_model", None) is not None: + with tf.name_scope(self.vision_model.name): + self.vision_model.build(None) + if getattr(self, "visual_projection", None) is not None: + with tf.name_scope(self.visual_projection.name): + self.visual_projection.build([None, None, self.vision_embed_dim]) + if getattr(self, "text_projection", None) is not None: + with tf.name_scope(self.text_projection.name): + self.text_projection.build([None, None, self.text_embed_dim]) + + @unpack_inputs + def get_text_features( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool = False, + ) -> tf.Tensor: + if input_ids is None: + raise ValueError("You have to specify either input_ids") + + input_shape = shape_list(input_ids) + + if attention_mask is None: + attention_mask = tf.fill(dims=input_shape, value=1) + + text_outputs = self.text_model( + input_ids=input_ids, + 
attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + pooled_output = text_outputs[1] + text_features = self.text_projection(inputs=pooled_output) + + return text_features + + @unpack_inputs + def get_image_features( + self, + pixel_values: TFModelInputType | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool = False, + ) -> tf.Tensor: + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + pooled_output = vision_outputs[1] # pooled_output + image_features = self.visual_projection(inputs=pooled_output) + + return image_features + + @unpack_inputs + def call( + self, + input_ids: TFModelInputType | None = None, + pixel_values: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + return_loss: bool | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool = False, + ) -> TFCLIPOutput | tuple[tf.Tensor]: + if input_ids is None: + raise ValueError("You have to specify either input_ids") + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + input_shape = shape_list(input_ids) + + if attention_mask is None: + attention_mask = tf.fill(dims=input_shape, value=1) + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + image_embeds = vision_outputs[1] + image_embeds = self.visual_projection(inputs=image_embeds) + + text_embeds = text_outputs[1] + text_embeds = self.text_projection(inputs=text_embeds) + + # normalized features + image_embeds = image_embeds / tf.norm(tensor=image_embeds, ord="euclidean", axis=-1, keepdims=True) + text_embeds = text_embeds / tf.norm(tensor=text_embeds, ord="euclidean", axis=-1, keepdims=True) + + # cosine similarity as logits + logit_scale = tf.math.exp(self.logit_scale) + logits_per_text = tf.matmul(text_embeds, image_embeds, transpose_b=True) * logit_scale + logits_per_image = tf.transpose(logits_per_text) + + loss = None + if return_loss: + loss = clip_loss(logits_per_text) + loss = tf.reshape(loss, (1,)) + + if not return_dict: + output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs) + return (loss,) + output if loss is not None else output + + return TFCLIPOutput( + loss=loss, + logits_per_image=logits_per_image, + logits_per_text=logits_per_text, + text_embeds=text_embeds, + image_embeds=image_embeds, + text_model_output=text_outputs, + vision_model_output=vision_outputs, + ) + + +class TFCLIPPreTrainedModel(TFPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and 
loading pretrained + models. + """ + + config_class = CLIPConfig + base_model_prefix = "clip" + _keys_to_ignore_on_load_missing = [r"position_ids"] + _keys_to_ignore_on_load_unexpected = [r"position_ids"] + + +CLIP_START_DOCSTRING = r""" + + This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and + behavior. + + + + TensorFlow models and layers in `transformers` accept two formats as input: + + - having all inputs as keyword arguments (like PyTorch models), or + - having all inputs as a list, tuple or dict in the first positional argument. + + The reason the second format is supported is that Keras methods prefer this format when passing inputs to models + and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just + pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second + format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with + the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first + positional argument: + + - a single Tensor with `input_ids` only and nothing else: `model(input_ids)` + - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: + `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])` + - a dictionary with one or several input Tensors associated to the input names given in the docstring: + `model({"input_ids": input_ids, "token_type_ids": token_type_ids})` + + Note that when creating models and layers with + [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry + about any of this, as you can just pass inputs like you would to any other Python function! + + + + Args: + config ([`CLIPConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights. +""" + +CLIP_TEXT_INPUTS_DOCSTRING = r""" + Args: + input_ids (`np.ndarray`, `tf.Tensor`, `list[tf.Tensor]` ``dict[str, tf.Tensor]` or `dict[str, np.ndarray]` and each example must have the shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and + [`PreTrainedTokenizer.encode`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + position_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. 
Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the + config will be used instead. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. This argument can be used only in eager mode, in graph mode the value in the config will be + used instead. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in + eager mode, in graph mode the value will always be set to True. + training (`bool`, *optional*, defaults to `False``): + Whether or not to use the model in training mode (some modules like dropout modules have different + behaviors between training and evaluation). +""" + +CLIP_VISION_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`np.ndarray`, `tf.Tensor`, `list[tf.Tensor]` ``dict[str, tf.Tensor]` or `dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See + [`CLIPImageProcessor.__call__`] for details. output_attentions (`bool`, *optional*): Whether or not to + return the attentions tensors of all attention layers. See `attentions` under returned tensors for more + detail. This argument can be used only in eager mode, in graph mode the value in the config will be used + instead. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. This argument can be used only in eager mode, in graph mode the value in the config will be + used instead. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in + eager mode, in graph mode the value will always be set to True. + training (`bool`, *optional*, defaults to `False``): + Whether or not to use the model in training mode (some modules like dropout modules have different + behaviors between training and evaluation). +""" + +CLIP_INPUTS_DOCSTRING = r""" + Args: + input_ids (`np.ndarray`, `tf.Tensor`, `list[tf.Tensor]` ``dict[str, tf.Tensor]` or `dict[str, np.ndarray]` and each example must have the shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and + [`PreTrainedTokenizer.encode`] for details. + + [What are input IDs?](../glossary#input-ids) + pixel_values (`np.ndarray`, `tf.Tensor`, `list[tf.Tensor]` `dict[str, tf.Tensor]` or `dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See + [`CLIPImageProcessor.__call__`] for details. + attention_mask (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. 
+ + [What are attention masks?](../glossary#attention-mask) + position_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + return_loss (`bool`, *optional*): + Whether or not to return the contrastive loss. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the + config will be used instead. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. This argument can be used only in eager mode, in graph mode the value in the config will be + used instead. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in + eager mode, in graph mode the value will always be set to True. + training (`bool`, *optional*, defaults to `False``): + Whether or not to use the model in training mode (some modules like dropout modules have different + behaviors between training and evaluation). +""" + + +class TFCLIPTextModel(TFCLIPPreTrainedModel): + config_class = CLIPTextConfig + + def __init__(self, config: CLIPTextConfig, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.clip = TFCLIPTextMainLayer(config, name="clip") + + @unpack_inputs + @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=TFBaseModelOutputWithPooling, config_class=CLIPTextConfig) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool | None = False, + ) -> TFBaseModelOutputWithPooling | tuple[tf.Tensor]: + r""" + Returns: + + Examples: + + ```python + >>> from transformers import AutoTokenizer, TFCLIPTextModel + + >>> model = TFCLIPTextModel.from_pretrained("openai/clip-vit-base-patch32") + >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32") + + >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="tf") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + >>> pooled_output = outputs.pooler_output # pooled (EOS token) states + ```""" + + outputs = self.clip( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "clip", None) is not None: + with tf.name_scope(self.clip.name): + self.clip.build(None) + + +class TFCLIPVisionModel(TFCLIPPreTrainedModel): + config_class = CLIPVisionConfig + main_input_name = "pixel_values" + + def __init__(self, config: CLIPVisionConfig, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.clip = TFCLIPVisionMainLayer(config, name="clip") 
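+        # Like TFCLIPTextModel above, the main layer is stored under the attribute name "clip",
+        # which matches `base_model_prefix = "clip"` on TFCLIPPreTrainedModel and appears to keep
+        # the saved weight names consistent across the CLIP model classes.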
+ + @unpack_inputs + @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=TFBaseModelOutputWithPooling, config_class=CLIPVisionConfig) + def call( + self, + pixel_values: TFModelInputType | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool | None = False, + ) -> TFBaseModelOutputWithPooling | tuple[tf.Tensor]: + r""" + Returns: + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, TFCLIPVisionModel + + >>> model = TFCLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32") + >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="tf") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + >>> pooled_output = outputs.pooler_output # pooled CLS states + ```""" + + outputs = self.clip( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "clip", None) is not None: + with tf.name_scope(self.clip.name): + self.clip.build(None) + + +@add_start_docstrings(CLIP_START_DOCSTRING) +class TFCLIPModel(TFCLIPPreTrainedModel): + config_class = CLIPConfig + + def __init__(self, config: CLIPConfig, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.clip = TFCLIPMainLayer(config, name="clip") + + @unpack_inputs + @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + def get_text_features( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool = False, + ) -> tf.Tensor: + r""" + Returns: + text_features (`tf.Tensor` of shape `(batch_size, output_dim`): The text embeddings obtained by applying + the projection layer to the pooled output of [`TFCLIPTextModel`]. 
+ + Examples: + + ```python + >>> from transformers import AutoTokenizer, TFCLIPModel + + >>> model = TFCLIPModel.from_pretrained("openai/clip-vit-base-patch32") + >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32") + + >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="tf") + >>> text_features = model.get_text_features(**inputs) + ```""" + + text_features = self.clip.get_text_features( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + return text_features + + @unpack_inputs + @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING) + def get_image_features( + self, + pixel_values: TFModelInputType | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool = False, + ) -> tf.Tensor: + r""" + Returns: + image_features (`tf.Tensor` of shape `(batch_size, output_dim`): The image embeddings obtained by applying + the projection layer to the pooled output of [`TFCLIPVisionModel`]. + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, TFCLIPModel + + >>> model = TFCLIPModel.from_pretrained("openai/clip-vit-base-patch32") + >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="tf") + + >>> image_features = model.get_image_features(**inputs) + ```""" + + image_features = self.clip.get_image_features( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + return image_features + + @unpack_inputs + @add_start_docstrings_to_model_forward(CLIP_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=TFCLIPOutput, config_class=CLIPConfig) + def call( + self, + input_ids: TFModelInputType | None = None, + pixel_values: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + return_loss: bool | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool = False, + ) -> TFCLIPOutput | tuple[tf.Tensor]: + r""" + Returns: + + Examples: + + ```python + >>> import tensorflow as tf + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, TFCLIPModel + + >>> model = TFCLIPModel.from_pretrained("openai/clip-vit-base-patch32") + >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor( + ... text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="tf", padding=True + ... 
) + + >>> outputs = model(**inputs) + >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score + >>> probs = tf.nn.softmax(logits_per_image, axis=1) # we can take the softmax to get the label probabilities + ```""" + + outputs = self.clip( + input_ids=input_ids, + pixel_values=pixel_values, + attention_mask=attention_mask, + position_ids=position_ids, + return_loss=return_loss, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + return outputs + + def serving_output(self, output: TFCLIPOutput) -> TFCLIPOutput: + # TODO: As is this currently fails with saved_model=True, because + # TensorFlow cannot trace through nested dataclasses. Reference: + # https://github.com/huggingface/transformers/pull/16886 + return output + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "clip", None) is not None: + with tf.name_scope(self.clip.name): + self.clip.build(None) + + +__all__ = ["TFCLIPModel", "TFCLIPPreTrainedModel", "TFCLIPTextModel", "TFCLIPVisionModel"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/clip/processing_clip.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/clip/processing_clip.py new file mode 100644 index 0000000000000000000000000000000000000000..893346e798bb2df38e9f05e2692327fd316fb0ab --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/clip/processing_clip.py @@ -0,0 +1,73 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Image/Text processor class for CLIP +""" + +import warnings + +from ...processing_utils import ProcessorMixin + + +class CLIPProcessor(ProcessorMixin): + r""" + Constructs a CLIP processor which wraps a CLIP image processor and a CLIP tokenizer into a single processor. + + [`CLIPProcessor`] offers all the functionalities of [`CLIPImageProcessor`] and [`CLIPTokenizerFast`]. See the + [`~CLIPProcessor.__call__`] and [`~CLIPProcessor.decode`] for more information. + + Args: + image_processor ([`CLIPImageProcessor`], *optional*): + The image processor is a required input. + tokenizer ([`AutoTokenizer`], *optional*): + The tokenizer is a required input. 
+ """ + + attributes = ["image_processor", "tokenizer"] + image_processor_class = ("CLIPImageProcessor", "CLIPImageProcessorFast") + tokenizer_class = "AutoTokenizer" + + def __init__(self, image_processor=None, tokenizer=None, **kwargs): + feature_extractor = None + if "feature_extractor" in kwargs: + warnings.warn( + "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`" + " instead.", + FutureWarning, + ) + feature_extractor = kwargs.pop("feature_extractor") + + image_processor = image_processor if image_processor is not None else feature_extractor + + super().__init__(image_processor, tokenizer) + + @property + def feature_extractor_class(self): + warnings.warn( + "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.", + FutureWarning, + ) + return self.image_processor_class + + @property + def feature_extractor(self): + warnings.warn( + "`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.", + FutureWarning, + ) + return self.image_processor + + +__all__ = ["CLIPProcessor"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/clip/tokenization_clip.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/clip/tokenization_clip.py new file mode 100644 index 0000000000000000000000000000000000000000..625d26dc69605ac5432fab21a5f0da449cbf08e3 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/clip/tokenization_clip.py @@ -0,0 +1,519 @@ +# coding=utf-8 +# Copyright 2021 The Open AI Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for CLIP.""" + +import json +import os +import unicodedata +from functools import lru_cache +from typing import Optional + +import regex as re + +from ...tokenization_utils import AddedToken, PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace +from ...utils import logging + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = { + "vocab_file": "vocab.json", + "merges_file": "merges.txt", +} + + +@lru_cache +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control + characters the bpe code barfs on. + + The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab + if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for + decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup + tables between utf-8 bytes and unicode strings. 
+ """ + bs = ( + list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) + ) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + +def get_pairs(word): + """ + Return set of symbol pairs in a word. + + Word is represented as tuple of symbols (symbols being variable-length strings). + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +def whitespace_clean(text): + text = re.sub(r"\s+", " ", text) + text = text.strip() + return text + + +# Copied from transformers.models.bert.tokenization_bert.whitespace_tokenize +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a piece of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +# Copied from transformers.models.bert.tokenization_bert.BasicTokenizer +class BasicTokenizer: + """ + Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.). + + Args: + do_lower_case (`bool`, *optional*, defaults to `True`): + Whether or not to lowercase the input when tokenizing. + never_split (`Iterable`, *optional*): + Collection of tokens which will never be split during tokenization. Only has an effect when + `do_basic_tokenize=True` + tokenize_chinese_chars (`bool`, *optional*, defaults to `True`): + Whether or not to tokenize Chinese characters. + + This should likely be deactivated for Japanese (see this + [issue](https://github.com/huggingface/transformers/issues/328)). + strip_accents (`bool`, *optional*): + Whether or not to strip all accents. If this option is not specified, then it will be determined by the + value for `lowercase` (as in the original BERT). + do_split_on_punc (`bool`, *optional*, defaults to `True`): + In some instances we want to skip the basic punctuation splitting so that later tokenization can capture + the full context of the words, such as contractions. + """ + + def __init__( + self, + do_lower_case=True, + never_split=None, + tokenize_chinese_chars=True, + strip_accents=None, + do_split_on_punc=True, + ): + if never_split is None: + never_split = [] + self.do_lower_case = do_lower_case + self.never_split = set(never_split) + self.tokenize_chinese_chars = tokenize_chinese_chars + self.strip_accents = strip_accents + self.do_split_on_punc = do_split_on_punc + + def tokenize(self, text, never_split=None): + """ + Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer. + + Args: + never_split (`List[str]`, *optional*) + Kept for backward compatibility purposes. Now implemented directly at the base class level (see + [`PreTrainedTokenizer.tokenize`]) List of token not to split. + """ + # union() returns a new set by concatenating the two sets. + never_split = self.never_split.union(set(never_split)) if never_split else self.never_split + text = self._clean_text(text) + + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). 
+ if self.tokenize_chinese_chars: + text = self._tokenize_chinese_chars(text) + # prevents treating the same character with different unicode codepoints as different characters + unicode_normalized_text = unicodedata.normalize("NFC", text) + orig_tokens = whitespace_tokenize(unicode_normalized_text) + split_tokens = [] + for token in orig_tokens: + if token not in never_split: + if self.do_lower_case: + token = token.lower() + if self.strip_accents is not False: + token = self._run_strip_accents(token) + elif self.strip_accents: + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token, never_split)) + + output_tokens = whitespace_tokenize(" ".join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text, never_split=None): + """Splits punctuation on a piece of text.""" + if not self.do_split_on_punc or (never_split is not None and text in never_split): + return [text] + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. + if ( + (cp >= 0x4E00 and cp <= 0x9FFF) + or (cp >= 0x3400 and cp <= 0x4DBF) + or (cp >= 0x20000 and cp <= 0x2A6DF) + or (cp >= 0x2A700 and cp <= 0x2B73F) + or (cp >= 0x2B740 and cp <= 0x2B81F) + or (cp >= 0x2B820 and cp <= 0x2CEAF) + or (cp >= 0xF900 and cp <= 0xFAFF) + or (cp >= 0x2F800 and cp <= 0x2FA1F) + ): + return True + + return False + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xFFFD or _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + + +class CLIPTokenizer(PreTrainedTokenizer): + """ + Construct a CLIP tokenizer. Based on byte-level Byte-Pair-Encoding. + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + Path to the vocabulary file. + merges_file (`str`): + Path to the merges file. 
+ errors (`str`, *optional*, defaults to `"replace"`): + Paradigm to follow when decoding bytes to UTF-8. See + [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information. + unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + bos_token (`str`, *optional*, defaults to `"<|startoftext|>"`): + The beginning of sequence token. + eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`): + The end of sequence token. + pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`): + The token used for padding, for example when batching sequences of different lengths. + """ + + vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "attention_mask"] + + def __init__( + self, + vocab_file, + merges_file, + errors="replace", + unk_token="<|endoftext|>", + bos_token="<|startoftext|>", + eos_token="<|endoftext|>", + pad_token="<|endoftext|>", # hack to enable padding + **kwargs, + ): + bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token + eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token + unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token + try: + import ftfy + + self.fix_text = ftfy.fix_text + except ImportError: + logger.info("ftfy or spacy is not installed using custom BasicTokenizer instead of ftfy.") + self.nlp = BasicTokenizer(strip_accents=False, do_split_on_punc=False) + self.fix_text = None + + with open(vocab_file, encoding="utf-8") as vocab_handle: + self.encoder = json.load(vocab_handle) + self.decoder = {v: k for k, v in self.encoder.items()} + self.errors = errors # how to handle errors in decoding + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} + with open(merges_file, encoding="utf-8") as merges_handle: + bpe_merges = merges_handle.read().strip().split("\n")[1 : 49152 - 256 - 2 + 1] + bpe_merges = [tuple(merge.split()) for merge in bpe_merges] + self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) + self.cache = {"<|startoftext|>": "<|startoftext|>", "<|endoftext|>": "<|endoftext|>"} + + self.pat = re.compile( + r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", + re.IGNORECASE, + ) + + super().__init__( + errors=errors, + unk_token=unk_token, + bos_token=bos_token, + eos_token=eos_token, + pad_token=pad_token, + **kwargs, + ) + + @property + def vocab_size(self): + return len(self.encoder) + + def get_vocab(self): + return dict(self.encoder, **self.added_tokens_encoder) + + def build_inputs_with_special_tokens( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A CLIP sequence has the following format: + + - single sequence: `<|startoftext|> X <|endoftext|>` + + Pairs of sequences are not the expected use case, but they will be handled without a separator. + + Args: + token_ids_0 (`list[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`list[int]`, *optional*): + Optional second list of IDs for sequence pairs. 
+ + Returns: + `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + bos_token = [self.bos_token_id] + eos_token = [self.eos_token_id] + + if token_ids_1 is None: + return bos_token + token_ids_0 + eos_token + return bos_token + token_ids_0 + eos_token + eos_token + token_ids_1 + eos_token + + def get_special_tokens_mask( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False + ) -> list[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`list[int]`): + List of IDs. + token_ids_1 (`list[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + [1] + ([0] * len(token_ids_1)) + [1] + + def create_token_type_ids_from_sequences( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Create a mask from the two sequences passed. CLIP does not make use of token type ids, therefore a list of + zeros is returned. + + Args: + token_ids_0 (`list[int]`): + List of IDs. + token_ids_1 (`list[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `list[int]`: List of zeros. 
+ """ + bos_token = [self.bos_token_id] + eos_token = [self.eos_token_id] + + if token_ids_1 is None: + return len(bos_token + token_ids_0 + eos_token) * [0] + return len(bos_token + token_ids_0 + eos_token + eos_token + token_ids_1 + eos_token) * [0] + + def bpe(self, token): + if token in self.cache: + return self.cache[token] + word = tuple(token[:-1]) + (token[-1] + "",) + pairs = get_pairs(word) + + if not pairs: + return token + "" + + while True: + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + except ValueError: + new_word.extend(word[i:]) + break + else: + new_word.extend(word[i:j]) + i = j + + if word[i] == first and i < len(word) - 1 and word[i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = " ".join(word) + self.cache[token] = word + return word + + def _tokenize(self, text): + """Tokenize a string.""" + bpe_tokens = [] + if self.fix_text is None: + text = " ".join(self.nlp.tokenize(text)) + else: + text = whitespace_clean(self.fix_text(text)).lower() + + for token in re.findall(self.pat, text): + token = "".join( + self.byte_encoder[b] for b in token.encode("utf-8") + ) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case) + bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" ")) + return bpe_tokens + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.encoder.get(token, self.encoder.get(self.unk_token)) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.decoder.get(index) + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + text = "".join(tokens) + byte_array = bytearray([self.byte_decoder[c] for c in text]) + text = byte_array.decode("utf-8", errors=self.errors).replace("", " ").strip() + return text + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + merge_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"] + ) + + with open(vocab_file, "w", encoding="utf-8") as f: + f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n") + + index = 0 + with open(merge_file, "w", encoding="utf-8") as writer: + writer.write("#version: 0.2\n") + for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning( + f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive." + " Please check that the tokenizer is not corrupted!" 
+ ) + index = token_index + writer.write(" ".join(bpe_tokens) + "\n") + index += 1 + + return vocab_file, merge_file + + +__all__ = ["CLIPTokenizer"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/clip/tokenization_clip_fast.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/clip/tokenization_clip_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..c859d4572df76638ab881113ff187e75dd211de0 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/clip/tokenization_clip_fast.py @@ -0,0 +1,164 @@ +# coding=utf-8 +# Copyright 2021 The Open AI Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for OpenAI GPT.""" + +from typing import Optional + +from tokenizers import pre_tokenizers + +from ...tokenization_utils_fast import PreTrainedTokenizerFast +from ...utils import logging +from .tokenization_clip import CLIPTokenizer + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"} + + +class CLIPTokenizerFast(PreTrainedTokenizerFast): + """ + Construct a "fast" CLIP tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level + Byte-Pair-Encoding. + + This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should + refer to this superclass for more information regarding those methods. + + Args: + vocab_file (`str`, *optional*): + Path to the vocabulary file. + merges_file (`str`, *optional*): + Path to the merges file. + tokenizer_file (`str`, *optional*): + The path to a tokenizer file to use instead of the vocab file. + unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + bos_token (`str`, *optional*, defaults to `"<|startoftext|>"`): + The beginning of sequence token. + eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`): + The end of sequence token. + pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`): + The token used for padding, for example when batching sequences of different lengths. 
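+
+    Example (a minimal sketch; the checkpoint name `"openai/clip-vit-base-patch32"` is an illustrative
+    assumption):
+
+    ```python
+    >>> from transformers import CLIPTokenizerFast
+
+    >>> tokenizer = CLIPTokenizerFast.from_pretrained("openai/clip-vit-base-patch32")  # assumed checkpoint id
+    >>> enc = tokenizer("a photo of a cat")
+    >>> (enc.input_ids[0], enc.input_ids[-1]) == (tokenizer.bos_token_id, tokenizer.eos_token_id)
+    True
+    ```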
+ """ + + vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "attention_mask"] + slow_tokenizer_class = CLIPTokenizer + + def __init__( + self, + vocab_file=None, + merges_file=None, + tokenizer_file=None, + unk_token="<|endoftext|>", + bos_token="<|startoftext|>", + eos_token="<|endoftext|>", + pad_token="<|endoftext|>", # hack to enable padding + **kwargs, + ): + super().__init__( + vocab_file, + merges_file, + tokenizer_file=tokenizer_file, + unk_token=unk_token, + bos_token=bos_token, + eos_token=eos_token, + pad_token=pad_token, + **kwargs, + ) + + if not isinstance(self.backend_tokenizer.pre_tokenizer, pre_tokenizers.Sequence): + raise TypeError( + "The `backend_tokenizer` provided does not match the expected format. The CLIP tokenizer has been" + " heavily modified from transformers version 4.17.0. You need to convert the tokenizer you are using" + " to be compatible with this version.The easiest way to do so is" + ' `CLIPTokenizerFast.from_pretrained("path_to_local_folder_or_hub_repo, from_slow=True)`. If you want' + " to use your existing tokenizer, you will have to revert to a version prior to 4.17.0 of" + " transformers." + ) + self._wrap_decode_method_backend_tokenizer() + + # Very ugly hack to enable padding to have a correct decoding see https://github.com/huggingface/tokenizers/issues/872 + def _wrap_decode_method_backend_tokenizer(self): + orig_decode_method = self.backend_tokenizer.decode + + ## define this as a local variable to avoid circular reference + ## See: https://github.com/huggingface/transformers/issues/30930 + end_of_word_suffix = self.backend_tokenizer.model.end_of_word_suffix + + def new_decode_method(*args, **kwargs): + text = orig_decode_method(*args, **kwargs) + text = text.replace(end_of_word_suffix, " ").strip() + return text + + self.backend_tokenizer.decode = new_decode_method + + def build_inputs_with_special_tokens( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A CLIP sequence has the following format: + + - single sequence: `<|startoftext|> X <|endoftext|>` + + Pairs of sequences are not the expected use case, but they will be handled without a separator. + + Args: + token_ids_0 (`list[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`list[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + bos_token = [self.bos_token_id] + eos_token = [self.eos_token_id] + + if token_ids_1 is None: + return bos_token + token_ids_0 + eos_token + return bos_token + token_ids_0 + eos_token + eos_token + token_ids_1 + eos_token + + def create_token_type_ids_from_sequences( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Create a mask from the two sequences passed. CLIP does not make use of token type ids, therefore a list of + zeros is returned. + + Args: + token_ids_0 (`list[int]`): + List of IDs. + token_ids_1 (`list[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `list[int]`: List of zeros. 
+ """ + bos_token = [self.bos_token_id] + eos_token = [self.eos_token_id] + + if token_ids_1 is None: + return len(bos_token + token_ids_0 + eos_token) * [0] + return len(bos_token + token_ids_0 + eos_token + eos_token + token_ids_1 + eos_token) * [0] + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + files = self._tokenizer.model.save(save_directory, name=filename_prefix) + return tuple(files) + + +__all__ = ["CLIPTokenizerFast"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/code_llama/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/code_llama/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d1fba79fab29dd96bef5f6c312cd46a79ce62003 Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/code_llama/__pycache__/__init__.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/code_llama/__pycache__/tokenization_code_llama.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/code_llama/__pycache__/tokenization_code_llama.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ff23477c2d15e0775723ebf61579012f3dfbb8d0 Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/code_llama/__pycache__/tokenization_code_llama.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/code_llama/__pycache__/tokenization_code_llama_fast.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/code_llama/__pycache__/tokenization_code_llama_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..16517b95465d8f9f19a71987cbf660577857a232 Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/code_llama/__pycache__/tokenization_code_llama_fast.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/cohere/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/cohere/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ad2d57500c44322b6c749a313f07c07e11f8f20f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/cohere/__init__.py @@ -0,0 +1,28 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_cohere import * + from .modeling_cohere import * + from .tokenization_cohere_fast import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/cohere/configuration_cohere.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/cohere/configuration_cohere.py new file mode 100644 index 0000000000000000000000000000000000000000..c78d1e9bf8a11c45246a5cb391769566dc69ce50 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/cohere/configuration_cohere.py @@ -0,0 +1,217 @@ +# coding=utf-8 +# Copyright 2024 Cohere team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Cohere model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...modeling_rope_utils import rope_config_validation +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class CohereConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`CohereModel`]. It is used to instantiate an Cohere + model according to the specified arguments, defining the model architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. Instantiating a configuration + with the defaults will yield a similar configuration to that of the [CohereForAI/c4ai-command-r-v01](https://huggingface.co/CohereForAI/c4ai-command-r-v01) model. + + + Args: + vocab_size (`int`, *optional*, defaults to 256000): + Vocabulary size of the Cohere model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`CohereModel`] + hidden_size (`int`, *optional*, defaults to 8192): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 22528): + Dimension of the MLP representations. + logit_scale (`float`, *optional*, defaults to 0.0625): + The scaling factor for the output logits. + num_hidden_layers (`int`, *optional*, defaults to 40): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 64): + Number of attention heads for each attention layer in the Transformer decoder. + num_key_value_heads (`int`, *optional*): + This is the number of key_value heads that should be used to implement Grouped Query Attention. 
If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details, check out [this + paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to + `num_attention_heads`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 8192): + The maximum sequence length that this model might ever be used with. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the layer normalization. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + pad_token_id (`int`, *optional*, defaults to 0): + Padding token id. + bos_token_id (`int`, *optional*, defaults to 5): + Beginning of stream token id. + eos_token_id (`int`, *optional*, defaults to 255001): + End of stream token id. + tie_word_embeddings (`bool`, *optional*, defaults to `True`): + Whether to tie weight embeddings + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + rope_scaling (`Dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type + and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value + accordingly. + Expected contents: + `rope_type` (`str`): + The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', + 'llama3'], with 'default' being the original RoPE implementation. + `factor` (`float`, *optional*): + Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In + most scaling types, a `factor` of x will enable the model to handle sequences of length x * + original maximum pre-trained length. + `original_max_position_embeddings` (`int`, *optional*): + Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during + pretraining. + `attention_factor` (`float`, *optional*): + Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention + computation. If unspecified, it defaults to value recommended by the implementation, using the + `factor` field to infer the suggested value. + `beta_fast` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear + ramp function. If unspecified, it defaults to 32. + `beta_slow` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear + ramp function. If unspecified, it defaults to 1. + `short_factor` (`list[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to short contexts (< + `original_max_position_embeddings`). 
Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `long_factor` (`list[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to long contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `low_freq_factor` (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE + `high_freq_factor` (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): + Whether to use a bias in the query, key, value and output projection layers during self-attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + use_qk_norm (`bool`, *optional*, defaults to `False`): + Whether to use query-key normalization in the attention + + ```python + >>> from transformers import CohereModel, CohereConfig + + >>> # Initializing a Cohere model configuration + >>> configuration = CohereConfig() + + >>> # Initializing a model from the Cohere configuration + >>> model = CohereModel(configuration) # doctest: +SKIP + + >>> # Accessing the model configuration + >>> configuration = model.config # doctest: +SKIP + ```""" + + model_type = "cohere" + keys_to_ignore_at_inference = ["past_key_values"] + base_model_tp_plan = { + "layers.*.self_attn.q_proj": "colwise", + "layers.*.self_attn.k_proj": "colwise", + "layers.*.self_attn.v_proj": "colwise", + "layers.*.self_attn.o_proj": "rowwise", + "layers.*.mlp.gate_proj": "colwise", + "layers.*.mlp.up_proj": "colwise", + "layers.*.mlp.down_proj": "rowwise", + } + base_model_pp_plan = { + "embed_tokens": (["input_ids"], ["inputs_embeds"]), + "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), + "norm": (["hidden_states"], ["hidden_states"]), + } + + def __init__( + self, + vocab_size=256000, + hidden_size=8192, + intermediate_size=22528, + logit_scale=0.0625, + num_hidden_layers=40, + num_attention_heads=64, + num_key_value_heads=None, + hidden_act="silu", + max_position_embeddings=8192, + initializer_range=0.02, + layer_norm_eps=1e-5, + use_cache=True, + pad_token_id=0, + bos_token_id=5, + eos_token_id=255001, + tie_word_embeddings=True, + rope_theta=10000.0, + rope_scaling=None, + attention_bias=False, + attention_dropout=0.0, + use_qk_norm=False, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.logit_scale = logit_scale + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + self.use_qk_norm = use_qk_norm + + # Validate the correctness of rotary position embeddings parameters + rope_config_validation(self) + + super().__init__( + pad_token_id=pad_token_id, + 
bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + +__all__ = ["CohereConfig"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/cohere/modeling_cohere.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/cohere/modeling_cohere.py new file mode 100644 index 0000000000000000000000000000000000000000..1dfa0ce0be33a6f3383e76d81de4f3895ab70bea --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/cohere/modeling_cohere.py @@ -0,0 +1,534 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/cohere/modular_cohere.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_cohere.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2024 Cohere team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This file is based on the LLama model definition file in transformers + + +from typing import Callable, Optional, Union + +import torch +from torch import nn + +from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache +from ...generation import GenerationMixin +from ...masking_utils import create_causal_mask +from ...modeling_flash_attention_utils import FlashAttentionKwargs +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast +from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...processing_utils import Unpack +from ...utils import TransformersKwargs, auto_docstring, can_return_tuple +from ...utils.deprecation import deprecate_kwarg +from ...utils.generic import check_model_inputs +from .configuration_cohere import CohereConfig + + +class CohereLayerNorm(nn.Module): + def __init__(self, hidden_size=None, eps=1e-5, bias=False): + """The hidden size can be a tuple or an int. 
The tuple is used for QKNorm to normalize across head_dim""" + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + mean = hidden_states.mean(-1, keepdim=True) + variance = (hidden_states - mean).pow(2).mean(-1, keepdim=True) + hidden_states = (hidden_states - mean) * torch.rsqrt(variance + self.variance_epsilon) + hidden_states = self.weight.to(torch.float32) * hidden_states + return hidden_states.to(input_dtype) + + +class CohereRotaryEmbedding(nn.Module): + inv_freq: torch.Tensor # fix linting for `register_buffer` + + def __init__(self, config: CohereConfig, device=None): + super().__init__() + # BC: "rope_type" was originally "type" + if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): + self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) + else: + self.rope_type = "default" + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + + inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = self.inv_freq + + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) + def forward(self, x, position_ids): + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + position_ids_expanded = position_ids[:, None, :].float() + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.repeat_interleave(freqs, 2, dim=-1) # diff from Llama: we interleave() instead of cat() + cos = emb.cos() * self.attention_scaling + sin = emb.sin() * self.attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +class CohereMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + return down_proj + + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). 
The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + dropout: float = 0.0, + **kwargs: Unpack[TransformersKwargs], +): + key_states = repeat_kv(key, module.num_key_value_groups) + value_states = repeat_kv(value, module.num_key_value_groups) + + attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling + if attention_mask is not None: + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + attn_output = torch.matmul(attn_weights, value_states) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +def rotate_half(x): + # Split and rotate. Note that this function is different from e.g. Llama. + x1 = x[..., ::2] + x2 = x[..., 1::2] + rot_x = torch.stack([-x2, x1], dim=-1).flatten(-2) + return rot_x + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
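+
+        Note (worked example, derived from `rotate_half` above rather than stated elsewhere): Cohere rotates
+        interleaved pairs instead of splitting the head dimension into halves as Llama does, so for a vector
+        `[x0, x1, x2, x3]`, `rotate_half` returns `[-x1, x0, -x3, x2]`, and the matching `cos`/`sin` values are
+        repeated element-wise (`torch.repeat_interleave(freqs, 2, dim=-1)` in `CohereRotaryEmbedding`) instead
+        of concatenated.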
+ """ + dtype = q.dtype + q = q.float() + k = k.float() + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed.to(dtype=dtype), k_embed.to(dtype=dtype) + + +class CohereAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: CohereConfig, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) + self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads + self.scaling = self.head_dim**-0.5 + self.attention_dropout = config.attention_dropout + self.is_causal = True + + self.q_proj = nn.Linear( + config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias + ) + self.k_proj = nn.Linear( + config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias + ) + self.v_proj = nn.Linear( + config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias + ) + self.o_proj = nn.Linear( + config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias + ) + self.use_qk_norm = config.use_qk_norm + if self.use_qk_norm: + # When sharding the model using Tensor Parallelism, need to be careful to use n_local_heads + self.q_norm = CohereLayerNorm( + hidden_size=(config.num_attention_heads, self.head_dim), eps=config.layer_norm_eps + ) + self.k_norm = CohereLayerNorm( + hidden_size=(config.num_key_value_heads, self.head_dim), eps=config.layer_norm_eps + ) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + attention_mask: Optional[torch.Tensor], + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[FlashAttentionKwargs], + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, self.head_dim) + + query_states = self.q_proj(hidden_states).view(hidden_shape) + key_states = self.k_proj(hidden_states).view(hidden_shape) + value_states = self.v_proj(hidden_states).view(hidden_shape) + + if self.use_qk_norm: # main diff from Llama + query_states = self.q_norm(query_states) + key_states = self.k_norm(key_states) + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_values is not None: + # sin and cos are specific to RoPE models; position_ids needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=0.0 if not self.training else self.attention_dropout, + scaling=self.scaling, + 
**kwargs, + ) + + attn_output = attn_output.reshape(*input_shape, -1).contiguous() + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights + + +class CohereDecoderLayer(GradientCheckpointingLayer): + def __init__(self, config: CohereConfig, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + self.self_attn = CohereAttention(config=config, layer_idx=layer_idx) + self.mlp = CohereMLP(config) + self.input_layernorm = CohereLayerNorm(hidden_size=(config.hidden_size), eps=config.layer_norm_eps) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + **kwargs: Unpack[FlashAttentionKwargs], + ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): + attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1, + query_sequence_length, key_sequence_length)` if default attention is used. + past_key_values (`Cache`, *optional*): cached past key and value projection states + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence + position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*): + Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`, + with `head_dim` being the embedding dimension of each attention head. 
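+
+        Note (structural sketch matching the implementation below): unlike Llama's sequential blocks, this layer
+        uses a parallel residual layout in which self-attention and the MLP both read the same normalized input:
+
+        ```python
+        normed = input_layernorm(hidden_states)
+        hidden_states = hidden_states + self_attn(normed) + mlp(normed)
+        ```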
+ """ + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + + hidden_states_attention, _ = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + **kwargs, + ) + + hidden_states_mlp = self.mlp(hidden_states) + hidden_states = residual + hidden_states_attention + hidden_states_mlp + return hidden_states + + +@auto_docstring +class CoherePreTrainedModel(PreTrainedModel): + config: CohereConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["CohereDecoderLayer"] + _skip_keys_device_placement = ["past_key_values"] + _supports_flash_attn = True + _supports_sdpa = True + _supports_flex_attn = True + + _can_compile_fullgraph = True + _supports_attention_backend = True + _can_record_outputs = { + "hidden_states": CohereDecoderLayer, + "attentions": CohereAttention, + } + + +@auto_docstring +class CohereModel(CoherePreTrainedModel): + def __init__(self, config: CohereConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList( + [CohereDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.norm = CohereLayerNorm(hidden_size=(config.hidden_size), eps=config.layer_norm_eps) + self.rotary_emb = CohereRotaryEmbedding(config=config) + self.gradient_checkpointing = False + + # Initialize weights and apply final processing + self.post_init() + + @check_model_inputs + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + cache_position: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> BaseModelOutputWithPast: + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds: torch.Tensor = self.embed_tokens(input_ids) + + if use_cache and past_key_values is None: + past_key_values = DynamicCache(config=self.config) + + if cache_position is None: + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + cache_position: torch.Tensor = torch.arange( + past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device + ) + + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + causal_mask = create_causal_mask( + config=self.config, + input_embeds=inputs_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + past_key_values=past_key_values, + position_ids=position_ids, + ) + + hidden_states = inputs_embeds + position_embeddings = self.rotary_emb(hidden_states, position_ids) + + for decoder_layer in self.layers[: self.config.num_hidden_layers]: + hidden_states = decoder_layer( + hidden_states, + attention_mask=causal_mask, + position_ids=position_ids, + past_key_values=past_key_values, + cache_position=cache_position, + position_embeddings=position_embeddings, + **kwargs, + ) + + hidden_states = self.norm(hidden_states) + return 
BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=past_key_values, + ) + + +@auto_docstring +class CohereForCausalLM(CoherePreTrainedModel, GenerationMixin): + _tied_weights_keys = ["lm_head.weight"] + _tp_plan = {"lm_head": "colwise_rep"} + _pp_plan = {"lm_head": (["hidden_states"], ["logits"])} + + def __init__(self, config): + super().__init__(config) + self.model = CohereModel(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + self.logit_scale = config.logit_scale + self.tie_word_embeddings = config.tie_word_embeddings + + # Initialize weights and apply final processing + self.post_init() + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **kwargs: Unpack[TransformersKwargs], + ) -> CausalLMOutputWithPast: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Example: + + ```python + >> from transformers import AutoTokenizer, CohereForCausalLM + + >> model = CohereForCausalLM.from_pretrained("CohereForAI/c4ai-command-r-v01") + >> tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-v01") + + >> prompt = "Hey, are you conscious? Can you talk to me?" + >> inputs = tokenizer(prompt, return_tensors="pt") + + >> # Generate + >> generate_ids = model.generate(inputs.input_ids, max_length=30) + >> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." 
+ ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs: BaseModelOutputWithPast = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs.last_hidden_state + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) + logits = logits * self.logit_scale # main diff from Llama + + loss = None + if labels is not None: + loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs) + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +__all__ = ["CohereForCausalLM", "CohereModel", "CoherePreTrainedModel"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/cohere/modular_cohere.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/cohere/modular_cohere.py new file mode 100644 index 0000000000000000000000000000000000000000..daa12a15ed268967df7a3852424ba98944a646b1 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/cohere/modular_cohere.py @@ -0,0 +1,355 @@ +# coding=utf-8 +# Copyright 2024 Cohere team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# This file is based on the LLama model definition file in transformers + +"""PyTorch Cohere model.""" + +from typing import Callable, Optional, Union + +import torch +from torch import nn + +from ...cache_utils import Cache +from ...modeling_flash_attention_utils import FlashAttentionKwargs +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast +from ...modeling_rope_utils import dynamic_rope_update +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS +from ...processing_utils import Unpack +from ...utils import TransformersKwargs, logging +from ...utils.deprecation import deprecate_kwarg +from ..llama.modeling_llama import ( + LlamaAttention, + LlamaForCausalLM, + LlamaMLP, + LlamaModel, + LlamaRotaryEmbedding, + eager_attention_forward, +) +from .configuration_cohere import CohereConfig + + +logger = logging.get_logger(__name__) + + +class CohereLayerNorm(nn.Module): + def __init__(self, hidden_size=None, eps=1e-5, bias=False): + """The hidden size can be a tuple or an int. The tuple is used for QKNorm to normalize across head_dim""" + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + mean = hidden_states.mean(-1, keepdim=True) + variance = (hidden_states - mean).pow(2).mean(-1, keepdim=True) + hidden_states = (hidden_states - mean) * torch.rsqrt(variance + self.variance_epsilon) + hidden_states = self.weight.to(torch.float32) * hidden_states + return hidden_states.to(input_dtype) + + +class CohereRotaryEmbedding(LlamaRotaryEmbedding): + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) + def forward(self, x, position_ids): + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + position_ids_expanded = position_ids[:, None, :].float() + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.repeat_interleave(freqs, 2, dim=-1) # diff from Llama: we interleave() instead of cat() + cos = emb.cos() * self.attention_scaling + sin = emb.sin() * self.attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +def rotate_half(x): + # Split and rotate. Note that this function is different from e.g. Llama. + x1 = x[..., ::2] + x2 = x[..., 1::2] + rot_x = torch.stack([-x2, x1], dim=-1).flatten(-2) + return rot_x + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. 
Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + dtype = q.dtype + q = q.float() + k = k.float() + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed.to(dtype=dtype), k_embed.to(dtype=dtype) + + +class CohereMLP(LlamaMLP): + def __init__(self, config): + super().__init__(config) + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + + +class CohereAttention(LlamaAttention): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: CohereConfig, layer_idx: Optional[int] = None): + super().__init__(config, layer_idx) + self.use_qk_norm = config.use_qk_norm + if self.use_qk_norm: + # When sharding the model using Tensor Parallelism, need to be careful to use n_local_heads + self.q_norm = CohereLayerNorm( + hidden_size=(config.num_attention_heads, self.head_dim), eps=config.layer_norm_eps + ) + self.k_norm = CohereLayerNorm( + hidden_size=(config.num_key_value_heads, self.head_dim), eps=config.layer_norm_eps + ) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + attention_mask: Optional[torch.Tensor], + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[FlashAttentionKwargs], + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, self.head_dim) + + query_states = self.q_proj(hidden_states).view(hidden_shape) + key_states = self.k_proj(hidden_states).view(hidden_shape) + value_states = self.v_proj(hidden_states).view(hidden_shape) + + if self.use_qk_norm: # main diff from Llama + query_states = self.q_norm(query_states) + key_states = self.k_norm(key_states) + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_values is not None: + # sin and cos are specific to RoPE models; position_ids needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=0.0 if not self.training else self.attention_dropout, + scaling=self.scaling, + **kwargs, + ) + + attn_output = attn_output.reshape(*input_shape, 
-1).contiguous() + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights + + +class CohereDecoderLayer(GradientCheckpointingLayer): + def __init__(self, config: CohereConfig, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + self.self_attn = CohereAttention(config=config, layer_idx=layer_idx) + self.mlp = CohereMLP(config) + self.input_layernorm = CohereLayerNorm(hidden_size=(config.hidden_size), eps=config.layer_norm_eps) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + **kwargs: Unpack[FlashAttentionKwargs], + ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): + attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1, + query_sequence_length, key_sequence_length)` if default attention is used. + past_key_values (`Cache`, *optional*): cached past key and value projection states + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence + position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*): + Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`, + with `head_dim` being the embedding dimension of each attention head. 
+ """ + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + + hidden_states_attention, _ = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + **kwargs, + ) + + hidden_states_mlp = self.mlp(hidden_states) + hidden_states = residual + hidden_states_attention + hidden_states_mlp + return hidden_states + + +class CohereModel(LlamaModel): + def __init__(self, config: CohereConfig): + super().__init__(config) + self.layers = nn.ModuleList( + [CohereDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.rotary_emb = CohereRotaryEmbedding(config=config) + self.norm = CohereLayerNorm(hidden_size=(config.hidden_size), eps=config.layer_norm_eps) + + +class CohereForCausalLM(LlamaForCausalLM): + def __init__(self, config): + super().__init__(config) + self.model = CohereModel(config) + self.logit_scale = config.logit_scale + self.tie_word_embeddings = config.tie_word_embeddings + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **kwargs: Unpack[TransformersKwargs], + ) -> CausalLMOutputWithPast: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Example: + + ```python + >> from transformers import AutoTokenizer, CohereForCausalLM + + >> model = CohereForCausalLM.from_pretrained("CohereForAI/c4ai-command-r-v01") + >> tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-v01") + + >> prompt = "Hey, are you conscious? Can you talk to me?" + >> inputs = tokenizer(prompt, return_tensors="pt") + + >> # Generate + >> generate_ids = model.generate(inputs.input_ids, max_length=30) + >> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." 
+ ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs: BaseModelOutputWithPast = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs.last_hidden_state + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) + logits = logits * self.logit_scale # main diff from Llama + + loss = None + if labels is not None: + loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs) + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +__all__ = [ + "CohereForCausalLM", + "CohereModel", + "CoherePreTrainedModel", # noqa: F822 +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/cohere/tokenization_cohere_fast.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/cohere/tokenization_cohere_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..fd240b97848033ab77e984f85d91aa63e190df0c --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/cohere/tokenization_cohere_fast.py @@ -0,0 +1,512 @@ +# coding=utf-8 +# Copyright 2024 Cohere team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This file is based on the tokenization_llama_fast.py file in transformers + +import pickle +from typing import Literal, Union + +from tokenizers import processors + +from ...tokenization_utils_base import BatchEncoding +from ...tokenization_utils_fast import PreTrainedTokenizerFast +from ...utils import logging + + +logger = logging.get_logger(__name__) +VOCAB_FILES_NAMES = {"tokenizer_file": "tokenizer.json"} + +PRETRAINED_VOCAB_FILES_MAP = { + "tokenizer_file": { + "Cohere/Command-nightly": "https://huggingface.co/Cohere/Command-nightly/blob/main/tokenizer.json", + }, +} + +# fmt: off +DEFAULT_SYSTEM_PROMPT = "You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. You are trained by Cohere." +DEFAULT_RAG_PREAMBLE = """## Task and Context +You help people answer their questions and other requests interactively. You will be asked a very wide array of requests on all kinds of topics. 
You will be equipped with a wide range of search engines or similar tools to help you, which you use to research your answer. You should focus on serving the user's needs as best you can, which will be wide-ranging. + +## Style Guide +Unless the user asks for a different style of answer, you should answer in full sentences, using proper grammar and spelling.""" +# fmt: on + + +class CohereTokenizerFast(PreTrainedTokenizerFast): + """ + Construct a Cohere tokenizer. Based on byte-level Byte-Pair-Encoding. + + This uses notably ByteFallback and NFC normalization. + + ```python + >>> from transformers import AutoTokenizer + + >>> tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-v01") + >>> tokenizer.encode("Hello this is a test") + [5, 28339, 2075, 1801, 1671, 3282] + ``` + + If you want to change the `bos_token` or the `eos_token`, make sure to specify them when initializing the model, or + call `tokenizer.update_post_processor()` to make sure that the post-processing is correctly done (otherwise the + values of the first token and final token of an encoded sequence will not be correct). For more details, checkout + [post-processors] (https://huggingface.co/docs/tokenizers/api/post-processors) documentation. + + You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer, but since + the model was not pretrained this way, it might yield a decrease in performance. + + + + When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`. + + + + This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should + refer to this superclass for more information regarding those methods. + + Args: + vocab_file (`str`, *optional*): + Path to the vocabulary file. + merges_file (`str`, *optional*): + Path to the merges file. + tokenizer_file (`str`, *optional*): + [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that + contains everything needed to load the tokenizer. + clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`): + Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like + extra spaces. + unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `""`): + The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. + eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<|END_OF_TURN_TOKEN|>"`): + The end of sequence token. + add_bos_token (`bool`, *optional*, defaults to `True`): + Whether or not to add an `bos_token` at the start of sequences. + add_eos_token (`bool`, *optional*, defaults to `False`): + Whether or not to add an `eos_token` at the end of sequences. + use_default_system_prompt (`bool`, *optional*, defaults to `False`): + Whether or not the default system prompt for Cohere tokenizer should be used. 
+ add_prefix_space (`bool`, *optional*, defaults to `False`): + Whether or not the tokenizer should automatically add a prefix space + """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + padding_side = "left" + model_input_names = ["input_ids", "attention_mask"] + slow_tokenizer_class = None + # No `max_model_input_sizes` + + def __init__( + self, + vocab_file=None, + merges_file=None, + tokenizer_file=None, + clean_up_tokenization_spaces=False, + unk_token="", + bos_token="", + eos_token="<|END_OF_TURN_TOKEN|>", + add_bos_token=True, + add_eos_token=False, + use_default_system_prompt=False, + add_prefix_space=False, + **kwargs, + ): + super().__init__( + vocab_file=vocab_file, + merges_file=merges_file, + tokenizer_file=tokenizer_file, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + unk_token=unk_token, + bos_token=bos_token, + eos_token=eos_token, + add_bos_token=add_bos_token, + add_eos_token=add_eos_token, + use_default_system_prompt=use_default_system_prompt, + add_prefix_space=add_prefix_space, + **kwargs, + ) + self._add_bos_token = add_bos_token + self._add_eos_token = add_eos_token + self.update_post_processor() + self.use_default_system_prompt = use_default_system_prompt + self.vocab_file = vocab_file + self.grounded_generation_template = kwargs.pop("grounded_generation_template", None) + self.tool_use_template = kwargs.pop("tool_use_template", None) + + # TODO @ArthurZucker this can only work one way for now, to update later-on. Tests should also properly + # check this as they were green before. + pre_tok_state = pickle.dumps(self.backend_tokenizer.pre_tokenizer) + decoder_state = pickle.dumps(self.backend_tokenizer.decoder) + + if add_prefix_space: + pre_tok_state = pre_tok_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true') + decoder_state = decoder_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true') + self.backend_tokenizer.pre_tokenizer = pickle.loads(pre_tok_state) + self.backend_tokenizer.decoder = pickle.loads(decoder_state) + + self.add_prefix_space = add_prefix_space + + def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding: + is_split_into_words = kwargs.get("is_split_into_words", False) + if not (self.add_prefix_space or not is_split_into_words): + raise Exception( + f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True to use it with" + " pretokenized inputs." + ) + + return super()._batch_encode_plus(*args, **kwargs) + + def _encode_plus(self, *args, **kwargs) -> BatchEncoding: + is_split_into_words = kwargs.get("is_split_into_words", False) + + if not (self.add_prefix_space or not is_split_into_words): + raise Exception( + f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True to use it with" + " pretokenized inputs." + ) + + return super()._encode_plus(*args, **kwargs) + + def update_post_processor(self): + """ + Updates the underlying post processor with the current `bos_token` and `eos_token`. 
+ """ + bos = self.bos_token + bos_token_id = self.bos_token_id + if bos is None and self.add_bos_token: + raise ValueError("add_bos_token = True but bos_token = None") + + eos = self.eos_token + eos_token_id = self.eos_token_id + if eos is None and self.add_eos_token: + raise ValueError("add_eos_token = True but eos_token = None") + + single = f"{(bos + ':0 ') if self.add_bos_token else ''}$A:0{(' ' + eos + ':0') if self.add_eos_token else ''}" + pair = f"{single}{(' ' + bos + ':1') if self.add_bos_token else ''} $B:1{(' ' + eos + ':1') if self.add_eos_token else ''}" + + special_tokens = [] + if self.add_bos_token: + special_tokens.append((bos, bos_token_id)) + if self.add_eos_token: + special_tokens.append((eos, eos_token_id)) + self._tokenizer.post_processor = processors.TemplateProcessing( + single=single, pair=pair, special_tokens=special_tokens + ) + + @property + def add_eos_token(self): + return self._add_eos_token + + @property + def add_bos_token(self): + return self._add_bos_token + + @add_eos_token.setter + def add_eos_token(self, value): + self._add_eos_token = value + self.update_post_processor() + + @add_bos_token.setter + def add_bos_token(self, value): + self._add_bos_token = value + self.update_post_processor() + + def apply_tool_use_template( + self, + conversation: list[dict[str, str]], + tools: list[dict], + **kwargs, + ) -> Union[str, list[int]]: + """Create a Command-R tool-use prompt. + + Once rendered, the prompt instructs the model to generate a list of actions to perform on a set of user supplied tools + to help carry out the user's requests. + + Conceptually, this works in the same way as `apply_chat_format`, but takes an additional `tools` parameter. + + Converts a chat in the form of a list of dictionaries with `"role"` and `"content"` keys and a list of available + tools for the model to use into a prompt string, or a list of token ids. + This method will use the tokenizer's `default_tool_use_template` template specified at the class level. + You can override the default template using the `tool_use_template` kwarg but the quality of your results may decrease. + + Args: + conversation (list[dict[str, str]]): A list of dicts + with "role" and "content" keys, representing the chat history so far. + tools (list[Dict]): a list of tools to render into the prompt for the model to choose from. + See an example at the bottom of the docstring. + The format should be: + * name (str): The name of the tool to be called. Valid names contain only the characters a-z, + A-Z, 0-9, _ and must not begin with a digit. + * description (str): The description of what the tool does, the model uses the description to + choose when and how to call the function. + * parameter_definitions (list[Dict]): The input parameters of the tool. Accepts a dictionary + where the key is the name of the parameter and the value is the parameter spec. + Valid parameter names contain only the characters a-z, A-Z, 0-9, _ and must not begin with a digit. + Parameter specs are as follows: + * description (str): The description of the parameter. + * type (str): the type of the parameter - most effective for python builtin data types, such as 'str', 'bool' + * required: boolean: Denotes whether the parameter is always present (required) or not. Defaults to not required. + add_generation_prompt (bool, *optional*): Whether to end the prompt with the token(s) that indicate + the start of an assistant message. This is useful when you want to generate a response from the model. 
+ Note that this argument will be passed to the chat template, and so it must be supported in the + template for this argument to have any effect. + tokenize (`bool`, defaults to `True`): + Whether to tokenize the output. If `False`, the output will be a string. + padding (`bool`, defaults to `False`): + Whether to pad sequences to the maximum length. Has no effect if tokenize is `False`. + truncation (`bool`, defaults to `False`): + Whether to truncate sequences at the maximum length. Has no effect if tokenize is `False`. + max_length (`int`, *optional*): + Maximum length (in tokens) to use for padding or truncation. Has no effect if tokenize is `False`. If + not specified, the tokenizer's `max_length` attribute will be used as a default. + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Has no effect if tokenize is `False`. Acceptable + values are: + - `'tf'`: Return TensorFlow `tf.Tensor` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + return_dict (`bool`, *optional*, defaults to `False`): + Whether to return a dictionary with named outputs. Has no effect if tokenize is `False`. + **tokenizer_kwargs: Additional kwargs to pass to the tokenizer. + + Returns: + `str`: A rendered prompt string. + or if tokenize=True: + `list[int]`: A list of token ids representing the tokenized chat so far, including control tokens. This + output is ready to pass to the model, either directly or via methods like `generate()`. + + Examples: + + ```python + >> tokenizer = CohereTokenizerFast.from_pretrained("CohereForAI/c4ai-command-r-v01") + >> tools = [ + { + "name": "internet_search", + "description": "Returns a list of relevant document snippets for a textual query retrieved from the internet", + "parameter_definitions": { + "query": { + "description": "Query to search the internet with", + "type": "str", + "required": True + } + } + }, + { + "name': "directly_answer", + "description": "Calls a standard (un-augmented) AI chatbot to generate a response given the conversation history", + "parameter_definitions": {} + } + ] + >> conversation = [ + {"role": "user", "content": "Whats the biggest penguin in the world?"} + ] + >> # render the prompt, ready for user to inspect, or for input into the model: + >> prompt = tokenizer.apply_tool_use_template(conversation, tools=tools, tokenize=False, add_generation_prompt=True) + >> print(prompt) + <|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|># Safety Preamble + The instructions in this section override those in the task description and style guide sections. Don't answer questions that are harmful or immoral. + + # System Preamble + ## Basic Rules + You are a powerful conversational AI trained by Cohere to help people. You are augmented by a number of tools, and your job is to use and consume the output of these tools to best help the user. You will see a conversation history between yourself and a user, ending with an utterance from the user. You will then see a specific instruction instructing you what kind of response to generate. When you answer the user's requests, you cite your sources in your answers, according to those instructions. + + # User Preamble + ## Task and Context + You help people answer their questions and other requests interactively. You will be asked a very wide array of requests on all kinds of topics. 
You will be equipped with a wide range of search engines or similar tools to help you, which you use to research your answer. You should focus on serving the user's needs as best you can, which will be wide-ranging. + + ## Style Guide + Unless the user asks for a different style of answer, you should answer in full sentences, using proper grammar and spelling. + + ## Available Tools + Here is a list of tools that you have available to you: + + \\`\\`\\`python + def internet_search(query: str) -> list[Dict]: + \"\"\"Returns a list of relevant document snippets for a textual query retrieved from the internet + + Args: + query (str): Query to search the internet with + \"\"\" + pass + \\`\\`\\` + + \\`\\`\\`python + def directly_answer() -> list[Dict]: + \"\"\"Calls a standard (un-augmented) AI chatbot to generate a response given the conversation history + \"\"\" + pass + \\`\\`\\`<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Whats the biggest penguin in the world?<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>Write 'Action:' followed by a json-formatted list of actions that you want to perform in order to produce a good response to the user's last input. You can use any of the supplied tools any number of times, but you should aim to execute the minimum number of necessary actions for the input. You should use the `directly-answer` tool if calling the other tools is unnecessary. The list of actions you want to call should be formatted as a list of json objects, for example: + \\`\\`\\`json + [ + { + "tool_name": title of the tool in the specification, + "parameters": a dict of parameters to input into the tool as they are defined in the specs, or {} if it takes no parameters + } + ]\\`\\`\\`<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|> + ``` + >> inputs = tokenizer.encode(prompt, add_special_tokens=False, return_tensors='pt') + >> outputs = model.generate(inputs, max_new_tokens=128) + >> print(tokenizer.decode(outputs[0])) + Action: ```json + [ + { + "tool_name": "internet_search", + "parameters": { + "query": "biggest penguin in the world" + } + } + ] + ``` + """ + return self.apply_chat_template( + conversation, + chat_template="tool_use", + tools=tools, + **kwargs, + ) + + def apply_grounded_generation_template( + self, + conversation: list[dict[str, str]], + documents: list[dict], + citation_mode: Literal["fast", "accurate"] = "accurate", + **kwargs, + ) -> Union[str, list[int]]: + """Create a Command-R grounded generation (aka RAG) prompt. + + Once rendered, the prompt instructs the model to generate a response with citations in, based on supplied documents. + + Conceptually, this works in the same way as `apply_chat_format`, but takes additional `documents` + and parameter `citation_mode` parameters. + + Converts a list of dictionaries with `"role"` and `"content"` keys and a list of + documents for the model to ground its response on into a prompt string, or a list of token ids. + This method will use the tokenizer's `grounded_generation_template` template specified at the class level. + You can override the default template using the `grounded_generation_template` kwarg but the quality of your results may decrease. + + Args: + conversation (list[dict[str, str]]): A list of dicts + with "role" and "content" keys, representing the chat history so far. + documents (list[dict[str, str]): A list of dicts, representing documents or tool outputs to ground your + generation on. A document is a semistructured dict, with a string to string mapping. 
Common fields are + `url`, `title`, `snippet` etc but should be descriptive of the key. They will get rendered into the prompt. + citation_mode: either "accurate" (prompt the model to generate an answer first, then rewrite it with citation + spans in) or "fast", where the prompt instructs the model to generate an answer with citations in directly. + The former has higher quality citations, the latter requires fewer tokens to be generated. + add_generation_prompt (bool, *optional*): Whether to end the prompt with the token(s) that indicate + the start of an assistant message. This is useful when you want to generate a response from the model. + Note that this argument will be passed to the chat template, and so it must be supported in the + template for this argument to have any effect. + tokenize (`bool`, defaults to `True`): + Whether to tokenize the output. If `False`, the output will be a string. + padding (`bool`, defaults to `False`): + Whether to pad sequences to the maximum length. Has no effect if tokenize is `False`. + truncation (`bool`, defaults to `False`): + Whether to truncate sequences at the maximum length. Has no effect if tokenize is `False`. + max_length (`int`, *optional*): + Maximum length (in tokens) to use for padding or truncation. Has no effect if tokenize is `False`. If + not specified, the tokenizer's `max_length` attribute will be used as a default. + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Has no effect if tokenize is `False`. Acceptable + values are: + - `'tf'`: Return TensorFlow `tf.Tensor` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + return_dict (`bool`, *optional*, defaults to `False`): + Whether to return a dictionary with named outputs. Has no effect if tokenize is `False`. + **tokenizer_kwargs: Additional kwargs to pass to the tokenizer. + + Returns: + `str`: A rendered prompt string. + or if tokenize=True: + `list[int]`: A list of token ids representing the tokenized chat so far, including control tokens. This + output is ready to pass to the model, either directly or via methods like `generate()`. + + Examples: + + ```python + >> tokenizer = CohereTokenizerFast.from_pretrained('CohereForAI/c4ai-command-r-v01') + + >> # define documents: + >> documents = [ + { "title": "Tall penguins", "text": "Emperor penguins are the tallest." }, + { "title": "Penguin habitats", "text": "Emperor penguins only live in Antarctica."} + ] + >> # define a conversation: + >> conversation = [ + {"role": "user", "content": "Whats the biggest penguin in the world?"} + ] + >> # render the prompt, ready for user to inspect, or for input into the model: + >> grounded_generation_prompt = tokenizer.apply_grounded_generation_template(conversation, documents=documents, tokenize=False, add_generation_prompt=True) + >> print(grounded_generation_prompt) + <|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|># Safety Preamble + The instructions in this section override those in the task description and style guide sections. Don't answer questions that are harmful or immoral. + + ## Basic Rules + You are a powerful conversational AI trained by Cohere to help people. You are augmented by a number of tools, and your job is to use and consume the output of these tools to best help the user. You will see a conversation history between yourself and a user, ending with an utterance from the user. 
You will then see a specific instruction instructing you what kind of response to generate. When you answer the user's requests, you cite your sources in your answers, according to those instructions. + + # User Preamble + ## Task and Context + You help people answer their questions and other requests interactively. You will be asked a very wide array of requests on all kinds of topics. You will be equipped with a wide range of search engines or similar tools to help you, which you use to research your answer. You should focus on serving the user's needs as best you can, which will be wide-ranging. + + ## Style Guide + Unless the user asks for a different style of answer, you should answer in full sentences, using proper grammar and spelling.<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Whats the biggest penguin in the world?<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|> + Document: 0 + title: Tall penguins + text: Emperor penguins are the tallest. + + Document: 1 + title: Penguin habitats + text: Emperor penguins only live in Antarctica. + <|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>Carefully perform the following instructions, in order, starting each with a new line. + Firstly, Decide which of the retrieved documents are relevant to the user's last input by writing 'Relevant Documents:' followed by comma-separated list of document numbers. If none are relevant, you should instead write 'None'. + Secondly, Decide which of the retrieved documents contain facts that should be cited in a good answer to the user's last input by writing 'Cited Documents:' followed a comma-separated list of document numbers. If you dont want to cite any of them, you should instead write 'None'. + Thirdly, Write 'Answer:' followed by a response to the user's last input in high quality natural english. Use the retrieved documents to help you. Do not insert any citations or grounding markup. + Finally, Write 'Grounded answer:' followed by a response to the user's last input in high quality natural english. Use the symbols and to indicate when a fact comes from a document in the search result, e.g my fact for a fact from document 0.<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>''' + ``` + >> inputs = tokenizer.encode(prompt, add_special_tokens=False, return_tensors='pt') + >> outputs = model.generate(inputs, max_new_tokens=128) + >> print(tokenizer.decode(outputs[0])) + Relevant Documents: 0,1 + Cited Documents: 0,1 + Answer: The Emperor Penguin is the tallest or biggest penguin in the world. It is a bird that lives only in Antarctica and grows to a height of around 122 centimetres. + Grounded answer: The Emperor Penguin is the tallest or biggest penguin in the world. It is a bird that lives only in Antarctica and grows to a height of around 122 centimetres. 
+ """ + return self.apply_chat_template( + conversation, + chat_template="rag", + documents=documents, + citation_mode=citation_mode, + **kwargs, + ) + + # TODO ArthurZ let's rely on the template processor instead, refactor all fast tokenizers + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + bos_token_id = [self.bos_token_id] if self.add_bos_token else [] + eos_token_id = [self.eos_token_id] if self.add_eos_token else [] + + output = bos_token_id + token_ids_0 + eos_token_id + + if token_ids_1 is not None: + output = output + bos_token_id + token_ids_1 + eos_token_id + + return output + + +__all__ = ["CohereTokenizerFast"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/cohere2_vision/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/cohere2_vision/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9b20eb3c1e0a8b42669761c9c45ca292b490df27 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/cohere2_vision/__init__.py @@ -0,0 +1,29 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_cohere2_vision import * + from .image_processing_cohere2_vision_fast import * + from .modeling_cohere2_vision import * + from .processing_cohere2_vision import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/cohere2_vision/configuration_cohere2_vision.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/cohere2_vision/configuration_cohere2_vision.py new file mode 100644 index 0000000000000000000000000000000000000000..acc40fcf85711961a12f9536e6f4c0c97e94f59e --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/cohere2_vision/configuration_cohere2_vision.py @@ -0,0 +1,82 @@ +# Copyright 2025 the Cohere Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from ...configuration_utils import PretrainedConfig +from ..auto import CONFIG_MAPPING, AutoConfig + + +class Cohere2VisionConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Cohere2VisionForConditionalGeneration`]. It is used to instantiate an + Cohere2 Vision model according to the specified arguments, defining the model architecture. + + [CohereLabs/command-a-vision-07-2025](https://huggingface.co/CohereLabs/command-a-vision-07-2025) + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `SiglipVisionConfig`): + The config object or dictionary of the vision backbone. + text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `Cohere2Config`): + The config object or dictionary of the text backbone. + downsample_factor (`int`, *optional*, defaults to 2): + The factor by which to downsample the input image. + image_token_id (`int`, *optional*, defaults to 255036): + The token ID to use as placeholder for the image input. + alignment_intermediate_size (`int`, *optional*, defaults to 36864): + The size of the intermediate layer for alignment. + """ + + model_type = "cohere2_vision" + sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} + + def __init__( + self, + vision_config=None, + text_config=None, + downsample_factor=2, + image_token_id=255036, + alignment_intermediate_size=36864, + **kwargs, + ): + super().__init__(**kwargs) + self.downsample_factor = downsample_factor + self.image_token_id = image_token_id + self.alignment_intermediate_size = alignment_intermediate_size + + if isinstance(vision_config, dict): + vision_config["model_type"] = vision_config.get("model_type", "siglip_vision_model") + vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) + elif vision_config is None: + vision_config = CONFIG_MAPPING["siglip_vision_model"]( + hidden_size=1152, + intermediate_size=3072, + image_size=512, + num_hidden_layers=27, + num_attention_heads=12, + ) + + self.vision_config = vision_config + + if isinstance(text_config, dict): + text_config["model_type"] = text_config.get("model_type", "cohere2") + text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) + elif text_config is None: + text_config = CONFIG_MAPPING["cohere2"](tie_word_embeddings=True) + + self.text_config = text_config + + +__all__ = ["Cohere2VisionConfig"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..322e98dbd0f592fd152a2f87c3c5b9849d4dee29 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py @@ -0,0 +1,306 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/cohere2_vision/modular_cohere2_vision.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_cohere2_vision.py file directly. One of our CI enforces this. 
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2025 the Cohere Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import lru_cache +from typing import Optional, Union + +import numpy as np +import torch +from torchvision.transforms.v2 import functional as F + +from ...image_processing_utils import BatchFeature +from ...image_processing_utils_fast import ( + BaseImageProcessorFast, + DefaultFastImageProcessorKwargs, + group_images_by_shape, + reorder_images, +) +from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, ImageInput, PILImageResampling, SizeDict +from ...processing_utils import Unpack +from ...utils import TensorType, auto_docstring + + +class Cohere2VisionFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): + """ + crop_to_patches (`bool`, *optional*, defaults to `False`): + Whether to crop the image to patches. Can be overridden by the `crop_to_patches` parameter in the + `preprocess` method. + min_patches (`int`, *optional*, defaults to 1): + The minimum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is + set to `True`. Can be overridden by the `min_patches` parameter in the `preprocess` method. + max_patches (`int`, *optional*, defaults to 12): + The maximum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is + set to `True`. Can be overridden by the `max_patches` parameter in the `preprocess` method. + """ + + crop_to_patches: Optional[bool] + min_patches: Optional[int] + max_patches: Optional[int] + + +@lru_cache(maxsize=10) +def get_all_supported_aspect_ratios(max_image_tiles: int) -> list[tuple[int, int]]: + """ + Computes all allowed aspect ratios for a given maximum number of input tiles. + + This function calculates all possible arrangements of tiles that can be formed + within the constraint of the maximum number of tiles. Each arrangement is + represented by its aspect ratio (width/height) and the corresponding tile configuration. + + Args: + max_image_tiles (`int`): + The maximum number of tiles allowed. + + Returns: + `list[tuple[int, int]]`: A list of tuples, each tuple representing a valid (width, height) + configuration in terms of number of tiles. 
+ + Example: + >>> get_all_supported_aspect_ratios(4) + [(1, 1), (1, 2), (1, 3), (1, 4), (2, 1), (2, 2), (3, 1), (4, 1)] + + """ + aspect_ratios = [] + for width in range(1, max_image_tiles + 1): + for height in range(1, max_image_tiles + 1): + if width * height <= max_image_tiles: + aspect_ratios.append((width, height)) + return aspect_ratios + + +def get_optimal_tiled_canvas( + original_image_size: tuple[int, int], + target_tile_size: tuple[int, int], + min_image_tiles: int, + max_image_tiles: int, +) -> tuple[int, int]: + possible_resolutions = get_all_supported_aspect_ratios(max_image_tiles) + possible_resolutions = sorted(possible_resolutions, key=lambda x: x[0] * x[1]) + image_height, image_width = original_image_size + patch_size_height, patch_size_width = target_tile_size # (height == width) + + candidate_resolutions = np.array(possible_resolutions) * patch_size_height + original_size = np.stack([image_height, image_width]) + required_scales = candidate_resolutions / original_size + required_scale = np.min(required_scales, axis=-1, keepdims=True) # [n_resolutions, 1] + if np.all(required_scale < 1): + # We are forced to downscale, so try to minimize the amount of downscaling + best_grid = possible_resolutions[np.argmax(required_scale)] + else: + # Pick the resolution that required the least upscaling so that it most closely fits the image + required_scale = np.where(required_scale < 1.0, 10e9, required_scale) + best_grid = possible_resolutions[np.argmin(required_scale)] + return best_grid + + +@auto_docstring +class Cohere2VisionImageProcessorFast(BaseImageProcessorFast): + resample = PILImageResampling.BICUBIC + image_mean = OPENAI_CLIP_MEAN + image_std = OPENAI_CLIP_STD + size = {"height": 512, "width": 512} + do_resize = True + do_rescale = True + do_normalize = True + do_convert_rgb = True + crop_to_patches = True + min_patches = 1 + max_patches = 12 + valid_kwargs = Cohere2VisionFastImageProcessorKwargs + patch_size = 16 + + def __init__(self, **kwargs: Unpack[Cohere2VisionFastImageProcessorKwargs]): + super().__init__(**kwargs) + + @auto_docstring + def preprocess(self, images: ImageInput, **kwargs: Unpack[Cohere2VisionFastImageProcessorKwargs]) -> BatchFeature: + return super().preprocess(images, **kwargs) + + def crop_image_to_patches( + self, + images: "torch.Tensor", + min_patches: int, + max_patches: int, + use_thumbnail: bool = True, + patch_size: Optional[Union[tuple, int, dict]] = None, + interpolation: Optional["F.InterpolationMode"] = None, + ): + """ + Crop the images to patches and return a list of cropped images. + The number of patches and their grid arrangement are determined by the original image size, + the target patch size and the minimum and maximum number of patches. + The aspect ratio of the patches grid is chosen to be the closest to the original image aspect ratio. + + Args: + images (`torch.Tensor`): + The images to be cropped. + min_patches (`int`): + The minimum number of patches to be extracted from the image. + max_patches (`int`): + The maximum number of patches to be extracted from the image. + use_thumbnail (`bool`, *optional*, defaults to `True`): + Whether to add a thumbnail image to the list of cropped patches. + patch_size (`int`, `tuple[int, int]`, `dict`, *optional*): + The size of the output patches. + The format of the image data. If `None`, the format is inferred from the input image. + + Returns: + list[`PIL.Image.Image`] or list[np.ndarray]: The list of cropped images. 
+ """ + patch_size_height, patch_size_width = patch_size.height, patch_size.width + original_height, original_width = images.shape[-2:] + # find the closest aspect ratio to the target + num_columns, num_rows = get_optimal_tiled_canvas( + (original_height, original_width), (patch_size_height, patch_size_width), min_patches, max_patches + ) + + # calculate the target width and height + target_width = patch_size_width * num_columns + target_height = patch_size_height * num_rows + num_blocks = num_columns * num_rows + + # resize the image so that each patch is of patch_size + resized_image = self.resize( + images, SizeDict(height=target_height, width=target_width), interpolation=interpolation + ) + # split the image into patches + processed_images = [] + for i in range(num_blocks): + column = i % num_columns + row = i // num_columns + box = ( + column * patch_size_width, + row * patch_size_height, + (column + 1) * patch_size_width, + (row + 1) * patch_size_height, + ) + # split the image + patch_image = resized_image[..., box[1] : box[3], box[0] : box[2]] + processed_images.append(patch_image) + + if use_thumbnail and len(processed_images) != 1: + thumbnail_img = self.resize(images, patch_size, interpolation=interpolation) + processed_images.append(thumbnail_img) + + processed_images = torch.stack(processed_images, dim=0).transpose(0, 1).contiguous() + + return processed_images + + def _preprocess( + self, + images: list["torch.Tensor"], + do_resize: bool, + size: SizeDict, + crop_to_patches: bool, + min_patches: int, + max_patches: int, + interpolation: Optional["F.InterpolationMode"], + do_center_crop: bool, + crop_size: SizeDict, + do_rescale: bool, + rescale_factor: float, + do_normalize: bool, + image_mean: Optional[Union[float, list[float]]], + image_std: Optional[Union[float, list[float]]], + disable_grouping: Optional[bool], + return_tensors: Optional[Union[str, TensorType]], + **kwargs, + ) -> BatchFeature: + if crop_to_patches: + grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping) + processed_images_grouped = {} + num_patches = {} + for shape, stacked_images in grouped_images.items(): + stacked_images = self.crop_image_to_patches( + stacked_images, + min_patches, + max_patches, + patch_size=size, + interpolation=interpolation, + ) + processed_images_grouped[shape] = stacked_images + num_patches[shape] = [stacked_images.shape[1]] * stacked_images.shape[0] + images = reorder_images(processed_images_grouped, grouped_images_index) + images = [image for images_list in images for image in images_list] + num_patches = reorder_images(num_patches, grouped_images_index) + else: + num_patches = [1] * len(images) + + # Group images by size for batched resizing + grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping) + resized_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + if do_resize: + stacked_images = self.resize(image=stacked_images, size=size, interpolation=interpolation) + resized_images_grouped[shape] = stacked_images + resized_images = reorder_images(resized_images_grouped, grouped_images_index) + + # Group images by size for further processing + # Needed in case do_resize is False, or resize returns images with different sizes + grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping) + processed_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + if do_center_crop: + 
stacked_images = self.center_crop(stacked_images, crop_size) + # Fused rescale and normalize + stacked_images = self.rescale_and_normalize( + stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std + ) + processed_images_grouped[shape] = stacked_images + + processed_images = reorder_images(processed_images_grouped, grouped_images_index) + processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images + + return BatchFeature( + data={"pixel_values": processed_images, "num_patches": num_patches}, tensor_type=return_tensors + ) + + def get_number_of_image_patches(self, height: int, width: int, images_kwargs=None): + """ + A utility that returns number patches for a given image size. + + Args: + height (`int`): + Height of the input image. + width (`int`): + Width of the input image. + images_kwargs (`dict`, *optional*) + Any kwargs to override defaults of the image processor. + Returns: + `int`: Number of patches per image. + """ + min_patches = images_kwargs.get("min_patches", self.min_patches) + max_patches = images_kwargs.get("max_patches", self.max_patches) + patch_size = images_kwargs.get("patch_size", self.size) + crop_to_patches = images_kwargs.get("crop_to_patches", self.crop_to_patches) + + num_patches = 1 + if crop_to_patches and max_patches > 1: + num_columns, num_rows = get_optimal_tiled_canvas( + (height, width), (patch_size["height"], patch_size["width"]), min_patches, max_patches + ) + if num_columns * num_rows > 1: + num_patches += num_columns * num_rows + + return num_patches + + +__all__ = ["Cohere2VisionImageProcessorFast"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/cohere2_vision/modeling_cohere2_vision.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/cohere2_vision/modeling_cohere2_vision.py new file mode 100644 index 0000000000000000000000000000000000000000..1dc993967b5c767fe93f63fccb954b42a52a1c28 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/cohere2_vision/modeling_cohere2_vision.py @@ -0,0 +1,425 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/cohere2_vision/modular_cohere2_vision.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_cohere2_vision.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2025 the Cohere Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
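Before the modeling code below, a note on how `Cohere2VisionImageProcessorFast` above picks its tiling: it enumerates every grid whose tile count fits within `max_patches` and keeps the grid that requires the least resizing of the input image, then appends a thumbnail patch whenever more than one tile is produced. The snippet below is a simplified restatement of that heuristic, assuming square 512-pixel tiles; it is not the library function itself.

```python
import numpy as np

def best_tile_grid(image_hw, tile=512, max_tiles=12):
    """Pick the (rows, cols) tile grid that needs the least resizing of the image."""
    # Every grid whose total tile count fits the budget, smallest grids first.
    grids = sorted(
        [(r, c) for r in range(1, max_tiles + 1) for c in range(1, max_tiles + 1) if r * c <= max_tiles],
        key=lambda g: g[0] * g[1],
    )
    canvas_hw = np.array(grids) * tile                     # canvas size in pixels for each grid
    scale = (canvas_hw / np.array(image_hw)).min(axis=-1)  # per-grid scale applied to the image
    if np.all(scale < 1):
        return grids[int(np.argmax(scale))]                # forced to downscale: shrink as little as possible
    scale = np.where(scale < 1.0, np.inf, scale)
    return grids[int(np.argmin(scale))]                    # otherwise upscale as little as possible

rows, cols = best_tile_grid((1024, 768))
print(rows, cols)  # 2 2 -> four 512x512 tiles plus one thumbnail patch
```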
+from dataclasses import dataclass +from typing import Optional, Union + +import torch +from torch import nn + +from ...cache_utils import Cache +from ...generation import GenerationMixin +from ...modeling_flash_attention_utils import FlashAttentionKwargs +from ...modeling_outputs import BaseModelOutputWithPast, ModelOutput +from ...modeling_utils import PreTrainedModel +from ...processing_utils import Unpack +from ...utils import TransformersKwargs, auto_docstring +from ...utils.generic import check_model_inputs +from ..auto import AutoModel +from .configuration_cohere2_vision import Cohere2VisionConfig + + +class Cohere2VisionMultiModalProjector(nn.Module): + def __init__(self, config: Cohere2VisionConfig): + super().__init__() + self.config = config + self.downsample_factor = config.downsample_factor + self.intermediate_size = config.alignment_intermediate_size + self.linear_1 = nn.Linear( + config.vision_config.hidden_size * (config.downsample_factor**2), self.intermediate_size, bias=True + ) + self.act = nn.SiLU() + self.linear_2 = nn.Linear(self.intermediate_size // 2, config.text_config.hidden_size, bias=True) + + def pixel_shuffle(self, image_features): # B, S, D + batch_size, seq_length, feature_dim = image_features.shape + height = width = int(seq_length**0.5) + image_features = image_features.reshape(image_features.shape[0], width, height, -1) + channels = image_features.shape[-1] + image_features = image_features.reshape( + batch_size, width, int(height / self.downsample_factor), int(channels * self.downsample_factor) + ) + image_features = image_features.permute(0, 2, 1, 3) + image_features = image_features.reshape( + batch_size, int(height / self.downsample_factor), int(width / self.downsample_factor), -1 + ) + image_features = image_features.permute(0, 2, 1, 3) + return image_features + + def forward(self, image_features): + image_features = self.pixel_shuffle(image_features) + hidden_states = self.linear_1(image_features) + + # Split along last dimension and apply SwiGLU + x, gate = hidden_states.chunk(2, dim=-1) + hidden_states = self.act(gate) * x + + hidden_states = self.linear_2(hidden_states) + return hidden_states + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for Cohere2Vision outputs, with hidden states and attentions. + """ +) +class Cohere2VisionModelOutputWithPast(BaseModelOutputWithPast): + r""" + past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache). + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + image_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`. + image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state. + """ + + image_hidden_states: Optional[torch.FloatTensor] = None + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for Cohere2Vision causal language model (or autoregressive) outputs. + """ +) +class Cohere2VisionCausalLMOutputWithPast(ModelOutput): + r""" + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). 
+ logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache). + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + image_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`. + image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state. + """ + + loss: Optional[torch.FloatTensor] = None + logits: Optional[torch.FloatTensor] = None + past_key_values: Optional[Cache] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + image_hidden_states: Optional[torch.FloatTensor] = None + + +@auto_docstring +class Cohere2VisionPreTrainedModel(PreTrainedModel): + config: Cohere2VisionConfig + base_model_prefix = "" + supports_gradient_checkpointing = True + _skip_keys_device_placement = "past_key_values" + + _supports_flash_attn = True + _supports_sdpa = True + _can_compile_fullgraph = False + _supports_flex_attn = True + _supports_attention_backend = True + _can_record_outputs = { + "hidden_states": "DecoderLayer", + "attentions": "Attention", + } + + +@auto_docstring( + custom_intro=""" + The Cohere2Vision model which consists of a vision backbone and a language model, without a language modeling head. + """ +) +class Cohere2VisionModel(Cohere2VisionPreTrainedModel): + _checkpoint_conversion_mapping = {} + + def __init__(self, config: Cohere2VisionConfig): + super().__init__(config) + self.vision_tower = AutoModel.from_config(config.vision_config) + + self.multi_modal_projector = Cohere2VisionMultiModalProjector(config) + self.language_model = AutoModel.from_config(config.text_config) + self.post_init() + + def get_input_embeddings(self): + return self.language_model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.language_model.set_input_embeddings(value) + + def set_decoder(self, decoder): + self.language_model = decoder + + def get_decoder(self): + return self.language_model + + def get_image_features(self, pixel_values: torch.FloatTensor): + """ + Obtains image last hidden states from the vision tower and apply multimodal projection. + + Args: + pixel_values (`torch.FloatTensor]` of shape `(batch_size, num_patches, channels, height, width)`) + The tensors corresponding to the input images. + Returns: + image_features (List[`torch.Tensor`]): List of image feature tensor, each contains all the visual feature of all patches + and are of shape `(num_patches, image_length, embed_dim)`). 
+ """ + + image_features = self.vision_tower(pixel_values, output_hidden_states=True) + selected_image_feature = image_features.last_hidden_state + image_features = self.multi_modal_projector(selected_image_feature) + return image_features + + def get_placeholder_mask( + self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor + ): + """ + Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is + equal to the length of multimodal features. If the lengths are different, an error is raised. + """ + if input_ids is None: + special_image_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + special_image_mask = special_image_mask.all(-1) + else: + special_image_mask = input_ids == self.config.image_token_id + + n_image_tokens = special_image_mask.sum() + special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + n_image_features = image_features.shape[0] * image_features.shape[1] + if inputs_embeds[special_image_mask].numel() != image_features.numel(): + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" + ) + return special_image_mask + + @check_model_inputs + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[FlashAttentionKwargs], + ) -> Union[tuple, Cohere2VisionModelOutputWithPast]: + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings()(input_ids) + + if pixel_values is not None: + image_features = self.get_image_features(pixel_values) + image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) + special_image_mask = self.get_placeholder_mask( + input_ids, inputs_embeds=inputs_embeds, image_features=image_features + ) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) + + outputs = self.language_model( + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + + return Cohere2VisionModelOutputWithPast( + last_hidden_state=outputs.last_hidden_state, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + image_hidden_states=image_features if pixel_values is not None else None, + ) + + +@auto_docstring( + custom_intro=""" + The COHERE2_VISION model which consists of a vision backbone and a language model. 
+ """ +) +class Cohere2VisionForConditionalGeneration(Cohere2VisionPreTrainedModel, GenerationMixin): + _checkpoint_conversion_mapping = {} + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config: Cohere2VisionConfig): + super().__init__(config) + self.model = Cohere2VisionModel(config) + self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False) + self.post_init() + + def get_input_embeddings(self): + return self.model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.model.set_input_embeddings(value) + + def get_output_embeddings(self) -> nn.Module: + return self.lm_head + + def set_decoder(self, decoder): + self.model.set_decoder(decoder) + + def get_decoder(self): + return self.model.get_decoder() + + def get_image_features(self, pixel_values: torch.FloatTensor): + return self.model.get_image_features(pixel_values=pixel_values) + + # Make modules available through conditional class for BC + @property + def language_model(self): + return self.model.language_model + + @property + def vision_tower(self): + return self.model.vision_tower + + @property + def multi_modal_projector(self): + return self.model.multi_modal_projector + + @check_model_inputs + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + image_sizes: Optional[torch.Tensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> Union[tuple, Cohere2VisionCausalLMOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Example: + + ```python + >>> from transformers import AutoProcessor, Cohere2VisionForConditionalGeneration + >>> import torch + + >>> processor = AutoProcessor.from_pretrained("CohereLabs/command-a-vision-07-2025", use_fast=True) + >>> model = Cohere2VisionForConditionalGeneration.from_pretrained("CohereLabs/command-a-vision-07-2025", device_map="auto") + + >>> messages = [ + ... { + ... "role": "user", + ... "content": [ + ... { + ... "type": "image", + ... "url": "https://images.pexels.com/photos/1108099/pexels-photo-1108099.jpeg", + ... }, + ... {"type": "text", "text": "what is in this image?"}, + ... ], + ... }, + ... ] + + >>> inputs = processor.apply_chat_template( + ... messages, padding=True, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt", + ... 
).to(model.device) + + >>> gen_tokens = model.generate(**inputs, max_new_tokens=300, do_sample=True, temperature=0.3) + >>> processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True) + ```""" + outputs = self.model( + input_ids=input_ids, + pixel_values=pixel_values, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + cache_position=cache_position, + image_sizes=image_sizes, + **kwargs, + ) + + hidden_states = outputs[0] + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) + + loss = None + if labels is not None: + loss = self.loss_function( + logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs + ) + + return Cohere2VisionCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + image_hidden_states=outputs.image_hidden_states, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + inputs_embeds=None, + pixel_values=None, + attention_mask=None, + cache_position=None, + logits_to_keep=None, + **kwargs, + ): + # Overwritten -- in specific circumstances we don't want to forward image inputs to the model + + model_inputs = super().prepare_inputs_for_generation( + input_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + logits_to_keep=logits_to_keep, + **kwargs, + ) + + if cache_position[0] == 0: + # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore + # Otherwise we need pixel values to be passed to model + model_inputs["pixel_values"] = pixel_values + + return model_inputs + + +__all__ = ["Cohere2VisionForConditionalGeneration", "Cohere2VisionPreTrainedModel", "Cohere2VisionModel"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/cohere2_vision/modular_cohere2_vision.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/cohere2_vision/modular_cohere2_vision.py new file mode 100644 index 0000000000000000000000000000000000000000..7ef20305b99e6e6bd68bf02a7e67cd642e2c9ca1 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/cohere2_vision/modular_cohere2_vision.py @@ -0,0 +1,318 @@ +# coding=utf-8 +# Copyright 2025 the Cohere Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
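Cohere2VisionMultiModalProjector (defined above) first pixel-shuffles the vision tokens, trading sequence length for channel width by a factor of downsample_factor**2, then applies a SwiGLU projection. The standalone copy below only checks the shape bookkeeping; the 32x32 token grid and hidden size 1024 are assumed example values, not the shipped configuration.

import torch

def pixel_shuffle(x: torch.Tensor, downsample_factor: int) -> torch.Tensor:
    # Same reshuffling as Cohere2VisionMultiModalProjector.pixel_shuffle, kept standalone.
    batch, seq, dim = x.shape
    h = w = int(seq ** 0.5)                       # the projector assumes a square token grid
    x = x.reshape(batch, w, h, dim)
    x = x.reshape(batch, w, h // downsample_factor, dim * downsample_factor)
    x = x.permute(0, 2, 1, 3)
    x = x.reshape(batch, h // downsample_factor, w // downsample_factor, -1)
    return x.permute(0, 2, 1, 3)

feats = torch.randn(1, 1024, 1024)                # 32x32 vision tokens, hidden size 1024 (example values)
out = pixel_shuffle(feats, downsample_factor=2)
assert out.shape == (1, 16, 16, 4096)             # 4x fewer positions, 4x wider channels
# 4096 == vision hidden_size * downsample_factor**2, the in_features of linear_1; linear_1's
# output is then chunked in two for SwiGLU, so linear_2 receives intermediate_size // 2 features.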
+"""PyTorch AyaVision model.""" + +from functools import lru_cache +from typing import Optional, Union + +import numpy as np +import torch +from torch import nn + +from transformers.models.aya_vision.modeling_aya_vision import ( + AyaVisionCausalLMOutputWithPast, + AyaVisionForConditionalGeneration, + AyaVisionModel, + AyaVisionModelOutputWithPast, +) +from transformers.models.got_ocr2.image_processing_got_ocr2_fast import GotOcr2ImageProcessorFast + +from ...cache_utils import Cache +from ...modeling_flash_attention_utils import FlashAttentionKwargs +from ...processing_utils import Unpack +from ...utils import TransformersKwargs, auto_docstring, logging +from ...utils.generic import check_model_inputs +from .configuration_cohere2_vision import Cohere2VisionConfig + + +logger = logging.get_logger(__name__) + + +class Cohere2VisionMultiModalProjector(nn.Module): + def __init__(self, config: Cohere2VisionConfig): + super().__init__() + self.config = config + self.downsample_factor = config.downsample_factor + self.intermediate_size = config.alignment_intermediate_size + self.linear_1 = nn.Linear( + config.vision_config.hidden_size * (config.downsample_factor**2), self.intermediate_size, bias=True + ) + self.act = nn.SiLU() + self.linear_2 = nn.Linear(self.intermediate_size // 2, config.text_config.hidden_size, bias=True) + + def pixel_shuffle(self, image_features): # B, S, D + batch_size, seq_length, feature_dim = image_features.shape + height = width = int(seq_length**0.5) + image_features = image_features.reshape(image_features.shape[0], width, height, -1) + channels = image_features.shape[-1] + image_features = image_features.reshape( + batch_size, width, int(height / self.downsample_factor), int(channels * self.downsample_factor) + ) + image_features = image_features.permute(0, 2, 1, 3) + image_features = image_features.reshape( + batch_size, int(height / self.downsample_factor), int(width / self.downsample_factor), -1 + ) + image_features = image_features.permute(0, 2, 1, 3) + return image_features + + def forward(self, image_features): + image_features = self.pixel_shuffle(image_features) + hidden_states = self.linear_1(image_features) + + # Split along last dimension and apply SwiGLU + x, gate = hidden_states.chunk(2, dim=-1) + hidden_states = self.act(gate) * x + + hidden_states = self.linear_2(hidden_states) + return hidden_states + + +class Cohere2VisionModelOutputWithPast(AyaVisionModelOutputWithPast): + pass + + +class Cohere2VisionCausalLMOutputWithPast(AyaVisionCausalLMOutputWithPast): + pass + + +class Cohere2VisionModel(AyaVisionModel): + _checkpoint_conversion_mapping = {} + + def get_image_features(self, pixel_values: torch.FloatTensor): + """ + Obtains image last hidden states from the vision tower and apply multimodal projection. + + Args: + pixel_values (`torch.FloatTensor]` of shape `(batch_size, num_patches, channels, height, width)`) + The tensors corresponding to the input images. + Returns: + image_features (List[`torch.Tensor`]): List of image feature tensor, each contains all the visual feature of all patches + and are of shape `(num_patches, image_length, embed_dim)`). 
+ """ + + image_features = self.vision_tower(pixel_values, output_hidden_states=True) + selected_image_feature = image_features.last_hidden_state + image_features = self.multi_modal_projector(selected_image_feature) + return image_features + + @check_model_inputs + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[FlashAttentionKwargs], + ) -> Union[tuple, Cohere2VisionModelOutputWithPast]: + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings()(input_ids) + + if pixel_values is not None: + image_features = self.get_image_features(pixel_values) + image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) + special_image_mask = self.get_placeholder_mask( + input_ids, inputs_embeds=inputs_embeds, image_features=image_features + ) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) + + outputs = self.language_model( + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + + return Cohere2VisionModelOutputWithPast( + last_hidden_state=outputs.last_hidden_state, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + image_hidden_states=image_features if pixel_values is not None else None, + ) + + +class Cohere2VisionForConditionalGeneration(AyaVisionForConditionalGeneration): + _checkpoint_conversion_mapping = {} + + def get_image_features(self, pixel_values: torch.FloatTensor): + return self.model.get_image_features(pixel_values=pixel_values) + + @check_model_inputs + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + image_sizes: Optional[torch.Tensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> Union[tuple, Cohere2VisionCausalLMOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. 
+ + Example: + + ```python + >>> from transformers import AutoProcessor, Cohere2VisionForConditionalGeneration + >>> import torch + + >>> processor = AutoProcessor.from_pretrained("CohereLabs/command-a-vision-07-2025", use_fast=True) + >>> model = Cohere2VisionForConditionalGeneration.from_pretrained("CohereLabs/command-a-vision-07-2025", device_map="auto") + + >>> messages = [ + ... { + ... "role": "user", + ... "content": [ + ... { + ... "type": "image", + ... "url": "https://images.pexels.com/photos/1108099/pexels-photo-1108099.jpeg", + ... }, + ... {"type": "text", "text": "what is in this image?"}, + ... ], + ... }, + ... ] + + >>> inputs = processor.apply_chat_template( + ... messages, padding=True, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt", + ... ).to(model.device) + + >>> gen_tokens = model.generate(**inputs, max_new_tokens=300, do_sample=True, temperature=0.3) + >>> processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True) + ```""" + outputs = self.model( + input_ids=input_ids, + pixel_values=pixel_values, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + cache_position=cache_position, + image_sizes=image_sizes, + **kwargs, + ) + + hidden_states = outputs[0] + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) + + loss = None + if labels is not None: + loss = self.loss_function( + logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs + ) + + return Cohere2VisionCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + image_hidden_states=outputs.image_hidden_states, + ) + + +@lru_cache(maxsize=10) +def get_all_supported_aspect_ratios(max_image_tiles: int) -> list[tuple[int, int]]: + """ + Computes all allowed aspect ratios for a given maximum number of input tiles. + + This function calculates all possible arrangements of tiles that can be formed + within the constraint of the maximum number of tiles. Each arrangement is + represented by its aspect ratio (width/height) and the corresponding tile configuration. + + Args: + max_image_tiles (`int`): + The maximum number of tiles allowed. + + Returns: + `list[tuple[int, int]]`: A list of tuples, each tuple representing a valid (width, height) + configuration in terms of number of tiles. 
+ + Example: + >>> get_all_supported_aspect_ratios(4) + [(1, 1), (1, 2), (1, 3), (1, 4), (2, 1), (2, 2), (3, 1), (4, 1)] + + """ + aspect_ratios = [] + for width in range(1, max_image_tiles + 1): + for height in range(1, max_image_tiles + 1): + if width * height <= max_image_tiles: + aspect_ratios.append((width, height)) + return aspect_ratios + + +def get_optimal_tiled_canvas( + original_image_size: tuple[int, int], + target_tile_size: tuple[int, int], + min_image_tiles: int, + max_image_tiles: int, +) -> tuple[int, int]: + possible_resolutions = get_all_supported_aspect_ratios(max_image_tiles) + possible_resolutions = sorted(possible_resolutions, key=lambda x: x[0] * x[1]) + image_height, image_width = original_image_size + patch_size_height, patch_size_width = target_tile_size # (height == width) + + candidate_resolutions = np.array(possible_resolutions) * patch_size_height + original_size = np.stack([image_height, image_width]) + required_scales = candidate_resolutions / original_size + required_scale = np.min(required_scales, axis=-1, keepdims=True) # [n_resolutions, 1] + if np.all(required_scale < 1): + # We are forced to downscale, so try to minimize the amount of downscaling + best_grid = possible_resolutions[np.argmax(required_scale)] + else: + # Pick the resolution that required the least upscaling so that it most closely fits the image + required_scale = np.where(required_scale < 1.0, 10e9, required_scale) + best_grid = possible_resolutions[np.argmin(required_scale)] + return best_grid + + +@auto_docstring +class Cohere2VisionImageProcessorFast(GotOcr2ImageProcessorFast): + size = {"height": 512, "width": 512} + min_patches = 1 + max_patches = 12 + crop_to_patches = True + patch_size = 16 + + +__all__ = [ + "Cohere2VisionForConditionalGeneration", + "Cohere2VisionPreTrainedModel", # noqa: F822 + "Cohere2VisionModel", + "Cohere2VisionImageProcessorFast", +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/cohere2_vision/processing_cohere2_vision.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/cohere2_vision/processing_cohere2_vision.py new file mode 100644 index 0000000000000000000000000000000000000000..b72e1512ead9612c9e601101326f532ed3bf0d1a --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/cohere2_vision/processing_cohere2_vision.py @@ -0,0 +1,216 @@ +# coding=utf-8 +# Copyright 2025 HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
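get_optimal_tiled_canvas above enumerates every tile grid up to max_image_tiles and keeps the one needing the least rescaling, preferring canvases the image can be upscaled into over ones that force downscaling. The condensed restatement below reproduces that selection rule with explicit (columns, rows) grids; it is an illustration of the heuristic under the conventions stated in its comments, not the library function itself.

import numpy as np

def best_grid(image_hw: tuple[int, int], tile: int, max_tiles: int) -> tuple[int, int]:
    # All (columns, rows) grids whose tile count stays within the budget, smallest first.
    grids = sorted(
        [(c, r) for c in range(1, max_tiles + 1) for r in range(1, max_tiles + 1) if c * r <= max_tiles],
        key=lambda g: g[0] * g[1],
    )
    h, w = image_hw
    # Per-grid scale factor: how much the image must be scaled so its limiting axis fills the canvas.
    scales = np.array([[c * tile / w, r * tile / h] for c, r in grids]).min(axis=1)
    if np.all(scales < 1):                      # every canvas is smaller than the image: minimize downscaling
        return grids[int(np.argmax(scales))]
    scales = np.where(scales < 1, np.inf, scales)
    return grids[int(np.argmin(scales))]        # otherwise pick the canvas needing the least upscaling

# An 800x1600 (height, width) image with 512px tiles and at most 12 tiles selects a 4x2 grid,
# i.e. a 2048x1024 canvas that matches the image's 2:1 aspect ratio.
assert best_grid((800, 1600), tile=512, max_tiles=12) == (4, 2)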
+ +from typing import Optional, Union + +import numpy as np + +from ...image_processing_utils import BatchFeature +from ...image_utils import ImageInput +from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack +from ...tokenization_utils_base import PreTokenizedInput, TextInput + + +class Cohere2VisionImagesKwargs(ImagesKwargs, total=False): + max_patches: Optional[int] + + +class Cohere2VisionProcessorKwargs(ProcessingKwargs, total=False): + images_kwargs: Cohere2VisionImagesKwargs + _defaults = { + "text_kwargs": { + "padding_side": "left", + "padding": True, + "return_mm_token_type_ids": False, + }, + } + + +class Cohere2VisionProcessor(ProcessorMixin): + r""" + Constructs a Cohere2Vision processor which wraps a [`AutoImageProcessor`] and + [`PretrainedTokenizerFast`] tokenizer into a single processor that inherits both the image processor and + tokenizer functionalities. See the [`~Cohere2VisionProcessor.__call__`] and [`~Cohere2VisionProcessor.decode`] for more information. + Args: + image_processor ([`AutoImageProcessor`], *optional*): + The image processor is a required input. + tokenizer ([`PreTrainedTokenizer`, `PreTrainedTokenizerFast`], *optional*): + The tokenizer is a required input. + chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages + in a chat into a tokenizable string. + """ + + attributes = ["image_processor", "tokenizer"] + image_processor_class = "AutoImageProcessor" + tokenizer_class = "AutoTokenizer" + + def __init__( + self, + image_processor=None, + tokenizer=None, + chat_template=None, + **kwargs, + ): + super().__init__(image_processor, tokenizer, chat_template=chat_template) + + self.patch_size = self.image_processor.patch_size + self.boi_token = tokenizer.boi_token + self.eoi_token = tokenizer.eoi_token + self.image_token = tokenizer.image_token + self.img_line_break_token = tokenizer.img_line_break_token + self.image_token_id = tokenizer.image_token_id + + self.image_ids = tokenizer.convert_tokens_to_ids( + [ + self.image_token, + self.boi_token, + self.eoi_token, + self.img_line_break_token, + ] + ) + + def __call__( + self, + images: Optional[ImageInput] = None, + text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None, + **kwargs: Unpack[Cohere2VisionProcessorKwargs], + ) -> BatchFeature: + """ + Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` + and `kwargs` arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] to encode the text. + To prepare the vision inputs, this method forwards the `images` and `kwargs` arguments to + GotOcr2ImageProcessor's [`~GotOcr2ImageProcessor.__call__`] if `images` is not `None`. + + Args: + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. Both channels-first and channels-last formats are supported. + text (`str`, `list[str]`, `list[list[str]]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). 
+ return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + """ + if text is None: + raise ValueError("You have to specify text.") + elif not isinstance(text, (list, tuple)): + text = [text] + + output_kwargs = self._merge_kwargs( + Cohere2VisionProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + + # Process images + image_inputs = {} + if images is not None: + image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"]) + batch_num_patches = iter(image_inputs.pop("num_patches")) + processed_text = [] + for sample in text: + while self.image_token in sample: + num_patches = next(batch_num_patches) + img_patches_per_tile = int(self.patch_size**2) + + # Expand each image token into per-tile "<placeholder>" runs; the sentinel is swapped back to the image token below + img_string = f"{self.boi_token}" + for idx in range(1, num_patches): + img_string += "<placeholder>" * img_patches_per_tile + self.img_line_break_token + img_string += "<placeholder>" * img_patches_per_tile + self.img_line_break_token + img_string += f"{self.eoi_token}" + + sample = sample.replace(self.image_token, img_string, 1) + processed_text.append(sample) + text = [sample.replace("<placeholder>", self.image_token) for sample in processed_text] + + return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) + return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", False) + text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"], return_tensors=None) + + if return_mm_token_type_ids: + array_ids = np.array(text_inputs["input_ids"]) + mm_token_type_ids = np.zeros_like(text_inputs["input_ids"]) + mm_token_type_ids[np.isin(array_ids, self.image_ids)] = 1 + text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist() + + return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors) + + def _get_num_multimodal_tokens(self, image_sizes=None, **kwargs): + """ + Computes the number of placeholder tokens needed for multimodal inputs with the given sizes. + + Args: + image_sizes (`list[list[int]]`, *optional*): + The input sizes formatted as (height, width) per each image. + + Returns: + `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided + input modalities, along with other useful data.
+ """ + + vision_data = {} + if image_sizes is not None: + images_kwargs = Cohere2VisionProcessorKwargs._defaults.get("images_kwargs", {}) + images_kwargs.update(kwargs) + + num_image_patches = [ + self.image_processor.get_number_of_image_patches(*image_size, images_kwargs) + for image_size in image_sizes + ] + + token_per_patch = int(self.patch_size**2) + num_image_tokens = [ + 2 + sum(token_per_patch + 1 for _ in range(num_patches)) for num_patches in num_image_patches + ] # Add +2 and +1 for BOI/EOI and image break tokens + vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches}) + + return MultiModalData(**vision_data) + + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. + """ + return self.tokenizer.batch_decode(*args, **kwargs) + + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to + the docstring of this method for more information. + """ + return self.tokenizer.decode(*args, **kwargs) + + @property + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + return list(tokenizer_input_names) + list(image_processor_input_names) + + +__all__ = ["Cohere2VisionProcessor"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/colpali/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/colpali/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..fa1b63fd009803e27cc25ebe591ab4fa2cd32cb9 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/colpali/__init__.py @@ -0,0 +1,28 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_colpali import * + from .modeling_colpali import * + from .processing_colpali import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/colpali/configuration_colpali.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/colpali/configuration_colpali.py new file mode 100644 index 0000000000000000000000000000000000000000..be7eaf47b428ace15aa9923406450cca1892d66b --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/colpali/configuration_colpali.py @@ -0,0 +1,102 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""ColPali model configuration""" + +import logging +from copy import deepcopy + +from ...configuration_utils import PretrainedConfig +from ..auto import CONFIG_MAPPING, AutoConfig + + +logger = logging.getLogger(__name__) + + +class ColPaliConfig(PretrainedConfig): + r""" + Configuration class to store the configuration of a [`ColPaliForRetrieval`]. It is used to instantiate an instance + of `ColPaliForRetrieval` according to the specified arguments, defining the model architecture following the methodology + from the "ColPali: Efficient Document Retrieval with Vision Language Models" paper. + + Creating a configuration with the default settings will result in a configuration where the VLM backbone is set to the + default PaliGemma configuration, i.e the one from [vidore/colpali-v1.2](https://huggingface.co/vidore/colpali-v1.2). + + Note that contrarily to what the class name suggests (actually the name refers to the ColPali **methodology**), you can + use a different VLM backbone model than PaliGemma by passing the corresponding VLM configuration to the class constructor. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vlm_config (`PretrainedConfig`, *optional*): + Configuration of the VLM backbone model. + text_config (`PretrainedConfig`, *optional*): + Configuration of the text backbone model. Overrides the `text_config` attribute of the `vlm_config` if provided. + embedding_dim (`int`, *optional*, defaults to 128): + Dimension of the multi-vector embeddings produced by the model. + + Example: + + ```python + from transformers.models.colpali import ColPaliConfig, ColPaliForRetrieval + + config = ColPaliConfig() + model = ColPaliForRetrieval(config) + ``` + """ + + model_type = "colpali" + sub_configs = {"vlm_config": PretrainedConfig, "text_config": AutoConfig} + + def __init__( + self, + vlm_config=None, + text_config=None, + embedding_dim: int = 128, + **kwargs, + ): + if vlm_config is None: + vlm_config = CONFIG_MAPPING["paligemma"]() + logger.info( + "`vlm_config` is `None`. Initializing `vlm_config` with the `PaliGemmaConfig` with default values." + ) + elif isinstance(vlm_config, dict): + vlm_config = deepcopy(vlm_config) + if "model_type" not in vlm_config: + raise KeyError( + "The `model_type` key is missing in the `vlm_config` dictionary. Please provide the model type." + ) + elif vlm_config["model_type"] not in CONFIG_MAPPING: + raise ValueError( + f"The model type `{vlm_config['model_type']}` is not supported. Please provide a valid model type." + ) + vlm_config = CONFIG_MAPPING[vlm_config["model_type"]](**vlm_config) + elif not isinstance(vlm_config, PretrainedConfig): + raise TypeError( + f"Invalid type for `vlm_config`. Expected `PretrainedConfig`, `dict`, or `None`, but got {type(vlm_config)}." 
+ ) + + self.vlm_config = vlm_config + self.text_config = text_config if text_config is not None else vlm_config.text_config + if isinstance(self.text_config, dict): + text_config["model_type"] = text_config.get("model_type", "gemma") + self.text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) + + self.embedding_dim = embedding_dim + + super().__init__(**kwargs) + + +__all__ = ["ColPaliConfig"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/colpali/modeling_colpali.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/colpali/modeling_colpali.py new file mode 100644 index 0000000000000000000000000000000000000000..fe92252b9a8012beddbb778405124ea6e993c030 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/colpali/modeling_colpali.py @@ -0,0 +1,214 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch ColPali model""" + +from dataclasses import dataclass +from typing import Optional + +import torch +from torch import nn + +from transformers import AutoModelForImageTextToText + +from ...cache_utils import Cache +from ...modeling_utils import PreTrainedModel +from ...utils import ModelOutput, auto_docstring, can_return_tuple +from .configuration_colpali import ColPaliConfig + + +@auto_docstring +class ColPaliPreTrainedModel(PreTrainedModel): + config: ColPaliConfig + base_model_prefix = "model" + _no_split_modules = [] + _supports_sdpa = True + _supports_flash_attn = True + _supports_flex_attn = True + + def _init_weights(self, module): + std = ( + self.config.initializer_range + if hasattr(self.config, "initializer_range") + else self.config.vlm_config.text_config.initializer_range + ) + + if isinstance(module, (nn.Linear, nn.Conv2d)): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for ColPali embeddings output. + """ +) +class ColPaliForRetrievalOutput(ModelOutput): + r""" + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + The embeddings of the model. + past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache). + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. 
+ image_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`. + image_hidden_states of the model produced by the vision encoder after projecting last hidden state. + """ + + loss: Optional[torch.FloatTensor] = None + embeddings: Optional[torch.Tensor] = None + past_key_values: Optional[Cache] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + image_hidden_states: Optional[torch.FloatTensor] = None + + +@auto_docstring( + custom_intro=""" + The ColPali architecture leverages VLMs to construct efficient multi-vector embeddings directly + from document images (“screenshots”) for document retrieval. The model is trained to maximize the similarity + between these document embeddings and the corresponding query embeddings, using the late interaction method + introduced in ColBERT. + + Using ColPali removes the need for potentially complex and brittle layout recognition and OCR pipelines with a + single model that can take into account both the textual and visual content (layout, charts, etc.) of a document. + + ColPali is part of the ColVision model family, which was first introduced in the following paper: + [*ColPali: Efficient Document Retrieval with Vision Language Models*](https://huggingface.co/papers/2407.01449). + """ +) +class ColPaliForRetrieval(ColPaliPreTrainedModel): + _checkpoint_conversion_mapping = { + "vlm.language_model.model": "vlm.model.language_model", + "vlm.vision_tower": "vlm.model.vision_tower", + "vlm.multi_modal_projector": "vlm.model.multi_modal_projector", + "vlm.language_model.lm_head": "vlm.lm_head", + } + + def __init__(self, config: ColPaliConfig): + super().__init__(config) + self.config = config + self.vocab_size = config.vlm_config.text_config.vocab_size + + self.vlm = AutoModelForImageTextToText.from_config(config.vlm_config) + self._tied_weights_keys = [f"vlm.language_model.{k}" for k in (self.vlm._tied_weights_keys or [])] + + self.embedding_dim = self.config.embedding_dim + self.embedding_proj_layer = nn.Linear( + self.config.vlm_config.text_config.hidden_size, + self.embedding_dim, + ) + + self.post_init() + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ) -> ColPaliForRetrievalOutput: + if pixel_values is not None: + pixel_values = pixel_values.to(dtype=self.dtype) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vlm_output = self.vlm.model( + input_ids=input_ids, + attention_mask=attention_mask, + pixel_values=pixel_values, + output_hidden_states=True, + return_dict=True, + output_attentions=output_attentions, + **kwargs, + ) + vlm_hidden_states = vlm_output.hidden_states if output_hidden_states else None + vlm_image_hidden_states = vlm_output.image_hidden_states if pixel_values is not None else None + + last_hidden_states = vlm_output[0] # (batch_size, sequence_length, hidden_size) + proj_dtype = 
self.embedding_proj_layer.weight.dtype + embeddings = self.embedding_proj_layer(last_hidden_states.to(proj_dtype)) # (batch_size, sequence_length, dim) + + # L2 normalization + embeddings = embeddings / embeddings.norm(dim=-1, keepdim=True) # (batch_size, sequence_length, dim) + + if attention_mask is not None: + embeddings = embeddings * attention_mask.unsqueeze(-1) # (batch_size, sequence_length, dim) + + return ColPaliForRetrievalOutput( + embeddings=embeddings, + past_key_values=vlm_output.past_key_values, + hidden_states=vlm_hidden_states, + attentions=vlm_output.attentions, + image_hidden_states=vlm_image_hidden_states, + ) + + def get_input_embeddings(self): + return self.vlm.get_input_embeddings() + + def set_input_embeddings(self, value): + self.vlm.set_input_embeddings(value) + + def get_output_embeddings(self): + return self.vlm.get_output_embeddings() + + def set_output_embeddings(self, new_embeddings): + self.vlm.set_output_embeddings(new_embeddings) + + def tie_weights(self): + return self.vlm.tie_weights() + + def resize_token_embeddings( + self, + new_num_tokens: Optional[int] = None, + pad_to_multiple_of: Optional[int] = None, + mean_resizing: bool = True, + ) -> nn.Embedding: + model_embeds = self.vlm.resize_token_embeddings( + new_num_tokens=new_num_tokens, + pad_to_multiple_of=pad_to_multiple_of, + mean_resizing=mean_resizing, + ) + + self.config.vlm_config.text_config.vocab_size = model_embeds.num_embeddings + self.config.vlm_config.vocab_size = model_embeds.num_embeddings + self.vlm.vocab_size = model_embeds.num_embeddings + self.vocab_size = model_embeds.num_embeddings + + return model_embeds + + +__all__ = [ + "ColPaliForRetrieval", + "ColPaliPreTrainedModel", +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/colpali/modular_colpali.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/colpali/modular_colpali.py new file mode 100644 index 0000000000000000000000000000000000000000..cf28475f4b3cd9298b958277678489f91c6b5c4d --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/colpali/modular_colpali.py @@ -0,0 +1,344 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
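The ColPali head above turns every sequence position into a 128-dim, L2-normalized vector, and retrieval scores are computed with the ColBERT-style late-interaction (MaxSim) rule that ColPaliProcessor.score_retrieval below implements in batched form. A toy, unbatched sketch of that scoring rule, with assumed example shapes:

import torch

def maxsim_scores(queries: torch.Tensor, passages: torch.Tensor) -> torch.Tensor:
    # queries:  (n_queries,  query_len,   dim)  multi-vector embeddings
    # passages: (n_passages, passage_len, dim)
    # For every query token take its best-matching passage token, then sum those maxima over query tokens.
    sim = torch.einsum("bnd,csd->bcns", queries, passages)   # all token-pair dot products
    return sim.max(dim=3).values.sum(dim=2)                  # (n_queries, n_passages)

# Toy example with random unit vectors and embedding_dim=128 as in ColPaliConfig.
q = torch.nn.functional.normalize(torch.randn(2, 16, 128), dim=-1)
p = torch.nn.functional.normalize(torch.randn(3, 40, 128), dim=-1)
assert maxsim_scores(q, p).shape == (2, 3)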
+ + +from typing import Optional, Union + +from transformers.models.paligemma.processing_paligemma import IMAGE_TOKEN, PaliGemmaProcessor, build_string_from_input + +from ...feature_extraction_utils import BatchFeature +from ...image_utils import ImageInput, make_flat_list_of_images +from ...processing_utils import ProcessingKwargs, Unpack +from ...tokenization_utils_base import PreTokenizedInput, TextInput +from ...utils import is_torch_available, logging + + +if is_torch_available(): + import torch + +logger = logging.get_logger(__name__) + + +class ColPaliProcessorKwargs(ProcessingKwargs, total=False): + _defaults = { + "text_kwargs": { + "padding": "longest", + }, + "images_kwargs": { + "data_format": "channels_first", + "do_convert_rgb": True, + }, + "common_kwargs": {"return_tensors": "pt"}, + } + + +class ColPaliProcessor(PaliGemmaProcessor): + r""" + Constructs a ColPali processor which wraps a PaliGemmaProcessor and special methods to process images and queries, as + well as to compute the late-interaction retrieval score. + + [`ColPaliProcessor`] offers all the functionalities of [`PaliGemmaProcessor`]. See the [`~PaliGemmaProcessor.__call__`] + for more information. + + Args: + image_processor ([`SiglipImageProcessor`], *optional*): + The image processor is a required input. + tokenizer ([`LlamaTokenizerFast`], *optional*): + The tokenizer is a required input. + chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages + in a chat into a tokenizable string. + visual_prompt_prefix (`str`, *optional*, defaults to `"Describe the image."`): + A string that gets tokenized and prepended to the image tokens. + query_prefix (`str`, *optional*, defaults to `"Question: "`): + A prefix to be used for the query. + """ + + def __init__( + self, + image_processor=None, + tokenizer=None, + chat_template=None, + visual_prompt_prefix: str = "Describe the image.", + query_prefix: str = "Question: ", + ): + super().__init__(image_processor=image_processor, tokenizer=tokenizer, chat_template=chat_template) + self.visual_prompt_prefix = visual_prompt_prefix + self.query_prefix = query_prefix + + @property + def query_augmentation_token(self) -> str: + """ + Return the query augmentation token. + + Query augmentation buffers are used as reasoning buffers during inference. + """ + return self.tokenizer.pad_token + + def __call__( + self, + images: Optional[ImageInput] = None, + text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, + audio=None, + videos=None, + **kwargs: Unpack[ColPaliProcessorKwargs], + ) -> BatchFeature: + """ + Main method to prepare for the model either (1) one or several texts, either (2) one or several image(s). This method is a custom + wrapper around the PaliGemmaProcessor's [`~PaliGemmaProcessor.__call__`] method adapted for the ColPali model. It cannot process + both text and images at the same time. + + When preparing the text(s), this method forwards the `text` and `kwargs` arguments to LlamaTokenizerFast's + [`~LlamaTokenizerFast.__call__`]. + When preparing the image(s), this method forwards the `images` and `kwargs` arguments to SiglipImageProcessor's + [`~SiglipImageProcessor.__call__`]. + Please refer to the docstring of the above two methods for more information. + + Args: + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`): + The image or batch of images to be prepared. 
Each image can be a PIL image, NumPy array or PyTorch + tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a + number of channels, H and W are image height and width. + text (`str`, `list[str]`, `list[list[str]]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + """ + output_kwargs = self._merge_kwargs( + ColPaliProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + suffix = output_kwargs["text_kwargs"].pop("suffix", None) + + return_token_type_ids = suffix is not None + + if text is None and images is None: + raise ValueError("Either text or images must be provided") + if text is not None and images is not None: + raise ValueError("Only one of text or images can be processed at a time") + + if images is not None: + images = self.image_processor.fetch_images(images) + images = make_flat_list_of_images(images) + texts_doc = [self.visual_prompt_prefix] * len(images) + images = [image.convert("RGB") for image in images] + + input_strings = [ + build_string_from_input( + prompt=prompt, + bos_token=self.tokenizer.bos_token, + image_seq_len=self.image_seq_length, + image_token=IMAGE_TOKEN, + num_images=len(image_list) if isinstance(image_list, list) else 1, + ) + for prompt, image_list in zip(texts_doc, images) + ] + pixel_values = self.image_processor(images, **output_kwargs["images_kwargs"])["pixel_values"] + + # max_length has to account for the image tokens + if output_kwargs["text_kwargs"].get("max_length", None) is not None: + output_kwargs["text_kwargs"]["max_length"] += self.image_seq_length + + inputs = self.tokenizer( + input_strings, + return_token_type_ids=False, + **output_kwargs["text_kwargs"], + ) + + return_data = {**inputs, "pixel_values": pixel_values} + + if return_token_type_ids: + labels = inputs["input_ids"].masked_fill(inputs["token_type_ids"] == 0, -100) + return_data.update({"labels": labels}) + + return BatchFeature(data=return_data) + + elif text is not None: + if isinstance(text, str): + text = [text] + elif not (isinstance(text, list) and isinstance(text[0], str)): + raise ValueError("Text must be a string or a list of strings") + + if suffix is None: + suffix = self.query_augmentation_token * 10 + + texts_query: list[str] = [] + for query in text: + query = self.tokenizer.bos_token + self.query_prefix + query + suffix + "\n" + texts_query.append(query) + + output_kwargs["text_kwargs"]["max_length"] = output_kwargs["text_kwargs"].get("max_length", 50) + + 
batch_query = self.tokenizer( + texts_query, + return_token_type_ids=False, + **output_kwargs["text_kwargs"], + ) + + return batch_query + + def process_images( + self, + images: Optional[ImageInput] = None, + **kwargs: Unpack[ColPaliProcessorKwargs], + ) -> BatchFeature: + """ + Prepare for the model one or several image(s). This method is a wrapper around the `__call__` method of the ColPaliProcessor's + [`ColPaliProcessor.__call__`]. + + This method forwards the `images` and `kwargs` arguments to the image processor. + + Args: + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a + number of channels, H and W are image height and width. + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + """ + return self.__call__(images=images, **kwargs) + + def process_queries( + self, + text: Union[TextInput, list[TextInput]], + **kwargs: Unpack[ColPaliProcessorKwargs], + ) -> BatchFeature: + """ + Prepare for the model one or several texts. This method is a wrapper around the `__call__` method of the ColPaliProcessor's + [`ColPaliProcessor.__call__`]. + + This method forwards the `text` and `kwargs` arguments to the tokenizer. + + Args: + text (`str`, `list[str]`, `list[list[str]]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). 
+ """ + return self.__call__(text=text, **kwargs) + + def score_retrieval( + self, + query_embeddings: Union["torch.Tensor", list["torch.Tensor"]], + passage_embeddings: Union["torch.Tensor", list["torch.Tensor"]], + batch_size: int = 128, + output_dtype: Optional["torch.dtype"] = None, + output_device: Union["torch.device", str] = "cpu", + ) -> "torch.Tensor": + """ + Compute the late-interaction/MaxSim score (ColBERT-like) for the given multi-vector + query embeddings (`qs`) and passage embeddings (`ps`). For ColPali, a passage is the + image of a document page. + + Because the embedding tensors are multi-vector and can thus have different shapes, they + should be fed as: + (1) a list of tensors, where the i-th tensor is of shape (sequence_length_i, embedding_dim) + (2) a single tensor of shape (n_passages, max_sequence_length, embedding_dim) -> usually + obtained by padding the list of tensors. + + Args: + query_embeddings (`Union[torch.Tensor, list[torch.Tensor]`): Query embeddings. + passage_embeddings (`Union[torch.Tensor, list[torch.Tensor]`): Passage embeddings. + batch_size (`int`, *optional*, defaults to 128): Batch size for computing scores. + output_dtype (`torch.dtype`, *optional*, defaults to `torch.float32`): The dtype of the output tensor. + If `None`, the dtype of the input embeddings is used. + output_device (`torch.device` or `str`, *optional*, defaults to "cpu"): The device of the output tensor. + + Returns: + `torch.Tensor`: A tensor of shape `(n_queries, n_passages)` containing the scores. The score + tensor is saved on the "cpu" device. + """ + + if len(query_embeddings) == 0: + raise ValueError("No queries provided") + if len(passage_embeddings) == 0: + raise ValueError("No passages provided") + + if query_embeddings[0].device != passage_embeddings[0].device: + raise ValueError("Queries and passages must be on the same device") + + if query_embeddings[0].dtype != passage_embeddings[0].dtype: + raise ValueError("Queries and passages must have the same dtype") + + if output_dtype is None: + output_dtype = query_embeddings[0].dtype + + scores: list[torch.Tensor] = [] + + for i in range(0, len(query_embeddings), batch_size): + batch_scores: list[torch.Tensor] = [] + batch_queries = torch.nn.utils.rnn.pad_sequence( + query_embeddings[i : i + batch_size], batch_first=True, padding_value=0 + ) + for j in range(0, len(passage_embeddings), batch_size): + batch_passages = torch.nn.utils.rnn.pad_sequence( + passage_embeddings[j : j + batch_size], batch_first=True, padding_value=0 + ) + batch_scores.append( + torch.einsum("bnd,csd->bcns", batch_queries, batch_passages).max(dim=3)[0].sum(dim=2) + ) + scores.append(torch.cat(batch_scores, dim=1).to(output_dtype).to(output_device)) + + return torch.cat(scores, dim=0) + + +__all__ = [ + "ColPaliProcessor", +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/colpali/processing_colpali.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/colpali/processing_colpali.py new file mode 100644 index 0000000000000000000000000000000000000000..b3f758a00006cb8abec225b3308d2a30594a781a --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/colpali/processing_colpali.py @@ -0,0 +1,410 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/colpali/modular_colpali.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. 
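The late-interaction scoring in `score_retrieval` above reduces, for each query/passage pair, to a MaxSim: every query token is matched to its best passage token by dot product, and the maxima are summed over query tokens. A minimal sketch for a single pair (the shapes are illustrative, not taken from this file):

```python
import torch

# One query with 12 token embeddings and one passage (document page) with 700,
# both with embedding_dim = 128; purely illustrative sizes.
query = torch.randn(12, 128)
passage = torch.randn(700, 128)

sim = query @ passage.T              # (12, 700) token-to-token similarities
score = sim.max(dim=1).values.sum()  # best passage token per query token, summed

# The batched einsum in score_retrieval ("bnd,csd->bcns", then .max(dim=3) and
# .sum(dim=2)) computes this same quantity for every query/passage combination.
```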
If any change should be done, please apply the change to the +# modular_colpali.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Optional, Union + +from ...feature_extraction_utils import BatchFeature +from ...image_utils import ImageInput, make_flat_list_of_images +from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack +from ...tokenization_utils_base import AddedToken, PreTokenizedInput, TextInput +from ...utils import is_torch_available + + +if is_torch_available(): + import torch + + +class ColPaliProcessorKwargs(ProcessingKwargs, total=False): + _defaults = { + "text_kwargs": { + "padding": "longest", + }, + "images_kwargs": { + "data_format": "channels_first", + "do_convert_rgb": True, + }, + "common_kwargs": {"return_tensors": "pt"}, + } + + +IMAGE_TOKEN = "" +EXTRA_TOKENS = [f"4}>" for i in range(1024)] + [f"3}>" for i in range(128)] + + +def build_string_from_input(prompt, bos_token, image_seq_len, image_token, num_images): + """ + Builds a string from the input prompt and image tokens. + For example, for the call: + build_string_from_input( + prompt="Prefix str" + bos_token="", + image_seq_len=3, + image_token="", + ) + The output will be: + "Initial str" + Args: + prompt (`list[Union[str, ImageInput]]`): The input prompt. + bos_token (`str`): The beginning of sentence token. + image_seq_len (`int`): The length of the image sequence. + image_token (`str`): The image token. + num_images (`int`): Number of images in the prompt. + """ + return f"{image_token * image_seq_len * num_images}{bos_token}{prompt}\n" + + +class ColPaliProcessor(ProcessorMixin): + r""" + Constructs a ColPali processor which wraps a PaliGemmaProcessor and special methods to process images and queries, as + well as to compute the late-interaction retrieval score. + + [`ColPaliProcessor`] offers all the functionalities of [`PaliGemmaProcessor`]. See the [`~PaliGemmaProcessor.__call__`] + for more information. + + Args: + image_processor ([`SiglipImageProcessor`], *optional*): + The image processor is a required input. + tokenizer ([`LlamaTokenizerFast`], *optional*): + The tokenizer is a required input. + chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages + in a chat into a tokenizable string. + visual_prompt_prefix (`str`, *optional*, defaults to `"Describe the image."`): + A string that gets tokenized and prepended to the image tokens. + query_prefix (`str`, *optional*, defaults to `"Question: "`): + A prefix to be used for the query. 
+ """ + + attributes = ["image_processor", "tokenizer"] + image_processor_class = ("SiglipImageProcessor", "SiglipImageProcessorFast") + tokenizer_class = ("GemmaTokenizer", "GemmaTokenizerFast") + + def __init__( + self, + image_processor=None, + tokenizer=None, + chat_template=None, + visual_prompt_prefix: str = "Describe the image.", + query_prefix: str = "Question: ", + ): + super().__init__(image_processor, tokenizer, chat_template=chat_template) + if not hasattr(image_processor, "image_seq_length"): + raise ValueError("Image processor is missing an `image_seq_length` attribute.") + + self.image_seq_length = image_processor.image_seq_length + + if not hasattr(tokenizer, "image_token"): + image_token = AddedToken(IMAGE_TOKEN, normalized=False, special=True) + tokens_to_add = {"additional_special_tokens": [image_token]} + tokenizer.add_special_tokens(tokens_to_add) + self.image_token_id = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN) + self.image_token = IMAGE_TOKEN + else: + self.image_token_id = tokenizer.image_token_id + self.image_token = tokenizer.image_token + + tokenizer.add_tokens(EXTRA_TOKENS) + tokenizer.add_bos_token = False + tokenizer.add_eos_token = False + self.visual_prompt_prefix = visual_prompt_prefix + self.query_prefix = query_prefix + + def __call__( + self, + images: Optional[ImageInput] = None, + text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, + audio=None, + videos=None, + **kwargs: Unpack[ColPaliProcessorKwargs], + ) -> BatchFeature: + """ + Main method to prepare for the model either (1) one or several texts, either (2) one or several image(s). This method is a custom + wrapper around the PaliGemmaProcessor's [`~PaliGemmaProcessor.__call__`] method adapted for the ColPali model. It cannot process + both text and images at the same time. + + When preparing the text(s), this method forwards the `text` and `kwargs` arguments to LlamaTokenizerFast's + [`~LlamaTokenizerFast.__call__`]. + When preparing the image(s), this method forwards the `images` and `kwargs` arguments to SiglipImageProcessor's + [`~SiglipImageProcessor.__call__`]. + Please refer to the docstring of the above two methods for more information. + + Args: + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a + number of channels, H and W are image height and width. + text (`str`, `list[str]`, `list[list[str]]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. 
+ - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + """ + output_kwargs = self._merge_kwargs( + ColPaliProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + suffix = output_kwargs["text_kwargs"].pop("suffix", None) + + return_token_type_ids = suffix is not None + + if text is None and images is None: + raise ValueError("Either text or images must be provided") + if text is not None and images is not None: + raise ValueError("Only one of text or images can be processed at a time") + + if images is not None: + images = self.image_processor.fetch_images(images) + images = make_flat_list_of_images(images) + texts_doc = [self.visual_prompt_prefix] * len(images) + images = [image.convert("RGB") for image in images] + + input_strings = [ + build_string_from_input( + prompt=prompt, + bos_token=self.tokenizer.bos_token, + image_seq_len=self.image_seq_length, + image_token=IMAGE_TOKEN, + num_images=len(image_list) if isinstance(image_list, list) else 1, + ) + for prompt, image_list in zip(texts_doc, images) + ] + pixel_values = self.image_processor(images, **output_kwargs["images_kwargs"])["pixel_values"] + + # max_length has to account for the image tokens + if output_kwargs["text_kwargs"].get("max_length", None) is not None: + output_kwargs["text_kwargs"]["max_length"] += self.image_seq_length + + inputs = self.tokenizer( + input_strings, + return_token_type_ids=False, + **output_kwargs["text_kwargs"], + ) + + return_data = {**inputs, "pixel_values": pixel_values} + + if return_token_type_ids: + labels = inputs["input_ids"].masked_fill(inputs["token_type_ids"] == 0, -100) + return_data.update({"labels": labels}) + + return BatchFeature(data=return_data) + + elif text is not None: + if isinstance(text, str): + text = [text] + elif not (isinstance(text, list) and isinstance(text[0], str)): + raise ValueError("Text must be a string or a list of strings") + + if suffix is None: + suffix = self.query_augmentation_token * 10 + + texts_query: list[str] = [] + for query in text: + query = self.tokenizer.bos_token + self.query_prefix + query + suffix + "\n" + texts_query.append(query) + + output_kwargs["text_kwargs"]["max_length"] = output_kwargs["text_kwargs"].get("max_length", 50) + + batch_query = self.tokenizer( + texts_query, + return_token_type_ids=False, + **output_kwargs["text_kwargs"], + ) + + return batch_query + + def _get_num_multimodal_tokens(self, image_sizes=None, **kwargs): + """ + Computes the number of placeholder tokens needed for multimodal inputs with the given sizes. + + Args: + image_sizes (list[list[str]], *optional*): + The input sizes formatted as (height, width) per each image. + Returns: + `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided + input modalities, along with other useful data. + """ + vision_data = {} + if image_sizes is not None: + num_image_tokens = [self.image_seq_length] * len(image_sizes) + num_image_patches = [1] * len(image_sizes) + vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches}) + return MultiModalData(**vision_data) + + @property + def query_augmentation_token(self) -> str: + """ + Return the query augmentation token. 
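For intuition, a rough sketch of the two strings this processor ends up tokenizing follows; the literal `<image>`, `<bos>` and `<pad>` token strings are assumptions based on the PaliGemma/Gemma defaults rather than values shown in this file:

```python
# Image branch: build_string_from_input prepends image_seq_length image tokens,
# then the BOS token and the visual prompt prefix.
image_token = "<image>"   # assumed PaliGemma image token
bos_token = "<bos>"       # assumed Gemma BOS token
image_seq_length = 3      # illustrative; real checkpoints use a much larger value

visual_prompt = f"{image_token * image_seq_length}{bos_token}Describe the image.\n"
# -> "<image><image><image><bos>Describe the image.\n"

# Text branch: the query is wrapped with the query prefix and followed by ten
# query-augmentation (pad) tokens that act as reasoning buffers.
pad_token = "<pad>"       # assumed Gemma pad token (query_augmentation_token)
augmented_query = bos_token + "Question: " + "What does the chart show?" + pad_token * 10 + "\n"
```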
+ + Query augmentation buffers are used as reasoning buffers during inference. + """ + return self.tokenizer.pad_token + + def process_images( + self, + images: Optional[ImageInput] = None, + **kwargs: Unpack[ColPaliProcessorKwargs], + ) -> BatchFeature: + """ + Prepare for the model one or several image(s). This method is a wrapper around the `__call__` method of the ColPaliProcessor's + [`ColPaliProcessor.__call__`]. + + This method forwards the `images` and `kwargs` arguments to the image processor. + + Args: + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a + number of channels, H and W are image height and width. + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + """ + return self.__call__(images=images, **kwargs) + + def process_queries( + self, + text: Union[TextInput, list[TextInput]], + **kwargs: Unpack[ColPaliProcessorKwargs], + ) -> BatchFeature: + """ + Prepare for the model one or several texts. This method is a wrapper around the `__call__` method of the ColPaliProcessor's + [`ColPaliProcessor.__call__`]. + + This method forwards the `text` and `kwargs` arguments to the tokenizer. + + Args: + text (`str`, `list[str]`, `list[list[str]]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). 
+ """ + return self.__call__(text=text, **kwargs) + + def score_retrieval( + self, + query_embeddings: Union["torch.Tensor", list["torch.Tensor"]], + passage_embeddings: Union["torch.Tensor", list["torch.Tensor"]], + batch_size: int = 128, + output_dtype: Optional["torch.dtype"] = None, + output_device: Union["torch.device", str] = "cpu", + ) -> "torch.Tensor": + """ + Compute the late-interaction/MaxSim score (ColBERT-like) for the given multi-vector + query embeddings (`qs`) and passage embeddings (`ps`). For ColPali, a passage is the + image of a document page. + + Because the embedding tensors are multi-vector and can thus have different shapes, they + should be fed as: + (1) a list of tensors, where the i-th tensor is of shape (sequence_length_i, embedding_dim) + (2) a single tensor of shape (n_passages, max_sequence_length, embedding_dim) -> usually + obtained by padding the list of tensors. + + Args: + query_embeddings (`Union[torch.Tensor, list[torch.Tensor]`): Query embeddings. + passage_embeddings (`Union[torch.Tensor, list[torch.Tensor]`): Passage embeddings. + batch_size (`int`, *optional*, defaults to 128): Batch size for computing scores. + output_dtype (`torch.dtype`, *optional*, defaults to `torch.float32`): The dtype of the output tensor. + If `None`, the dtype of the input embeddings is used. + output_device (`torch.device` or `str`, *optional*, defaults to "cpu"): The device of the output tensor. + + Returns: + `torch.Tensor`: A tensor of shape `(n_queries, n_passages)` containing the scores. The score + tensor is saved on the "cpu" device. + """ + + if len(query_embeddings) == 0: + raise ValueError("No queries provided") + if len(passage_embeddings) == 0: + raise ValueError("No passages provided") + + if query_embeddings[0].device != passage_embeddings[0].device: + raise ValueError("Queries and passages must be on the same device") + + if query_embeddings[0].dtype != passage_embeddings[0].dtype: + raise ValueError("Queries and passages must have the same dtype") + + if output_dtype is None: + output_dtype = query_embeddings[0].dtype + + scores: list[torch.Tensor] = [] + + for i in range(0, len(query_embeddings), batch_size): + batch_scores: list[torch.Tensor] = [] + batch_queries = torch.nn.utils.rnn.pad_sequence( + query_embeddings[i : i + batch_size], batch_first=True, padding_value=0 + ) + for j in range(0, len(passage_embeddings), batch_size): + batch_passages = torch.nn.utils.rnn.pad_sequence( + passage_embeddings[j : j + batch_size], batch_first=True, padding_value=0 + ) + batch_scores.append( + torch.einsum("bnd,csd->bcns", batch_queries, batch_passages).max(dim=3)[0].sum(dim=2) + ) + scores.append(torch.cat(batch_scores, dim=1).to(output_dtype).to(output_device)) + + return torch.cat(scores, dim=0) + + +__all__ = ["ColPaliProcessor"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/colqwen2/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/colqwen2/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d313293c0a0a7b5c56ac5bec1e54e8eb1e5a6e4a --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/colqwen2/__init__.py @@ -0,0 +1,28 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
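A hedged end-to-end sketch of how this processor is typically paired with `ColPaliForRetrieval` (the checkpoint name is an assumption; substitute any ColPali checkpoint that ships this processor):

```python
import torch

from transformers import ColPaliForRetrieval, ColPaliProcessor

model_name = "vidore/colpali-v1.2-hf"  # assumed checkpoint name, for illustration only
processor = ColPaliProcessor.from_pretrained(model_name)
model = ColPaliForRetrieval.from_pretrained(model_name)

batch_queries = processor.process_queries(["What is the total revenue in 2022?"])
with torch.no_grad():
    query_embeddings = model(**batch_queries).embeddings

# Given passage embeddings computed the same way from process_images(...),
# processor.score_retrieval(query_embeddings, passage_embeddings) returns the
# (n_queries, n_passages) MaxSim score matrix.
```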
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_colqwen2 import * + from .modeling_colqwen2 import * + from .processing_colqwen2 import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/colqwen2/configuration_colqwen2.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/colqwen2/configuration_colqwen2.py new file mode 100644 index 0000000000000000000000000000000000000000..21f6e46f1f0037a58503b7a0432f6c3f16657f58 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/colqwen2/configuration_colqwen2.py @@ -0,0 +1,92 @@ +# Copyright 2025 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from copy import deepcopy +from typing import Any + +from ...configuration_utils import PretrainedConfig +from ...utils import logging +from ..auto import CONFIG_MAPPING + + +logger = logging.get_logger(__name__) + + +class ColQwen2Config(PretrainedConfig): + r""" + Configuration class to store the configuration of a [`ColQ2en2ForRetrieval`]. It is used to instantiate an instance + of `ColQwen2ForRetrieval` according to the specified arguments, defining the model architecture following the methodology + from the "ColPali: Efficient Document Retrieval with Vision Language Models" paper. + + Instantiating a configuration with the defaults will yield a similar configuration to the vision encoder used by the pre-trained + ColQwen2-v1.0 model, e.g. [vidore/colqwen2-v1.0-hf](https://huggingface.co/vidore/colqwen2-v1.0-hf). + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vlm_config (`PretrainedConfig`, *optional*): + Configuration of the VLM backbone model. + embedding_dim (`int`, *optional*, defaults to 128): + Dimension of the multi-vector embeddings produced by the model. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 
+ Example: + + ```python + from transformers.models.colqwen2 import ColQwen2Config, ColQwen2ForRetrieval + + config = ColQwen2Config() + model = ColQwen2ForRetrieval(config) + ``` + """ + + model_type = "colqwen2" + sub_configs: dict[str, Any] = {"vlm_config": PretrainedConfig} + + def __init__( + self, + vlm_config=None, + embedding_dim: int = 128, + initializer_range: float = 0.02, + **kwargs, + ): + if vlm_config is None: + vlm_config = CONFIG_MAPPING["qwen2_vl"]() + logger.info( + "`vlm_config` is `None`. Initializing `vlm_config` with the `Qwen2VLConfig` with default values." + ) + elif isinstance(vlm_config, dict): + vlm_config = deepcopy(vlm_config) + if "model_type" not in vlm_config: + raise KeyError( + "The `model_type` key is missing in the `vlm_config` dictionary. Please provide the model type." + ) + vlm_config = CONFIG_MAPPING[vlm_config["model_type"]](**vlm_config) + elif not isinstance(vlm_config, PretrainedConfig): + raise TypeError( + f"Invalid type for `vlm_config`. Expected `PretrainedConfig`, `dict`, or `None`, but got {type(vlm_config)}." + ) + + self.vlm_config = vlm_config + self.embedding_dim = embedding_dim + self.initializer_range = initializer_range + super().__init__(**kwargs) + + def get_text_config(self, *args, **kwargs) -> PretrainedConfig: + return self.vlm_config.get_text_config(*args, **kwargs) + + +__all__ = ["ColQwen2Config"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/colqwen2/modeling_colqwen2.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/colqwen2/modeling_colqwen2.py new file mode 100644 index 0000000000000000000000000000000000000000..fc0f585531aedd4703bac149ebef450fd529c548 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/colqwen2/modeling_colqwen2.py @@ -0,0 +1,248 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/colqwen2/modular_colqwen2.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_colqwen2.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2025 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
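A short sketch of how the `vlm_config` argument of `ColQwen2Config` is resolved; passing a plain dict requires a `model_type` key so the matching sub-config class can be looked up through `CONFIG_MAPPING`:

```python
from transformers.models.colqwen2 import ColQwen2Config

# A dict is deep-copied and routed through CONFIG_MAPPING via its "model_type" key.
config = ColQwen2Config(
    vlm_config={"model_type": "qwen2_vl"},
    embedding_dim=128,
)

# get_text_config() simply delegates to the wrapped Qwen2-VL configuration.
text_config = config.get_text_config()
```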
+ +from dataclasses import dataclass +from typing import Optional + +from torch import nn + +from transformers import AutoModelForImageTextToText + +from ...cache_utils import Cache +from ...modeling_utils import PreTrainedModel +from ...utils import ModelOutput, auto_docstring, can_return_tuple, is_torch_available +from .configuration_colqwen2 import ColQwen2Config + + +if is_torch_available(): + import torch + + +@auto_docstring +class ColQwen2PreTrainedModel(PreTrainedModel): + config: ColQwen2Config + base_model_prefix = "model" + _no_split_modules = [] + _supports_sdpa = True + _supports_flash_attn = True + _supports_flex_attn = True + + def _init_weights(self, module): + std = ( + self.config.initializer_range + if hasattr(self.config, "initializer_range") + else self.config.vlm_config.text_config.initializer_range + ) + + if isinstance(module, (nn.Linear, nn.Conv2d)): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for ColQwen2 embeddings output. + """ +) +class ColQwen2ForRetrievalOutput(ModelOutput): + r""" + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + The embeddings of the model. + past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache). + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + """ + + loss: Optional[torch.FloatTensor] = None + embeddings: Optional[torch.Tensor] = None + past_key_values: Optional[Cache] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + + +@auto_docstring( + custom_intro=""" + Following the ColPali approach, ColQwen2 leverages VLMs to construct efficient multi-vector embeddings directly + from document images (“screenshots”) for document retrieval. The model is trained to maximize the similarity + between these document embeddings and the corresponding query embeddings, using the late interaction method + introduced in ColBERT. + + Using ColQwen2 removes the need for potentially complex and brittle layout recognition and OCR pipelines with + a single model that can take into account both the textual and visual content (layout, charts, ...) of a document. + + ColQwen2 is part of the ColVision model family, which was introduced with ColPali in the following paper: + [*ColPali: Efficient Document Retrieval with Vision Language Models*](https://huggingface.co/papers/2407.01449). 
+ """ +) +class ColQwen2ForRetrieval(ColQwen2PreTrainedModel): + _checkpoint_conversion_mapping = {} + + def __init__(self, config: ColQwen2Config): + super().__init__(config) + self.config = config + self.vocab_size = config.vlm_config.text_config.vocab_size + + self.vlm = AutoModelForImageTextToText.from_config(config.vlm_config) + + self.embedding_dim = self.config.embedding_dim + self.embedding_proj_layer = nn.Linear( + self.config.vlm_config.text_config.hidden_size, + self.embedding_dim, + ) + self._tied_weights_keys = [f"vlm.{k}" for k in (self.vlm._tied_weights_keys or [])] + + self.post_init() + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + labels: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + pixel_values: Optional[torch.Tensor] = None, + image_grid_thw: Optional[torch.LongTensor] = None, + cache_position: Optional[torch.LongTensor] = None, + ) -> ColQwen2ForRetrievalOutput: + r""" + image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*): + The temporal, height and width of feature shape of each image in LLM. + """ + # Handle the custom "pixel_values" input obtained with `ColQwen2Processor` through unpadding + if pixel_values is not None and image_grid_thw is not None: + # NOTE: image_grid_thw: (batch_size, 3) where image_grid_thw[i] = (num_patches_h, num_patches_w, temporal_patch_size) + offsets = image_grid_thw[:, 1] * image_grid_thw[:, 2] # (num_patches_h, num_patches_w) + pixel_values = torch.cat( + [pixel_sequence[:offset] for pixel_sequence, offset in zip(pixel_values, offsets)], + dim=0, + ) # (num_patches_h * num_patches_w, pixel_values) + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + position_ids, rope_deltas = self.vlm.model.get_rope_index( + input_ids=input_ids, + image_grid_thw=image_grid_thw, + video_grid_thw=None, + attention_mask=attention_mask, + ) + + # Custom data preparation to fix an issue with the gradient flow when training with multiple GPUs. 
+ if inputs_embeds is None: + inputs_embeds = self.vlm.language_model.embed_tokens(input_ids) + + if pixel_values is not None: + pixel_values = pixel_values.type(self.vlm.visual.get_dtype()) + image_embeds = self.vlm.visual(pixel_values, grid_thw=image_grid_thw) + image_mask = ( + (input_ids == self.config.vlm_config.image_token_id).unsqueeze(-1).expand_as(inputs_embeds) + ) + image_embeds = image_embeds.to(inputs_embeds.device, inputs_embeds.dtype) + inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds) + + vlm_output = self.vlm.model( + input_ids=None, + position_ids=position_ids, + attention_mask=attention_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + ) + + vlm_hidden_states = vlm_output.hidden_states if output_hidden_states else None + + last_hidden_states = vlm_output[0] # (batch_size, sequence_length, hidden_size) + proj_dtype = self.embedding_proj_layer.weight.dtype + embeddings = self.embedding_proj_layer(last_hidden_states.to(proj_dtype)) # (batch_size, sequence_length, dim) + + # L2 normalization + embeddings = embeddings / embeddings.norm(dim=-1, keepdim=True) # (batch_size, sequence_length, dim) + if attention_mask is not None: + embeddings = embeddings * attention_mask.unsqueeze(-1) # (batch_size, sequence_length, dim) + + return ColQwen2ForRetrievalOutput( + embeddings=embeddings, + past_key_values=vlm_output.past_key_values, + hidden_states=vlm_hidden_states, + attentions=vlm_output.attentions, + ) + + def get_input_embeddings(self): + return self.vlm.get_input_embeddings() + + def set_input_embeddings(self, value): + self.vlm.set_input_embeddings(value) + + def get_output_embeddings(self): + return self.vlm.get_output_embeddings() + + def set_output_embeddings(self, new_embeddings): + self.vlm.set_output_embeddings(new_embeddings) + + def tie_weights(self): + return self.vlm.tie_weights() + + def resize_token_embeddings( + self, + new_num_tokens: Optional[int] = None, + pad_to_multiple_of: Optional[int] = None, + mean_resizing: bool = True, + ) -> nn.Embedding: + model_embeds = self.vlm.resize_token_embeddings( + new_num_tokens=new_num_tokens, + pad_to_multiple_of=pad_to_multiple_of, + mean_resizing=mean_resizing, + ) + + self.config.vlm_config.text_config.vocab_size = model_embeds.num_embeddings + self.config.vlm_config.vocab_size = model_embeds.num_embeddings + self.vlm.vocab_size = model_embeds.num_embeddings + self.vocab_size = model_embeds.num_embeddings + + return model_embeds + + +__all__ = ["ColQwen2ForRetrieval", "ColQwen2PreTrainedModel"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/colqwen2/modular_colqwen2.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/colqwen2/modular_colqwen2.py new file mode 100644 index 0000000000000000000000000000000000000000..f3ae79abf6fa2e3825a84b5c65e6e6fc416caa1d --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/colqwen2/modular_colqwen2.py @@ -0,0 +1,410 @@ +# coding=utf-8 +# Copyright 2025 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
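The retrieval head in the forward pass above is just a linear projection followed by per-token L2 normalization and masking of padded positions. A toy sketch with made-up dimensions (the real values come from the config):

```python
import torch

hidden_size, embedding_dim = 1536, 128                  # illustrative sizes
last_hidden_states = torch.randn(2, 50, hidden_size)    # (batch, seq_len, hidden_size)
attention_mask = torch.ones(2, 50)

embedding_proj_layer = torch.nn.Linear(hidden_size, embedding_dim)
embeddings = embedding_proj_layer(last_hidden_states)            # (2, 50, 128)
embeddings = embeddings / embeddings.norm(dim=-1, keepdim=True)  # unit-norm token vectors
embeddings = embeddings * attention_mask.unsqueeze(-1)           # zero out padding tokens
```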
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import Optional, Union + +from ...cache_utils import Cache +from ...feature_extraction_utils import BatchFeature +from ...image_utils import ImageInput, is_valid_image +from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack +from ...tokenization_utils_base import PreTokenizedInput, TextInput +from ...utils import ModelOutput, auto_docstring, can_return_tuple, is_torch_available, logging +from ..colpali.modeling_colpali import ColPaliForRetrieval, ColPaliPreTrainedModel +from ..colpali.processing_colpali import ColPaliProcessor +from .configuration_colqwen2 import ColQwen2Config + + +if is_torch_available(): + import torch + +logger = logging.get_logger(__name__) + + +class ColQwen2ProcessorKwargs(ProcessingKwargs, total=False): + _defaults = { + "text_kwargs": { + "padding": "longest", + }, + "images_kwargs": { + "data_format": "channels_first", + "do_convert_rgb": True, + }, + "common_kwargs": {"return_tensors": "pt"}, + } + + +class ColQwen2Processor(ColPaliProcessor): + r""" + Constructs a ColQwen2 processor which wraps a Qwen2VLProcessor and special methods to process images and queries, as + well as to compute the late-interaction retrieval score. + + [`ColQwen2Processor`] offers all the functionalities of [`Qwen2VLProcessor`]. See the [`~Qwen2VLProcessor.__call__`] + for more information. + + Args: + image_processor ([`Qwen2VLImageProcessor`], *optional*): + The image processor is a required input. + tokenizer ([`Qwen2TokenizerFast`], *optional*): + The tokenizer is a required input. + chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages + in a chat into a tokenizable string. + visual_prompt_prefix (`str`, *optional*): A string that gets tokenized and prepended to the image tokens. + query_prefix (`str`, *optional*): A prefix to be used for the query. 
+ """ + + image_processor_class = "AutoImageProcessor" + tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast") + + def __init__( + self, + image_processor=None, + tokenizer=None, + chat_template=None, + visual_prompt_prefix: Optional[str] = None, + query_prefix: Optional[str] = None, + **kwargs, + ): + ProcessorMixin.__init__(self, image_processor, tokenizer, chat_template=chat_template) + self.image_token = "<|image_pad|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token + self.video_token = "<|video_pad|>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token + + if visual_prompt_prefix is None: + visual_prompt_prefix = "<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe the image.<|im_end|><|endoftext|>" + self.visual_prompt_prefix = visual_prompt_prefix + + if query_prefix is None: + query_prefix = "Query: " + self.query_prefix = query_prefix + + def __call__( + self, + images: Optional[ImageInput] = None, + text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, + audio=None, + videos=None, + **kwargs: Unpack[ColQwen2ProcessorKwargs], + ) -> BatchFeature: + """ + Main method to prepare for the model either (1) one or several texts, either (2) one or several image(s). This method is a custom + wrapper around the Qwen2VLProcessor's [`~Qwen2VLProcessor.__call__`] method adapted for the ColQwen2 model. It cannot process + both text and images at the same time. + + When preparing the the text(s), this method forwards the `text` and `kwargs` arguments to Qwen2TokenizerFast's + [`~Qwen2TokenizerFast.__call__`]. + When preparing the the image(s), this method forwards the `images` and `kwargs` arguments to Qwen2VLImageProcessor's + [`~Qwen2VLImageProcessor.__call__`]. + Please refer to the doctsring of the above two methods for more information. + + Args: + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a + number of channels, H and W are image height and width. + text (`str`, `list[str]`, `list[list[str]]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. 
+ """ + output_kwargs = self._merge_kwargs( + ColQwen2ProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + suffix = output_kwargs["text_kwargs"].pop("suffix", None) + + return_token_type_ids = suffix is not None + + if text is None and images is None: + raise ValueError("Either text or images must be provided") + if text is not None and images is not None: + raise ValueError("Only one of text or images can be processed at a time") + + if images is not None: + if is_valid_image(images): + images = [images] + elif isinstance(images, list) and is_valid_image(images[0]): + pass + elif not (isinstance(images, list) and isinstance(images[0], list) and is_valid_image(images[0][0])): + raise ValueError("images must be an image, list of images or list of list of images") + + texts_doc = [self.visual_prompt_prefix] * len(images) + + image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"]) + image_grid_thw = image_inputs["image_grid_thw"] + + if image_grid_thw is not None: + merge_length = self.image_processor.merge_size**2 + index = 0 + for i in range(len(texts_doc)): + while self.image_token in texts_doc[i]: + texts_doc[i] = texts_doc[i].replace( + self.image_token, "<|placeholder|>" * (image_grid_thw[index].prod() // merge_length), 1 + ) + index += 1 + texts_doc[i] = texts_doc[i].replace("<|placeholder|>", self.image_token) + + text_inputs = self.tokenizer( + texts_doc, + return_token_type_ids=False, + **output_kwargs["text_kwargs"], + ) + + return_data = BatchFeature(data={**text_inputs, **image_inputs}) + + # NOTE: The following adjustment ensures correct behavior with DDP on multiple GPUs. + offsets = return_data["image_grid_thw"][:, 1] * return_data["image_grid_thw"][:, 2] # (batch_size,) + + # Split the pixel_values tensor into a list of tensors, one per image + pixel_values = list( + torch.split(return_data["pixel_values"], offsets.tolist()) + ) # [(num_patches_image_0, pixel_values), ..., (num_patches_image_n, pixel_values)] + + # Pad the list of pixel_value tensors to the same length along the sequence dimension + return_data["pixel_values"] = torch.nn.utils.rnn.pad_sequence( + pixel_values, batch_first=True + ) # (batch_size, max_num_patches, pixel_values) + + if return_token_type_ids: + labels = return_data["input_ids"].masked_fill(return_data["token_type_ids"] == 0, -100) + return_data.update({"labels": labels}) + + return return_data + + elif text is not None: + if isinstance(text, str): + text = [text] + elif not (isinstance(text, list) and isinstance(text[0], str)): + raise ValueError("Text must be a string or a list of strings") + + if suffix is None: + suffix = self.query_augmentation_token * 10 + + texts_query: list[str] = [] + + for query in text: + augmented_query = self.query_prefix + query + suffix + texts_query.append(augmented_query) + + batch_query = self.tokenizer( + texts_query, + return_token_type_ids=False, + **output_kwargs["text_kwargs"], + ) + + return batch_query + + def _get_num_multimodal_tokens(self, image_sizes=None, **kwargs): + """ + Computes the number of placeholder tokens needed for multimodal inputs with the given sizes. + Args: + image_sizes (`list[list[int]]`, *optional*): + The input sizes formatted as (height, width) per each image. + Returns: + `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided + input modalities, along with other useful data. 
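A small sketch of the image-token bookkeeping used above: the single `<|image_pad|>` placeholder in the visual prompt is expanded to one token per merged patch, which is also the count that `_get_num_multimodal_tokens` reports (numbers are illustrative):

```python
import torch

image_grid_thw = torch.tensor([[1, 16, 16]])  # (t, h, w) patch grid for one image
merge_size = 2                                # Qwen2-VL spatial merge size
merge_length = merge_size**2

num_image_tokens = int(image_grid_thw[0].prod()) // merge_length  # 256 patches -> 64 tokens
prompt = "<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe the image.<|im_end|>"
prompt = prompt.replace("<|image_pad|>", "<|image_pad|>" * num_image_tokens, 1)
```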
+ """ + + vision_data = {} + if image_sizes is not None: + images_kwargs = ColQwen2ProcessorKwargs._defaults.get("images_kwargs", {}) + images_kwargs.update(kwargs) + merge_size = images_kwargs.get("merge_size", None) or self.image_processor.merge_size + + num_image_patches = [ + self.image_processor.get_number_of_image_patches(*image_size, images_kwargs) + for image_size in image_sizes + ] + num_image_tokens = [(num_patches // merge_size**2) for num_patches in num_image_patches] + vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches}) + + return MultiModalData(**vision_data) + + @property + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + + # ColQwen doesn't process videos. Make a copy of list when removing + # otherwise `self.feature_extractor.model_input_names` is also modified + image_processor_input_names = [ + name for name in image_processor_input_names if name not in ["pixel_values_videos", "video_grid_thw"] + ] + return tokenizer_input_names + image_processor_input_names + + +class ColQwen2PreTrainedModel(ColPaliPreTrainedModel): + pass + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for ColQwen2 embeddings output. + """ +) +class ColQwen2ForRetrievalOutput(ModelOutput): + r""" + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + The embeddings of the model. + past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache). + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + """ + + loss: Optional[torch.FloatTensor] = None + embeddings: Optional[torch.Tensor] = None + past_key_values: Optional[Cache] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + + +@auto_docstring( + custom_intro=""" + Following the ColPali approach, ColQwen2 leverages VLMs to construct efficient multi-vector embeddings directly + from document images (“screenshots”) for document retrieval. The model is trained to maximize the similarity + between these document embeddings and the corresponding query embeddings, using the late interaction method + introduced in ColBERT. + + Using ColQwen2 removes the need for potentially complex and brittle layout recognition and OCR pipelines with + a single model that can take into account both the textual and visual content (layout, charts, ...) of a document. + + ColQwen2 is part of the ColVision model family, which was introduced with ColPali in the following paper: + [*ColPali: Efficient Document Retrieval with Vision Language Models*](https://huggingface.co/papers/2407.01449). 
+ """ +) +class ColQwen2ForRetrieval(ColPaliForRetrieval): + _checkpoint_conversion_mapping = {} + + def __init__(self, config: ColQwen2Config): + super().__init__(config) + del self._tied_weights_keys + self._tied_weights_keys = [f"vlm.{k}" for k in (self.vlm._tied_weights_keys or [])] + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + labels: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + pixel_values: Optional[torch.Tensor] = None, + image_grid_thw: Optional[torch.LongTensor] = None, + cache_position: Optional[torch.LongTensor] = None, + ) -> ColQwen2ForRetrievalOutput: + r""" + image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*): + The temporal, height and width of feature shape of each image in LLM. + """ + # Handle the custom "pixel_values" input obtained with `ColQwen2Processor` through unpadding + if pixel_values is not None and image_grid_thw is not None: + # NOTE: image_grid_thw: (batch_size, 3) where image_grid_thw[i] = (num_patches_h, num_patches_w, temporal_patch_size) + offsets = image_grid_thw[:, 1] * image_grid_thw[:, 2] # (num_patches_h, num_patches_w) + pixel_values = torch.cat( + [pixel_sequence[:offset] for pixel_sequence, offset in zip(pixel_values, offsets)], + dim=0, + ) # (num_patches_h * num_patches_w, pixel_values) + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + position_ids, rope_deltas = self.vlm.model.get_rope_index( + input_ids=input_ids, + image_grid_thw=image_grid_thw, + video_grid_thw=None, + attention_mask=attention_mask, + ) + + # Custom data preparation to fix an issue with the gradient flow when training with multiple GPUs. 
+ if inputs_embeds is None: + inputs_embeds = self.vlm.language_model.embed_tokens(input_ids) + + if pixel_values is not None: + pixel_values = pixel_values.type(self.vlm.visual.get_dtype()) + image_embeds = self.vlm.visual(pixel_values, grid_thw=image_grid_thw) + image_mask = ( + (input_ids == self.config.vlm_config.image_token_id).unsqueeze(-1).expand_as(inputs_embeds) + ) + image_embeds = image_embeds.to(inputs_embeds.device, inputs_embeds.dtype) + inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds) + + vlm_output = self.vlm.model( + input_ids=None, + position_ids=position_ids, + attention_mask=attention_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + ) + + vlm_hidden_states = vlm_output.hidden_states if output_hidden_states else None + + last_hidden_states = vlm_output[0] # (batch_size, sequence_length, hidden_size) + proj_dtype = self.embedding_proj_layer.weight.dtype + embeddings = self.embedding_proj_layer(last_hidden_states.to(proj_dtype)) # (batch_size, sequence_length, dim) + + # L2 normalization + embeddings = embeddings / embeddings.norm(dim=-1, keepdim=True) # (batch_size, sequence_length, dim) + if attention_mask is not None: + embeddings = embeddings * attention_mask.unsqueeze(-1) # (batch_size, sequence_length, dim) + + return ColQwen2ForRetrievalOutput( + embeddings=embeddings, + past_key_values=vlm_output.past_key_values, + hidden_states=vlm_hidden_states, + attentions=vlm_output.attentions, + ) + + +__all__ = [ + "ColQwen2ForRetrieval", + "ColQwen2PreTrainedModel", + "ColQwen2Processor", +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/colqwen2/processing_colqwen2.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/colqwen2/processing_colqwen2.py new file mode 100644 index 0000000000000000000000000000000000000000..1609f6e182dad154f3b85a2a5835dc027024550e --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/colqwen2/processing_colqwen2.py @@ -0,0 +1,407 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/colqwen2/modular_colqwen2.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_colqwen2.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2025 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
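To make the DDP-related reshaping concrete, here is a toy round trip of the padding performed by `ColQwen2Processor` and the unpadding performed in `ColQwen2ForRetrieval.forward` (the feature dimension and patch grids are made up):

```python
import torch

image_grid_thw = torch.tensor([[1, 4, 6], [1, 8, 6]])  # (num_images, 3): (t, h, w) patch grids
offsets = image_grid_thw[:, 1] * image_grid_thw[:, 2]   # patches per image: [24, 48]

flat = torch.randn(int(offsets.sum()), 8)               # flat patch features from the image processor
per_image = list(torch.split(flat, offsets.tolist()))   # one tensor per image
padded = torch.nn.utils.rnn.pad_sequence(per_image, batch_first=True)  # (2, 48, 8)

# The model strips the padding again before the vision tower sees the patches:
unpadded = torch.cat([seq[:off] for seq, off in zip(padded, offsets)], dim=0)
assert torch.equal(unpadded, flat)
```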
+ +from typing import Optional, Union + +from ...feature_extraction_utils import BatchFeature +from ...image_utils import ImageInput, is_valid_image +from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack +from ...tokenization_utils_base import PreTokenizedInput, TextInput +from ...utils import is_torch_available + + +if is_torch_available(): + import torch + + +class ColQwen2ProcessorKwargs(ProcessingKwargs, total=False): + _defaults = { + "text_kwargs": { + "padding": "longest", + }, + "images_kwargs": { + "data_format": "channels_first", + "do_convert_rgb": True, + }, + "common_kwargs": {"return_tensors": "pt"}, + } + + +class ColQwen2Processor(ProcessorMixin): + r""" + Constructs a ColQwen2 processor which wraps a Qwen2VLProcessor and special methods to process images and queries, as + well as to compute the late-interaction retrieval score. + + [`ColQwen2Processor`] offers all the functionalities of [`Qwen2VLProcessor`]. See the [`~Qwen2VLProcessor.__call__`] + for more information. + + Args: + image_processor ([`Qwen2VLImageProcessor`], *optional*): + The image processor is a required input. + tokenizer ([`Qwen2TokenizerFast`], *optional*): + The tokenizer is a required input. + chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages + in a chat into a tokenizable string. + visual_prompt_prefix (`str`, *optional*): A string that gets tokenized and prepended to the image tokens. + query_prefix (`str`, *optional*): A prefix to be used for the query. + """ + + attributes = ["image_processor", "tokenizer"] + + image_processor_class = "AutoImageProcessor" + tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast") + + def __init__( + self, + image_processor=None, + tokenizer=None, + chat_template=None, + visual_prompt_prefix: Optional[str] = None, + query_prefix: Optional[str] = None, + **kwargs, + ): + super().__init__(image_processor, tokenizer, chat_template=chat_template) + self.image_token = "<|image_pad|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token + self.video_token = "<|video_pad|>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token + + if visual_prompt_prefix is None: + visual_prompt_prefix = "<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe the image.<|im_end|><|endoftext|>" + self.visual_prompt_prefix = visual_prompt_prefix + + if query_prefix is None: + query_prefix = "Query: " + self.query_prefix = query_prefix + + def __call__( + self, + images: Optional[ImageInput] = None, + text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, + audio=None, + videos=None, + **kwargs: Unpack[ColQwen2ProcessorKwargs], + ) -> BatchFeature: + """ + Main method to prepare for the model either (1) one or several texts, either (2) one or several image(s). This method is a custom + wrapper around the Qwen2VLProcessor's [`~Qwen2VLProcessor.__call__`] method adapted for the ColQwen2 model. It cannot process + both text and images at the same time. + + When preparing the the text(s), this method forwards the `text` and `kwargs` arguments to Qwen2TokenizerFast's + [`~Qwen2TokenizerFast.__call__`]. + When preparing the the image(s), this method forwards the `images` and `kwargs` arguments to Qwen2VLImageProcessor's + [`~Qwen2VLImageProcessor.__call__`]. + Please refer to the doctsring of the above two methods for more information. 
+ + Args: + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a + number of channels, H and W are image height and width. + text (`str`, `list[str]`, `list[list[str]]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + """ + output_kwargs = self._merge_kwargs( + ColQwen2ProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + suffix = output_kwargs["text_kwargs"].pop("suffix", None) + + return_token_type_ids = suffix is not None + + if text is None and images is None: + raise ValueError("Either text or images must be provided") + if text is not None and images is not None: + raise ValueError("Only one of text or images can be processed at a time") + + if images is not None: + if is_valid_image(images): + images = [images] + elif isinstance(images, list) and is_valid_image(images[0]): + pass + elif not (isinstance(images, list) and isinstance(images[0], list) and is_valid_image(images[0][0])): + raise ValueError("images must be an image, list of images or list of list of images") + + texts_doc = [self.visual_prompt_prefix] * len(images) + + image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"]) + image_grid_thw = image_inputs["image_grid_thw"] + + if image_grid_thw is not None: + merge_length = self.image_processor.merge_size**2 + index = 0 + for i in range(len(texts_doc)): + while self.image_token in texts_doc[i]: + texts_doc[i] = texts_doc[i].replace( + self.image_token, "<|placeholder|>" * (image_grid_thw[index].prod() // merge_length), 1 + ) + index += 1 + texts_doc[i] = texts_doc[i].replace("<|placeholder|>", self.image_token) + + text_inputs = self.tokenizer( + texts_doc, + return_token_type_ids=False, + **output_kwargs["text_kwargs"], + ) + + return_data = BatchFeature(data={**text_inputs, **image_inputs}) + + # NOTE: The following adjustment ensures correct behavior with DDP on multiple GPUs. 
+ offsets = return_data["image_grid_thw"][:, 1] * return_data["image_grid_thw"][:, 2] # (batch_size,) + + # Split the pixel_values tensor into a list of tensors, one per image + pixel_values = list( + torch.split(return_data["pixel_values"], offsets.tolist()) + ) # [(num_patches_image_0, pixel_values), ..., (num_patches_image_n, pixel_values)] + + # Pad the list of pixel_value tensors to the same length along the sequence dimension + return_data["pixel_values"] = torch.nn.utils.rnn.pad_sequence( + pixel_values, batch_first=True + ) # (batch_size, max_num_patches, pixel_values) + + if return_token_type_ids: + labels = return_data["input_ids"].masked_fill(return_data["token_type_ids"] == 0, -100) + return_data.update({"labels": labels}) + + return return_data + + elif text is not None: + if isinstance(text, str): + text = [text] + elif not (isinstance(text, list) and isinstance(text[0], str)): + raise ValueError("Text must be a string or a list of strings") + + if suffix is None: + suffix = self.query_augmentation_token * 10 + + texts_query: list[str] = [] + + for query in text: + augmented_query = self.query_prefix + query + suffix + texts_query.append(augmented_query) + + batch_query = self.tokenizer( + texts_query, + return_token_type_ids=False, + **output_kwargs["text_kwargs"], + ) + + return batch_query + + def _get_num_multimodal_tokens(self, image_sizes=None, **kwargs): + """ + Computes the number of placeholder tokens needed for multimodal inputs with the given sizes. + Args: + image_sizes (`list[list[int]]`, *optional*): + The input sizes formatted as (height, width) per each image. + Returns: + `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided + input modalities, along with other useful data. + """ + + vision_data = {} + if image_sizes is not None: + images_kwargs = ColQwen2ProcessorKwargs._defaults.get("images_kwargs", {}) + images_kwargs.update(kwargs) + merge_size = images_kwargs.get("merge_size", None) or self.image_processor.merge_size + + num_image_patches = [ + self.image_processor.get_number_of_image_patches(*image_size, images_kwargs) + for image_size in image_sizes + ] + num_image_tokens = [(num_patches // merge_size**2) for num_patches in num_image_patches] + vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches}) + + return MultiModalData(**vision_data) + + @property + def query_augmentation_token(self) -> str: + """ + Return the query augmentation token. + + Query augmentation buffers are used as reasoning buffers during inference. + """ + return self.tokenizer.pad_token + + def process_images( + self, + images: Optional[ImageInput] = None, + **kwargs: Unpack[ColQwen2ProcessorKwargs], + ) -> BatchFeature: + """ + Prepare for the model one or several image(s). This method is a wrapper around the `__call__` method of the ColQwen2Processor's + [`ColQwen2Processor.__call__`]. + + This method forwards the `images` and `kwargs` arguments to the image processor. + + Args: + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a + number of channels, H and W are image height and width. + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. 
Acceptable values are: + + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + """ + return self.__call__(images=images, **kwargs) + + def process_queries( + self, + text: Union[TextInput, list[TextInput]], + **kwargs: Unpack[ColQwen2ProcessorKwargs], + ) -> BatchFeature: + """ + Prepare for the model one or several texts. This method is a wrapper around the `__call__` method of the ColQwen2Processor's + [`ColQwen2Processor.__call__`]. + + This method forwards the `text` and `kwargs` arguments to the tokenizer. + + Args: + text (`str`, `list[str]`, `list[list[str]]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + """ + return self.__call__(text=text, **kwargs) + + def score_retrieval( + self, + query_embeddings: Union["torch.Tensor", list["torch.Tensor"]], + passage_embeddings: Union["torch.Tensor", list["torch.Tensor"]], + batch_size: int = 128, + output_dtype: Optional["torch.dtype"] = None, + output_device: Union["torch.device", str] = "cpu", + ) -> "torch.Tensor": + """ + Compute the late-interaction/MaxSim score (ColBERT-like) for the given multi-vector + query embeddings (`qs`) and passage embeddings (`ps`). For ColQwen2, a passage is the + image of a document page. + + Because the embedding tensors are multi-vector and can thus have different shapes, they + should be fed as: + (1) a list of tensors, where the i-th tensor is of shape (sequence_length_i, embedding_dim) + (2) a single tensor of shape (n_passages, max_sequence_length, embedding_dim) -> usually + obtained by padding the list of tensors. + + Args: + query_embeddings (`Union[torch.Tensor, list[torch.Tensor]`): Query embeddings. + passage_embeddings (`Union[torch.Tensor, list[torch.Tensor]`): Passage embeddings. + batch_size (`int`, *optional*, defaults to 128): Batch size for computing scores. + output_dtype (`torch.dtype`, *optional*, defaults to `torch.float32`): The dtype of the output tensor. + If `None`, the dtype of the input embeddings is used. 
+ output_device (`torch.device` or `str`, *optional*, defaults to "cpu"): The device of the output tensor. + + Returns: + `torch.Tensor`: A tensor of shape `(n_queries, n_passages)` containing the scores. The score + tensor is saved on the "cpu" device. + """ + + if len(query_embeddings) == 0: + raise ValueError("No queries provided") + if len(passage_embeddings) == 0: + raise ValueError("No passages provided") + + if query_embeddings[0].device != passage_embeddings[0].device: + raise ValueError("Queries and passages must be on the same device") + + if query_embeddings[0].dtype != passage_embeddings[0].dtype: + raise ValueError("Queries and passages must have the same dtype") + + if output_dtype is None: + output_dtype = query_embeddings[0].dtype + + scores: list[torch.Tensor] = [] + + for i in range(0, len(query_embeddings), batch_size): + batch_scores: list[torch.Tensor] = [] + batch_queries = torch.nn.utils.rnn.pad_sequence( + query_embeddings[i : i + batch_size], batch_first=True, padding_value=0 + ) + for j in range(0, len(passage_embeddings), batch_size): + batch_passages = torch.nn.utils.rnn.pad_sequence( + passage_embeddings[j : j + batch_size], batch_first=True, padding_value=0 + ) + batch_scores.append( + torch.einsum("bnd,csd->bcns", batch_queries, batch_passages).max(dim=3)[0].sum(dim=2) + ) + scores.append(torch.cat(batch_scores, dim=1).to(output_dtype).to(output_device)) + + return torch.cat(scores, dim=0) + + @property + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + + # ColQwen doesn't process videos. Make a copy of list when removing + # otherwise `self.feature_extractor.model_input_names` is also modified + image_processor_input_names = [ + name for name in image_processor_input_names if name not in ["pixel_values_videos", "video_grid_thw"] + ] + return tokenizer_input_names + image_processor_input_names + + +__all__ = ["ColQwen2Processor"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/convbert/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/convbert/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..670a7d6f47647ff58d72cc461618bbb2c14f9565 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/convbert/__init__.py @@ -0,0 +1,30 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
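A minimal, self-contained sketch of the MaxSim late-interaction score that score_retrieval computes above, using dummy multi-vector embeddings (every query token takes its best similarity over the passage tokens, and these maxima are summed over the query tokens):

import torch

# Dummy embeddings: 2 queries of 3 tokens, 4 passages of 6 tokens, embedding dim 8
queries = torch.randn(2, 3, 8)
passages = torch.randn(4, 6, 8)

sims = torch.einsum("bnd,csd->bcns", queries, passages)  # (2, 4, 3, 6) token-level dot products
scores = sims.max(dim=3).values.sum(dim=2)               # MaxSim over passage tokens, sum over query tokens
print(scores.shape)                                      # torch.Size([2, 4])

The batched loop in score_retrieval does the same computation in chunks of batch_size, so large query/passage sets do not have to be scored in a single einsum.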
+from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_convbert import * + from .modeling_convbert import * + from .modeling_tf_convbert import * + from .tokenization_convbert import * + from .tokenization_convbert_fast import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/convbert/configuration_convbert.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/convbert/configuration_convbert.py new file mode 100644 index 0000000000000000000000000000000000000000..a107c7209e9784143433c90ac4fc27a114965290 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/convbert/configuration_convbert.py @@ -0,0 +1,160 @@ +# coding=utf-8 +# Copyright The HuggingFace team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""ConvBERT model configuration""" + +from collections import OrderedDict +from collections.abc import Mapping + +from ...configuration_utils import PretrainedConfig +from ...onnx import OnnxConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class ConvBertConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`ConvBertModel`]. It is used to instantiate an + ConvBERT model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the ConvBERT + [YituTech/conv-bert-base](https://huggingface.co/YituTech/conv-bert-base) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the ConvBERT model. Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling [`ConvBertModel`] or [`TFConvBertModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. 
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling [`ConvBertModel`] or [`TFConvBertModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + head_ratio (`int`, *optional*, defaults to 2): + Ratio gamma to reduce the number of attention heads. + num_groups (`int`, *optional*, defaults to 1): + The number of groups for grouped linear layers for ConvBert model + conv_kernel_size (`int`, *optional*, defaults to 9): + The size of the convolutional kernel. + classifier_dropout (`float`, *optional*): + The dropout ratio for the classification head. + + Example: + + ```python + >>> from transformers import ConvBertConfig, ConvBertModel + + >>> # Initializing a ConvBERT convbert-base-uncased style configuration + >>> configuration = ConvBertConfig() + + >>> # Initializing a model (with random weights) from the convbert-base-uncased style configuration + >>> model = ConvBertModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "convbert" + + def __init__( + self, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + embedding_size=768, + head_ratio=2, + conv_kernel_size=9, + num_groups=1, + classifier_dropout=None, + **kwargs, + ): + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + **kwargs, + ) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.embedding_size = embedding_size + self.head_ratio = head_ratio + self.conv_kernel_size = conv_kernel_size + self.num_groups = num_groups + self.classifier_dropout = classifier_dropout + + +# Copied from transformers.models.bert.configuration_bert.BertOnnxConfig +class ConvBertOnnxConfig(OnnxConfig): + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + if self.task == "multiple-choice": + dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"} + else: + dynamic_axis = {0: "batch", 1: "sequence"} + return OrderedDict( + [ + ("input_ids", dynamic_axis), + 
("attention_mask", dynamic_axis), + ("token_type_ids", dynamic_axis), + ] + ) + + +__all__ = ["ConvBertConfig", "ConvBertOnnxConfig"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/convbert/modeling_convbert.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/convbert/modeling_convbert.py new file mode 100644 index 0000000000000000000000000000000000000000..080b93fa92a65b9195c1c4365915898d2f025ba9 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/convbert/modeling_convbert.py @@ -0,0 +1,1334 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch ConvBERT model.""" + +import math +import os +from operator import attrgetter +from typing import Callable, Optional, Union + +import torch +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ...activations import ACT2FN, get_activation +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import ( + BaseModelOutputWithCrossAttentions, + MaskedLMOutput, + MultipleChoiceModelOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import ( + auto_docstring, + logging, +) +from .configuration_convbert import ConvBertConfig + + +logger = logging.get_logger(__name__) + + +def load_tf_weights_in_convbert(model, config, tf_checkpoint_path): + """Load tf checkpoints in a pytorch model.""" + try: + import tensorflow as tf + except ImportError: + logger.error( + "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions." 
+ ) + raise + tf_path = os.path.abspath(tf_checkpoint_path) + logger.info(f"Converting TensorFlow checkpoint from {tf_path}") + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + tf_data = {} + for name, shape in init_vars: + logger.info(f"Loading TF weight {name} with shape {shape}") + array = tf.train.load_variable(tf_path, name) + tf_data[name] = array + + param_mapping = { + "embeddings.word_embeddings.weight": "electra/embeddings/word_embeddings", + "embeddings.position_embeddings.weight": "electra/embeddings/position_embeddings", + "embeddings.token_type_embeddings.weight": "electra/embeddings/token_type_embeddings", + "embeddings.LayerNorm.weight": "electra/embeddings/LayerNorm/gamma", + "embeddings.LayerNorm.bias": "electra/embeddings/LayerNorm/beta", + "embeddings_project.weight": "electra/embeddings_project/kernel", + "embeddings_project.bias": "electra/embeddings_project/bias", + } + if config.num_groups > 1: + group_dense_name = "g_dense" + else: + group_dense_name = "dense" + + for j in range(config.num_hidden_layers): + param_mapping[f"encoder.layer.{j}.attention.self.query.weight"] = ( + f"electra/encoder/layer_{j}/attention/self/query/kernel" + ) + param_mapping[f"encoder.layer.{j}.attention.self.query.bias"] = ( + f"electra/encoder/layer_{j}/attention/self/query/bias" + ) + param_mapping[f"encoder.layer.{j}.attention.self.key.weight"] = ( + f"electra/encoder/layer_{j}/attention/self/key/kernel" + ) + param_mapping[f"encoder.layer.{j}.attention.self.key.bias"] = ( + f"electra/encoder/layer_{j}/attention/self/key/bias" + ) + param_mapping[f"encoder.layer.{j}.attention.self.value.weight"] = ( + f"electra/encoder/layer_{j}/attention/self/value/kernel" + ) + param_mapping[f"encoder.layer.{j}.attention.self.value.bias"] = ( + f"electra/encoder/layer_{j}/attention/self/value/bias" + ) + param_mapping[f"encoder.layer.{j}.attention.self.key_conv_attn_layer.depthwise.weight"] = ( + f"electra/encoder/layer_{j}/attention/self/conv_attn_key/depthwise_kernel" + ) + param_mapping[f"encoder.layer.{j}.attention.self.key_conv_attn_layer.pointwise.weight"] = ( + f"electra/encoder/layer_{j}/attention/self/conv_attn_key/pointwise_kernel" + ) + param_mapping[f"encoder.layer.{j}.attention.self.key_conv_attn_layer.bias"] = ( + f"electra/encoder/layer_{j}/attention/self/conv_attn_key/bias" + ) + param_mapping[f"encoder.layer.{j}.attention.self.conv_kernel_layer.weight"] = ( + f"electra/encoder/layer_{j}/attention/self/conv_attn_kernel/kernel" + ) + param_mapping[f"encoder.layer.{j}.attention.self.conv_kernel_layer.bias"] = ( + f"electra/encoder/layer_{j}/attention/self/conv_attn_kernel/bias" + ) + param_mapping[f"encoder.layer.{j}.attention.self.conv_out_layer.weight"] = ( + f"electra/encoder/layer_{j}/attention/self/conv_attn_point/kernel" + ) + param_mapping[f"encoder.layer.{j}.attention.self.conv_out_layer.bias"] = ( + f"electra/encoder/layer_{j}/attention/self/conv_attn_point/bias" + ) + param_mapping[f"encoder.layer.{j}.attention.output.dense.weight"] = ( + f"electra/encoder/layer_{j}/attention/output/dense/kernel" + ) + param_mapping[f"encoder.layer.{j}.attention.output.LayerNorm.weight"] = ( + f"electra/encoder/layer_{j}/attention/output/LayerNorm/gamma" + ) + param_mapping[f"encoder.layer.{j}.attention.output.dense.bias"] = ( + f"electra/encoder/layer_{j}/attention/output/dense/bias" + ) + param_mapping[f"encoder.layer.{j}.attention.output.LayerNorm.bias"] = ( + f"electra/encoder/layer_{j}/attention/output/LayerNorm/beta" + ) + 
param_mapping[f"encoder.layer.{j}.intermediate.dense.weight"] = ( + f"electra/encoder/layer_{j}/intermediate/{group_dense_name}/kernel" + ) + param_mapping[f"encoder.layer.{j}.intermediate.dense.bias"] = ( + f"electra/encoder/layer_{j}/intermediate/{group_dense_name}/bias" + ) + param_mapping[f"encoder.layer.{j}.output.dense.weight"] = ( + f"electra/encoder/layer_{j}/output/{group_dense_name}/kernel" + ) + param_mapping[f"encoder.layer.{j}.output.dense.bias"] = ( + f"electra/encoder/layer_{j}/output/{group_dense_name}/bias" + ) + param_mapping[f"encoder.layer.{j}.output.LayerNorm.weight"] = ( + f"electra/encoder/layer_{j}/output/LayerNorm/gamma" + ) + param_mapping[f"encoder.layer.{j}.output.LayerNorm.bias"] = f"electra/encoder/layer_{j}/output/LayerNorm/beta" + + for param in model.named_parameters(): + param_name = param[0] + retriever = attrgetter(param_name) + result = retriever(model) + tf_name = param_mapping[param_name] + value = torch.from_numpy(tf_data[tf_name]) + logger.info(f"TF: {tf_name}, PT: {param_name} ") + if tf_name.endswith("/kernel"): + if not tf_name.endswith("/intermediate/g_dense/kernel"): + if not tf_name.endswith("/output/g_dense/kernel"): + value = value.T + if tf_name.endswith("/depthwise_kernel"): + value = value.permute(1, 2, 0) # 2, 0, 1 + if tf_name.endswith("/pointwise_kernel"): + value = value.permute(2, 1, 0) # 2, 1, 0 + if tf_name.endswith("/conv_attn_key/bias"): + value = value.unsqueeze(-1) + result.data = value + return model + + +class ConvBertEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.embedding_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.embedding_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.embedding_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.embedding_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) + self.register_buffer( + "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False + ) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + ) -> torch.LongTensor: + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, :seq_length] + + # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs + # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves + # issue #5664 + if token_type_ids is None: + if hasattr(self, "token_type_ids"): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length) + 
token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + position_embeddings = self.position_embeddings(position_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + position_embeddings + token_type_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +@auto_docstring +class ConvBertPreTrainedModel(PreTrainedModel): + config: ConvBertConfig + load_tf_weights = load_tf_weights_in_convbert + base_model_prefix = "convbert" + supports_gradient_checkpointing = True + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv1d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, SeparableConv1D): + module.bias.data.zero_() + elif isinstance(module, GroupedLinearLayer): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + module.bias.data.zero_() + + +class SeparableConv1D(nn.Module): + """This class implements separable convolution, i.e. a depthwise and a pointwise layer""" + + def __init__(self, config, input_filters, output_filters, kernel_size, **kwargs): + super().__init__() + self.depthwise = nn.Conv1d( + input_filters, + input_filters, + kernel_size=kernel_size, + groups=input_filters, + padding=kernel_size // 2, + bias=False, + ) + self.pointwise = nn.Conv1d(input_filters, output_filters, kernel_size=1, bias=False) + self.bias = nn.Parameter(torch.zeros(output_filters, 1)) + + self.depthwise.weight.data.normal_(mean=0.0, std=config.initializer_range) + self.pointwise.weight.data.normal_(mean=0.0, std=config.initializer_range) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + x = self.depthwise(hidden_states) + x = self.pointwise(x) + x += self.bias + return x + + +class ConvBertSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + new_num_attention_heads = config.num_attention_heads // config.head_ratio + if new_num_attention_heads < 1: + self.head_ratio = config.num_attention_heads + self.num_attention_heads = 1 + else: + self.num_attention_heads = new_num_attention_heads + self.head_ratio = config.head_ratio + + self.conv_kernel_size = config.conv_kernel_size + if config.hidden_size % self.num_attention_heads != 0: + raise ValueError("hidden_size should be divisible by num_attention_heads") + + self.attention_head_size = (config.hidden_size // self.num_attention_heads) // 2 + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = 
nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.key_conv_attn_layer = SeparableConv1D( + config, config.hidden_size, self.all_head_size, self.conv_kernel_size + ) + self.conv_kernel_layer = nn.Linear(self.all_head_size, self.num_attention_heads * self.conv_kernel_size) + self.conv_out_layer = nn.Linear(config.hidden_size, self.all_head_size) + + self.unfold = nn.Unfold( + kernel_size=[self.conv_kernel_size, 1], padding=[int((self.conv_kernel_size - 1) / 2), 0] + ) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + batch_size, seq_length, _ = hidden_states.shape + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + if encoder_hidden_states is not None: + mixed_key_layer = self.key(encoder_hidden_states) + mixed_value_layer = self.value(encoder_hidden_states) + else: + mixed_key_layer = self.key(hidden_states) + mixed_value_layer = self.value(hidden_states) + + mixed_key_conv_attn_layer = self.key_conv_attn_layer(hidden_states.transpose(1, 2)) + mixed_key_conv_attn_layer = mixed_key_conv_attn_layer.transpose(1, 2) + + mixed_query_layer = self.query(hidden_states) + query_layer = mixed_query_layer.view( + batch_size, -1, self.num_attention_heads, self.attention_head_size + ).transpose(1, 2) + key_layer = mixed_key_layer.view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose( + 1, 2 + ) + value_layer = mixed_value_layer.view( + batch_size, -1, self.num_attention_heads, self.attention_head_size + ).transpose(1, 2) + conv_attn_layer = torch.multiply(mixed_key_conv_attn_layer, mixed_query_layer) + + conv_kernel_layer = self.conv_kernel_layer(conv_attn_layer) + conv_kernel_layer = torch.reshape(conv_kernel_layer, [-1, self.conv_kernel_size, 1]) + conv_kernel_layer = torch.softmax(conv_kernel_layer, dim=1) + + conv_out_layer = self.conv_out_layer(hidden_states) + conv_out_layer = torch.reshape(conv_out_layer, [batch_size, -1, self.all_head_size]) + conv_out_layer = conv_out_layer.transpose(1, 2).contiguous().unsqueeze(-1) + conv_out_layer = nn.functional.unfold( + conv_out_layer, + kernel_size=[self.conv_kernel_size, 1], + dilation=1, + padding=[(self.conv_kernel_size - 1) // 2, 0], + stride=1, + ) + conv_out_layer = conv_out_layer.transpose(1, 2).reshape( + batch_size, -1, self.all_head_size, self.conv_kernel_size + ) + conv_out_layer = torch.reshape(conv_out_layer, [-1, self.attention_head_size, self.conv_kernel_size]) + conv_out_layer = torch.matmul(conv_out_layer, conv_kernel_layer) + conv_out_layer = torch.reshape(conv_out_layer, [-1, self.all_head_size]) + + # Take the dot product between "query" and "key" to get the raw attention scores. 
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in ConvBertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + + conv_out = torch.reshape(conv_out_layer, [batch_size, -1, self.num_attention_heads, self.attention_head_size]) + context_layer = torch.cat([context_layer, conv_out], 2) + + # conv and context + new_context_layer_shape = context_layer.size()[:-2] + ( + self.num_attention_heads * self.attention_head_size * 2, + ) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + return outputs + + +class ConvBertSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class ConvBertAttention(nn.Module): + def __init__(self, config): + super().__init__() + self.self = ConvBertSelfAttention(config) + self.output = ConvBertSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> tuple[torch.Tensor, Optional[torch.FloatTensor]]: + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class 
GroupedLinearLayer(nn.Module): + def __init__(self, input_size, output_size, num_groups): + super().__init__() + self.input_size = input_size + self.output_size = output_size + self.num_groups = num_groups + self.group_in_dim = self.input_size // self.num_groups + self.group_out_dim = self.output_size // self.num_groups + self.weight = nn.Parameter(torch.empty(self.num_groups, self.group_in_dim, self.group_out_dim)) + self.bias = nn.Parameter(torch.empty(output_size)) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + batch_size = list(hidden_states.size())[0] + x = torch.reshape(hidden_states, [-1, self.num_groups, self.group_in_dim]) + x = x.permute(1, 0, 2) + x = torch.matmul(x, self.weight) + x = x.permute(1, 0, 2) + x = torch.reshape(x, [batch_size, -1, self.output_size]) + x = x + self.bias + return x + + +class ConvBertIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + if config.num_groups == 1: + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + else: + self.dense = GroupedLinearLayer( + input_size=config.hidden_size, output_size=config.intermediate_size, num_groups=config.num_groups + ) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class ConvBertOutput(nn.Module): + def __init__(self, config): + super().__init__() + if config.num_groups == 1: + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + else: + self.dense = GroupedLinearLayer( + input_size=config.intermediate_size, output_size=config.hidden_size, num_groups=config.num_groups + ) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class ConvBertLayer(GradientCheckpointingLayer): + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = ConvBertAttention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise TypeError(f"{self} should be used as a decoder model if cross attention is added") + self.crossattention = ConvBertAttention(config) + self.intermediate = ConvBertIntermediate(config) + self.output = ConvBertOutput(config) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> tuple[torch.Tensor, Optional[torch.FloatTensor]]: + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + ) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + if 
self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, "crossattention"): + raise AttributeError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers" + " by setting `config.add_cross_attention=True`" + ) + cross_attention_outputs = self.crossattention( + attention_output, + encoder_attention_mask, + head_mask, + encoder_hidden_states, + output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:] # add cross attentions if we output attention weights + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + outputs = (layer_output,) + outputs + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class ConvBertEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList([ConvBertLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + return_dict: Optional[bool] = True, + ) -> Union[tuple, BaseModelOutputWithCrossAttentions]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + output_attentions, + ) + hidden_states = layer_outputs[0] + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [hidden_states, all_hidden_states, all_self_attentions, all_cross_attentions] + if v is not None + ) + return BaseModelOutputWithCrossAttentions( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +class ConvBertPredictionHeadTransform(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + 
return hidden_states + + +# Copied from transformers.models.xlm.modeling_xlm.XLMSequenceSummary with XLM->ConvBert +class ConvBertSequenceSummary(nn.Module): + r""" + Compute a single vector summary of a sequence hidden states. + + Args: + config ([`ConvBertConfig`]): + The config used by the model. Relevant arguments in the config class of the model are (refer to the actual + config class of your model for the default values it uses): + + - **summary_type** (`str`) -- The method to use to make this summary. Accepted values are: + + - `"last"` -- Take the last token hidden state (like XLNet) + - `"first"` -- Take the first token hidden state (like Bert) + - `"mean"` -- Take the mean of all tokens hidden states + - `"cls_index"` -- Supply a Tensor of classification token position (GPT/GPT-2) + - `"attn"` -- Not implemented now, use multi-head attention + + - **summary_use_proj** (`bool`) -- Add a projection after the vector extraction. + - **summary_proj_to_labels** (`bool`) -- If `True`, the projection outputs to `config.num_labels` classes + (otherwise to `config.hidden_size`). + - **summary_activation** (`Optional[str]`) -- Set to `"tanh"` to add a tanh activation to the output, + another string or `None` will add no activation. + - **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and activation. + - **summary_last_dropout** (`float`)-- Optional dropout probability after the projection and activation. + """ + + def __init__(self, config: ConvBertConfig): + super().__init__() + + self.summary_type = getattr(config, "summary_type", "last") + if self.summary_type == "attn": + # We should use a standard multi-head attention module with absolute positional embedding for that. + # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276 + # We can probably just use the multi-head attention module of PyTorch >=1.1.0 + raise NotImplementedError + + self.summary = nn.Identity() + if hasattr(config, "summary_use_proj") and config.summary_use_proj: + if hasattr(config, "summary_proj_to_labels") and config.summary_proj_to_labels and config.num_labels > 0: + num_classes = config.num_labels + else: + num_classes = config.hidden_size + self.summary = nn.Linear(config.hidden_size, num_classes) + + activation_string = getattr(config, "summary_activation", None) + self.activation: Callable = get_activation(activation_string) if activation_string else nn.Identity() + + self.first_dropout = nn.Identity() + if hasattr(config, "summary_first_dropout") and config.summary_first_dropout > 0: + self.first_dropout = nn.Dropout(config.summary_first_dropout) + + self.last_dropout = nn.Identity() + if hasattr(config, "summary_last_dropout") and config.summary_last_dropout > 0: + self.last_dropout = nn.Dropout(config.summary_last_dropout) + + def forward( + self, hidden_states: torch.FloatTensor, cls_index: Optional[torch.LongTensor] = None + ) -> torch.FloatTensor: + """ + Compute a single vector summary of a sequence hidden states. + + Args: + hidden_states (`torch.FloatTensor` of shape `[batch_size, seq_len, hidden_size]`): + The hidden states of the last layer. + cls_index (`torch.LongTensor` of shape `[batch_size]` or `[batch_size, ...]` where ... are optional leading dimensions of `hidden_states`, *optional*): + Used if `summary_type == "cls_index"` and takes the last token of the sequence as classification token. + + Returns: + `torch.FloatTensor`: The summary of the sequence hidden states. 
+ """ + if self.summary_type == "last": + output = hidden_states[:, -1] + elif self.summary_type == "first": + output = hidden_states[:, 0] + elif self.summary_type == "mean": + output = hidden_states.mean(dim=1) + elif self.summary_type == "cls_index": + if cls_index is None: + cls_index = torch.full_like( + hidden_states[..., :1, :], + hidden_states.shape[-2] - 1, + dtype=torch.long, + ) + else: + cls_index = cls_index.unsqueeze(-1).unsqueeze(-1) + cls_index = cls_index.expand((-1,) * (cls_index.dim() - 1) + (hidden_states.size(-1),)) + # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states + output = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, XX, hidden_size) + elif self.summary_type == "attn": + raise NotImplementedError + + output = self.first_dropout(output) + output = self.summary(output) + output = self.activation(output) + output = self.last_dropout(output) + + return output + + +@auto_docstring +class ConvBertModel(ConvBertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.embeddings = ConvBertEmbeddings(config) + + if config.embedding_size != config.hidden_size: + self.embeddings_project = nn.Linear(config.embedding_size, config.hidden_size) + + self.encoder = ConvBertEncoder(config) + self.config = config + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, BaseModelOutputWithCrossAttentions]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + if attention_mask is None: + attention_mask = torch.ones(input_shape, device=device) + if token_type_ids is None: + if hasattr(self.embeddings, "token_type_ids"): + buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] + 
buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + hidden_states = self.embeddings( + input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds + ) + + if hasattr(self, "embeddings_project"): + hidden_states = self.embeddings_project(hidden_states) + + hidden_states = self.encoder( + hidden_states, + attention_mask=extended_attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + return hidden_states + + +class ConvBertGeneratorPredictions(nn.Module): + """Prediction module for the generator, made up of two dense layers.""" + + def __init__(self, config): + super().__init__() + + self.activation = get_activation("gelu") + self.LayerNorm = nn.LayerNorm(config.embedding_size, eps=config.layer_norm_eps) + self.dense = nn.Linear(config.hidden_size, config.embedding_size) + + def forward(self, generator_hidden_states: torch.FloatTensor) -> torch.FloatTensor: + hidden_states = self.dense(generator_hidden_states) + hidden_states = self.activation(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + + return hidden_states + + +@auto_docstring +class ConvBertForMaskedLM(ConvBertPreTrainedModel): + _tied_weights_keys = ["generator.lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + + self.convbert = ConvBertModel(config) + self.generator_predictions = ConvBertGeneratorPredictions(config) + + self.generator_lm_head = nn.Linear(config.embedding_size, config.vocab_size) + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.generator_lm_head + + def set_output_embeddings(self, word_embeddings): + self.generator_lm_head = word_embeddings + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, MaskedLMOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. 
Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + generator_hidden_states = self.convbert( + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + inputs_embeds, + output_attentions, + output_hidden_states, + return_dict, + ) + generator_sequence_output = generator_hidden_states[0] + + prediction_scores = self.generator_predictions(generator_sequence_output) + prediction_scores = self.generator_lm_head(prediction_scores) + + loss = None + # Masked language modeling softmax layer + if labels is not None: + loss_fct = nn.CrossEntropyLoss() # -100 index = padding token + loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (prediction_scores,) + generator_hidden_states[1:] + return ((loss,) + output) if loss is not None else output + + return MaskedLMOutput( + loss=loss, + logits=prediction_scores, + hidden_states=generator_hidden_states.hidden_states, + attentions=generator_hidden_states.attentions, + ) + + +class ConvBertClassificationHead(nn.Module): + """Head for sentence-level classification tasks.""" + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.out_proj = nn.Linear(config.hidden_size, config.num_labels) + + self.config = config + + def forward(self, hidden_states: torch.Tensor, **kwargs) -> torch.Tensor: + x = hidden_states[:, 0, :] # take token (equiv. to [CLS]) + x = self.dropout(x) + x = self.dense(x) + x = ACT2FN[self.config.hidden_act](x) + x = self.dropout(x) + x = self.out_proj(x) + return x + + +@auto_docstring( + custom_intro=""" + ConvBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the + pooled output) e.g. for GLUE tasks. + """ +) +class ConvBertForSequenceClassification(ConvBertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + self.convbert = ConvBertModel(config) + self.classifier = ConvBertClassificationHead(config) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, SequenceClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.convbert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@auto_docstring +class ConvBertForMultipleChoice(ConvBertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.convbert = ConvBertModel(config) + self.sequence_summary = ConvBertSequenceSummary(config) + self.classifier = nn.Linear(config.hidden_size, 1) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, MultipleChoiceModelOutput]: + r""" + input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. 
+ + [What are position IDs?](../glossary#position-ids) + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert *input_ids* indices into associated vectors than the + model's internal embedding lookup matrix. + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., + num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See + `input_ids` above) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] + + input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None + attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None + inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) + if inputs_embeds is not None + else None + ) + + outputs = self.convbert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + pooled_output = self.sequence_summary(sequence_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.view(-1, num_choices) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + + if not return_dict: + output = (reshaped_logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@auto_docstring +class ConvBertForTokenClassification(ConvBertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.convbert = ConvBertModel(config) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, TokenClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for 
computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.convbert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@auto_docstring +class ConvBertForQuestionAnswering(ConvBertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.num_labels = config.num_labels + self.convbert = ConvBertModel(config) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + start_positions: Optional[torch.LongTensor] = None, + end_positions: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, QuestionAnsweringModelOutput]: + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.convbert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[1:] + return ((total_loss,) + output) if total_loss is not None else 
output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +__all__ = [ + "ConvBertForMaskedLM", + "ConvBertForMultipleChoice", + "ConvBertForQuestionAnswering", + "ConvBertForSequenceClassification", + "ConvBertForTokenClassification", + "ConvBertLayer", + "ConvBertModel", + "ConvBertPreTrainedModel", + "load_tf_weights_in_convbert", +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/convbert/modeling_tf_convbert.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/convbert/modeling_tf_convbert.py new file mode 100644 index 0000000000000000000000000000000000000000..47c720f5c12c36a67ed3441d4dcccea329b69f70 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/convbert/modeling_tf_convbert.py @@ -0,0 +1,1474 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""TF 2.0 ConvBERT model.""" + +from __future__ import annotations + +import numpy as np +import tensorflow as tf + +from ...activations_tf import get_tf_activation +from ...modeling_tf_outputs import ( + TFBaseModelOutput, + TFMaskedLMOutput, + TFMultipleChoiceModelOutput, + TFQuestionAnsweringModelOutput, + TFSequenceClassifierOutput, + TFTokenClassifierOutput, +) +from ...modeling_tf_utils import ( + TFMaskedLanguageModelingLoss, + TFModelInputType, + TFMultipleChoiceLoss, + TFPreTrainedModel, + TFQuestionAnsweringLoss, + TFSequenceClassificationLoss, + TFSequenceSummary, + TFTokenClassificationLoss, + get_initializer, + keras, + keras_serializable, + unpack_inputs, +) +from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax +from ...utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, +) +from .configuration_convbert import ConvBertConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "YituTech/conv-bert-base" +_CONFIG_FOR_DOC = "ConvBertConfig" + + +# Copied from transformers.models.albert.modeling_tf_albert.TFAlbertEmbeddings with Albert->ConvBert +class TFConvBertEmbeddings(keras.layers.Layer): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config: ConvBertConfig, **kwargs): + super().__init__(**kwargs) + + self.config = config + self.embedding_size = config.embedding_size + self.max_position_embeddings = config.max_position_embeddings + self.initializer_range = config.initializer_range + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) + + def build(self, input_shape=None): + with tf.name_scope("word_embeddings"): + self.weight = self.add_weight( + name="weight", + shape=[self.config.vocab_size, self.embedding_size], + 
initializer=get_initializer(self.initializer_range), + ) + + with tf.name_scope("token_type_embeddings"): + self.token_type_embeddings = self.add_weight( + name="embeddings", + shape=[self.config.type_vocab_size, self.embedding_size], + initializer=get_initializer(self.initializer_range), + ) + + with tf.name_scope("position_embeddings"): + self.position_embeddings = self.add_weight( + name="embeddings", + shape=[self.max_position_embeddings, self.embedding_size], + initializer=get_initializer(self.initializer_range), + ) + + if self.built: + return + self.built = True + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.embedding_size]) + + # Copied from transformers.models.bert.modeling_tf_bert.TFBertEmbeddings.call + def call( + self, + input_ids: tf.Tensor | None = None, + position_ids: tf.Tensor | None = None, + token_type_ids: tf.Tensor | None = None, + inputs_embeds: tf.Tensor | None = None, + past_key_values_length=0, + training: bool = False, + ) -> tf.Tensor: + """ + Applies embedding based on inputs tensor. + + Returns: + final_embeddings (`tf.Tensor`): output embedding tensor. + """ + if input_ids is None and inputs_embeds is None: + raise ValueError("Need to provide either `input_ids` or `input_embeds`.") + + if input_ids is not None: + check_embeddings_within_bounds(input_ids, self.config.vocab_size) + inputs_embeds = tf.gather(params=self.weight, indices=input_ids) + + input_shape = shape_list(inputs_embeds)[:-1] + + if token_type_ids is None: + token_type_ids = tf.fill(dims=input_shape, value=0) + + if position_ids is None: + position_ids = tf.expand_dims( + tf.range(start=past_key_values_length, limit=input_shape[1] + past_key_values_length), axis=0 + ) + + position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids) + token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids) + final_embeddings = inputs_embeds + position_embeds + token_type_embeds + final_embeddings = self.LayerNorm(inputs=final_embeddings) + final_embeddings = self.dropout(inputs=final_embeddings, training=training) + + return final_embeddings + + +class TFConvBertSelfAttention(keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + new_num_attention_heads = int(config.num_attention_heads / config.head_ratio) + if new_num_attention_heads < 1: + self.head_ratio = config.num_attention_heads + num_attention_heads = 1 + else: + num_attention_heads = new_num_attention_heads + self.head_ratio = config.head_ratio + + self.num_attention_heads = num_attention_heads + self.conv_kernel_size = config.conv_kernel_size + + if config.hidden_size % self.num_attention_heads != 0: + raise ValueError("hidden_size should be divisible by num_attention_heads") + + self.attention_head_size = config.hidden_size // config.num_attention_heads + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.query = keras.layers.Dense( + self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query" + ) + self.key = keras.layers.Dense( + self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key" + ) + self.value = keras.layers.Dense( + self.all_head_size, 
kernel_initializer=get_initializer(config.initializer_range), name="value" + ) + + self.key_conv_attn_layer = keras.layers.SeparableConv1D( + self.all_head_size, + self.conv_kernel_size, + padding="same", + activation=None, + depthwise_initializer=get_initializer(1 / self.conv_kernel_size), + pointwise_initializer=get_initializer(config.initializer_range), + name="key_conv_attn_layer", + ) + + self.conv_kernel_layer = keras.layers.Dense( + self.num_attention_heads * self.conv_kernel_size, + activation=None, + name="conv_kernel_layer", + kernel_initializer=get_initializer(config.initializer_range), + ) + + self.conv_out_layer = keras.layers.Dense( + self.all_head_size, + activation=None, + name="conv_out_layer", + kernel_initializer=get_initializer(config.initializer_range), + ) + + self.dropout = keras.layers.Dropout(config.attention_probs_dropout_prob) + self.config = config + + def transpose_for_scores(self, x, batch_size): + # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] + x = tf.reshape(x, (batch_size, -1, self.num_attention_heads, self.attention_head_size)) + return tf.transpose(x, perm=[0, 2, 1, 3]) + + def call(self, hidden_states, attention_mask, head_mask, output_attentions, training=False): + batch_size = shape_list(hidden_states)[0] + mixed_query_layer = self.query(hidden_states) + mixed_key_layer = self.key(hidden_states) + mixed_value_layer = self.value(hidden_states) + + mixed_key_conv_attn_layer = self.key_conv_attn_layer(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer, batch_size) + key_layer = self.transpose_for_scores(mixed_key_layer, batch_size) + conv_attn_layer = tf.multiply(mixed_key_conv_attn_layer, mixed_query_layer) + + conv_kernel_layer = self.conv_kernel_layer(conv_attn_layer) + conv_kernel_layer = tf.reshape(conv_kernel_layer, [-1, self.conv_kernel_size, 1]) + conv_kernel_layer = stable_softmax(conv_kernel_layer, axis=1) + + paddings = tf.constant( + [ + [ + 0, + 0, + ], + [int((self.conv_kernel_size - 1) / 2), int((self.conv_kernel_size - 1) / 2)], + [0, 0], + ] + ) + + conv_out_layer = self.conv_out_layer(hidden_states) + conv_out_layer = tf.reshape(conv_out_layer, [batch_size, -1, self.all_head_size]) + conv_out_layer = tf.pad(conv_out_layer, paddings, "CONSTANT") + + unfold_conv_out_layer = tf.stack( + [ + tf.slice(conv_out_layer, [0, i, 0], [batch_size, shape_list(mixed_query_layer)[1], self.all_head_size]) + for i in range(self.conv_kernel_size) + ], + axis=-1, + ) + + conv_out_layer = tf.reshape(unfold_conv_out_layer, [-1, self.attention_head_size, self.conv_kernel_size]) + + conv_out_layer = tf.matmul(conv_out_layer, conv_kernel_layer) + conv_out_layer = tf.reshape(conv_out_layer, [-1, self.all_head_size]) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = tf.matmul( + query_layer, key_layer, transpose_b=True + ) # (batch size, num_heads, seq_len_q, seq_len_k) + dk = tf.cast(shape_list(key_layer)[-1], attention_scores.dtype) # scale attention_scores + attention_scores = attention_scores / tf.math.sqrt(dk) + + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in TFBertModel call() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. 
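+ # Descriptive note (not in the original file): the masked scores go through a numerically
+ # stable softmax below; the resulting self-attention context is then concatenated with the
+ # span-based dynamic convolution branch (conv_out_layer) to form ConvBERT's mixed attention output.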
+ attention_probs = stable_softmax(attention_scores, axis=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs, training=training) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + value_layer = tf.reshape( + mixed_value_layer, [batch_size, -1, self.num_attention_heads, self.attention_head_size] + ) + value_layer = tf.transpose(value_layer, [0, 2, 1, 3]) + + context_layer = tf.matmul(attention_probs, value_layer) + context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3]) + + conv_out = tf.reshape(conv_out_layer, [batch_size, -1, self.num_attention_heads, self.attention_head_size]) + context_layer = tf.concat([context_layer, conv_out], 2) + context_layer = tf.reshape( + context_layer, (batch_size, -1, self.head_ratio * self.all_head_size) + ) # (batch_size, seq_len_q, all_head_size) + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "query", None) is not None: + with tf.name_scope(self.query.name): + self.query.build([None, None, self.config.hidden_size]) + if getattr(self, "key", None) is not None: + with tf.name_scope(self.key.name): + self.key.build([None, None, self.config.hidden_size]) + if getattr(self, "value", None) is not None: + with tf.name_scope(self.value.name): + self.value.build([None, None, self.config.hidden_size]) + if getattr(self, "key_conv_attn_layer", None) is not None: + with tf.name_scope(self.key_conv_attn_layer.name): + self.key_conv_attn_layer.build([None, None, self.config.hidden_size]) + if getattr(self, "conv_kernel_layer", None) is not None: + with tf.name_scope(self.conv_kernel_layer.name): + self.conv_kernel_layer.build([None, None, self.all_head_size]) + if getattr(self, "conv_out_layer", None) is not None: + with tf.name_scope(self.conv_out_layer.name): + self.conv_out_layer.build([None, None, self.config.hidden_size]) + + +class TFConvBertSelfOutput(keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + + self.dense = keras.layers.Dense( + config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(config.hidden_dropout_prob) + self.config = config + + def call(self, hidden_states, input_tensor, training=False): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states, training=training) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) + + +class TFConvBertAttention(keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + + self.self_attention = TFConvBertSelfAttention(config, name="self") + self.dense_output = TFConvBertSelfOutput(config, name="output") + + def prune_heads(self, heads): + 
raise NotImplementedError + + def call(self, input_tensor, attention_mask, head_mask, output_attentions, training=False): + self_outputs = self.self_attention( + input_tensor, attention_mask, head_mask, output_attentions, training=training + ) + attention_output = self.dense_output(self_outputs[0], input_tensor, training=training) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attention", None) is not None: + with tf.name_scope(self.self_attention.name): + self.self_attention.build(None) + if getattr(self, "dense_output", None) is not None: + with tf.name_scope(self.dense_output.name): + self.dense_output.build(None) + + +class GroupedLinearLayer(keras.layers.Layer): + def __init__(self, input_size, output_size, num_groups, kernel_initializer, **kwargs): + super().__init__(**kwargs) + self.input_size = input_size + self.output_size = output_size + self.num_groups = num_groups + self.kernel_initializer = kernel_initializer + self.group_in_dim = self.input_size // self.num_groups + self.group_out_dim = self.output_size // self.num_groups + + def build(self, input_shape=None): + self.kernel = self.add_weight( + "kernel", + shape=[self.group_out_dim, self.group_in_dim, self.num_groups], + initializer=self.kernel_initializer, + trainable=True, + ) + + self.bias = self.add_weight( + "bias", shape=[self.output_size], initializer=self.kernel_initializer, dtype=self.dtype, trainable=True + ) + super().build(input_shape) + + def call(self, hidden_states): + batch_size = shape_list(hidden_states)[0] + x = tf.transpose(tf.reshape(hidden_states, [-1, self.num_groups, self.group_in_dim]), [1, 0, 2]) + x = tf.matmul(x, tf.transpose(self.kernel, [2, 1, 0])) + x = tf.transpose(x, [1, 0, 2]) + x = tf.reshape(x, [batch_size, -1, self.output_size]) + x = tf.nn.bias_add(value=x, bias=self.bias) + return x + + +class TFConvBertIntermediate(keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + if config.num_groups == 1: + self.dense = keras.layers.Dense( + config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + else: + self.dense = GroupedLinearLayer( + config.hidden_size, + config.intermediate_size, + num_groups=config.num_groups, + kernel_initializer=get_initializer(config.initializer_range), + name="dense", + ) + + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = get_tf_activation(config.hidden_act) + else: + self.intermediate_act_fn = config.hidden_act + self.config = config + + def call(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + + +class TFConvBertOutput(keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + + if config.num_groups == 1: + self.dense = keras.layers.Dense( + config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + else: + self.dense = GroupedLinearLayer( + config.intermediate_size, + config.hidden_size, + num_groups=config.num_groups, + kernel_initializer=get_initializer(config.initializer_range), + 
name="dense", + ) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(config.hidden_dropout_prob) + self.config = config + + def call(self, hidden_states, input_tensor, training=False): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states, training=training) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.intermediate_size]) + + +class TFConvBertLayer(keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + + self.attention = TFConvBertAttention(config, name="attention") + self.intermediate = TFConvBertIntermediate(config, name="intermediate") + self.bert_output = TFConvBertOutput(config, name="output") + + def call(self, hidden_states, attention_mask, head_mask, output_attentions, training=False): + attention_outputs = self.attention( + hidden_states, attention_mask, head_mask, output_attentions, training=training + ) + attention_output = attention_outputs[0] + intermediate_output = self.intermediate(attention_output) + layer_output = self.bert_output(intermediate_output, attention_output, training=training) + outputs = (layer_output,) + attention_outputs[1:] # add attentions if we output them + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "attention", None) is not None: + with tf.name_scope(self.attention.name): + self.attention.build(None) + if getattr(self, "intermediate", None) is not None: + with tf.name_scope(self.intermediate.name): + self.intermediate.build(None) + if getattr(self, "bert_output", None) is not None: + with tf.name_scope(self.bert_output.name): + self.bert_output.build(None) + + +class TFConvBertEncoder(keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + + self.layer = [TFConvBertLayer(config, name=f"layer_._{i}") for i in range(config.num_hidden_layers)] + + def call( + self, + hidden_states, + attention_mask, + head_mask, + output_attentions, + output_hidden_states, + return_dict, + training=False, + ): + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_outputs = layer_module( + hidden_states, attention_mask, head_mask[i], output_attentions, training=training + ) + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + # Add last layer + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None) + + return TFBaseModelOutput( + last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layer", None) is not None: + for layer in self.layer: + with 
tf.name_scope(layer.name): + layer.build(None) + + +class TFConvBertPredictionHeadTransform(keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + + self.dense = keras.layers.Dense( + config.embedding_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + + if isinstance(config.hidden_act, str): + self.transform_act_fn = get_tf_activation(config.hidden_act) + else: + self.transform_act_fn = config.hidden_act + + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.config = config + + def call(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) + + +@keras_serializable +class TFConvBertMainLayer(keras.layers.Layer): + config_class = ConvBertConfig + + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + + self.embeddings = TFConvBertEmbeddings(config, name="embeddings") + + if config.embedding_size != config.hidden_size: + self.embeddings_project = keras.layers.Dense(config.hidden_size, name="embeddings_project") + + self.encoder = TFConvBertEncoder(config, name="encoder") + self.config = config + + def get_input_embeddings(self): + return self.embeddings + + def set_input_embeddings(self, value): + self.embeddings.weight = value + self.embeddings.vocab_size = value.shape[0] + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + raise NotImplementedError + + def get_extended_attention_mask(self, attention_mask, input_shape, dtype): + if attention_mask is None: + attention_mask = tf.fill(input_shape, 1) + + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. + extended_attention_mask = tf.reshape(attention_mask, (input_shape[0], 1, 1, input_shape[1])) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. 
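+ # Descriptive note (not in the original file): e.g. an attention_mask row of [1, 1, 0]
+ # becomes an additive bias of [0.0, 0.0, -10000.0] after the cast and rescale below.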
+ extended_attention_mask = tf.cast(extended_attention_mask, dtype) + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + + return extended_attention_mask + + def get_head_mask(self, head_mask): + if head_mask is not None: + raise NotImplementedError + else: + head_mask = [None] * self.config.num_hidden_layers + + return head_mask + + @unpack_inputs + def call( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + training=False, + ): + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = shape_list(input_ids) + elif inputs_embeds is not None: + input_shape = shape_list(inputs_embeds)[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if attention_mask is None: + attention_mask = tf.fill(input_shape, 1) + + if token_type_ids is None: + token_type_ids = tf.fill(input_shape, 0) + + hidden_states = self.embeddings(input_ids, position_ids, token_type_ids, inputs_embeds, training=training) + extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, hidden_states.dtype) + head_mask = self.get_head_mask(head_mask) + + if hasattr(self, "embeddings_project"): + hidden_states = self.embeddings_project(hidden_states, training=training) + + hidden_states = self.encoder( + hidden_states, + extended_attention_mask, + head_mask, + output_attentions, + output_hidden_states, + return_dict, + training=training, + ) + + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embeddings", None) is not None: + with tf.name_scope(self.embeddings.name): + self.embeddings.build(None) + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "embeddings_project", None) is not None: + with tf.name_scope(self.embeddings_project.name): + self.embeddings_project.build([None, None, self.config.embedding_size]) + + +class TFConvBertPreTrainedModel(TFPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = ConvBertConfig + base_model_prefix = "convbert" + + +CONVBERT_START_DOCSTRING = r""" + + This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and + behavior. + + + + TensorFlow models and layers in `transformers` accept two formats as input: + + - having all inputs as keyword arguments (like PyTorch models), or + - having all inputs as a list, tuple or dict in the first positional argument. + + The reason the second format is supported is that Keras methods prefer this format when passing inputs to models + and layers. 
Because of this support, when using methods like `model.fit()` things should "just work" for you - just + pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second + format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with + the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first + positional argument: + + - a single Tensor with `input_ids` only and nothing else: `model(input_ids)` + - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: + `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])` + - a dictionary with one or several input Tensors associated to the input names given in the docstring: + `model({"input_ids": input_ids, "token_type_ids": token_type_ids})` + + Note that when creating models and layers with + [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry + about any of this, as you can just pass inputs like you would to any other Python function! + + + + Args: + config ([`ConvBertConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +CONVBERT_INPUTS_DOCSTRING = r""" + Args: + input_ids (`Numpy array` or `tf.Tensor` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and + [`PreTrainedTokenizer.encode`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + token_type_ids (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + head_mask (`Numpy array` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`tf.Tensor` of shape `({0}, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. 
See `attentions` under returned + tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the + config will be used instead. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. This argument can be used only in eager mode, in graph mode the value in the config will be + used instead. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in + eager mode, in graph mode the value will always be set to True. + training (`bool`, *optional*, defaults to `False`): + Whether or not to use the model in training mode (some modules like dropout modules have different + behaviors between training and evaluation). +""" + + +@add_start_docstrings( + "The bare ConvBERT Model transformer outputting raw hidden-states without any specific head on top.", + CONVBERT_START_DOCSTRING, +) +class TFConvBertModel(TFConvBertPreTrainedModel): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.convbert = TFConvBertMainLayer(config, name="convbert") + + @unpack_inputs + @add_start_docstrings_to_model_forward(CONVBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFBaseModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.array | tf.Tensor | None = None, + token_type_ids: np.array | tf.Tensor | None = None, + position_ids: np.array | tf.Tensor | None = None, + head_mask: np.array | tf.Tensor | None = None, + inputs_embeds: tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool = False, + ) -> TFBaseModelOutput | tuple[tf.Tensor]: + outputs = self.convbert( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "convbert", None) is not None: + with tf.name_scope(self.convbert.name): + self.convbert.build(None) + + +class TFConvBertMaskedLMHead(keras.layers.Layer): + def __init__(self, config, input_embeddings, **kwargs): + super().__init__(**kwargs) + + self.config = config + self.embedding_size = config.embedding_size + self.input_embeddings = input_embeddings + + def build(self, input_shape): + self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias") + + super().build(input_shape) + + def get_output_embeddings(self): + return self.input_embeddings + + def set_output_embeddings(self, value): + self.input_embeddings.weight = value + self.input_embeddings.vocab_size = shape_list(value)[0] + + def get_bias(self): + return {"bias": self.bias} + + def set_bias(self, value): + self.bias = value["bias"] + self.config.vocab_size = shape_list(value["bias"])[0] + + def call(self, hidden_states): + seq_length = shape_list(tensor=hidden_states)[1] + hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.embedding_size]) + hidden_states = 
tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True) + hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size]) + hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias) + + return hidden_states + + +class TFConvBertGeneratorPredictions(keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dense = keras.layers.Dense(config.embedding_size, name="dense") + self.config = config + + def call(self, generator_hidden_states, training=False): + hidden_states = self.dense(generator_hidden_states) + hidden_states = get_tf_activation("gelu")(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.embedding_size]) + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + + +@add_start_docstrings("""ConvBERT Model with a `language modeling` head on top.""", CONVBERT_START_DOCSTRING) +class TFConvBertForMaskedLM(TFConvBertPreTrainedModel, TFMaskedLanguageModelingLoss): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, **kwargs) + + self.config = config + self.convbert = TFConvBertMainLayer(config, name="convbert") + self.generator_predictions = TFConvBertGeneratorPredictions(config, name="generator_predictions") + + if isinstance(config.hidden_act, str): + self.activation = get_tf_activation(config.hidden_act) + else: + self.activation = config.hidden_act + + self.generator_lm_head = TFConvBertMaskedLMHead(config, self.convbert.embeddings, name="generator_lm_head") + + def get_lm_head(self): + return self.generator_lm_head + + def get_prefix_bias_name(self): + return self.name + "/" + self.generator_lm_head.name + + @unpack_inputs + @add_start_docstrings_to_model_forward(CONVBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFMaskedLMOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + labels: tf.Tensor | None = None, + training: bool | None = False, + ) -> tuple | TFMaskedLMOutput: + r""" + labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. 
Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + """ + generator_hidden_states = self.convbert( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + generator_sequence_output = generator_hidden_states[0] + prediction_scores = self.generator_predictions(generator_sequence_output, training=training) + prediction_scores = self.generator_lm_head(prediction_scores, training=training) + loss = None if labels is None else self.hf_compute_loss(labels, prediction_scores) + + if not return_dict: + output = (prediction_scores,) + generator_hidden_states[1:] + + return ((loss,) + output) if loss is not None else output + + return TFMaskedLMOutput( + loss=loss, + logits=prediction_scores, + hidden_states=generator_hidden_states.hidden_states, + attentions=generator_hidden_states.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "convbert", None) is not None: + with tf.name_scope(self.convbert.name): + self.convbert.build(None) + if getattr(self, "generator_predictions", None) is not None: + with tf.name_scope(self.generator_predictions.name): + self.generator_predictions.build(None) + if getattr(self, "generator_lm_head", None) is not None: + with tf.name_scope(self.generator_lm_head.name): + self.generator_lm_head.build(None) + + +class TFConvBertClassificationHead(keras.layers.Layer): + """Head for sentence-level classification tasks.""" + + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + + self.dense = keras.layers.Dense( + config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = keras.layers.Dropout(classifier_dropout) + self.out_proj = keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj" + ) + + self.config = config + + def call(self, hidden_states, **kwargs): + x = hidden_states[:, 0, :] # take token (equiv. to [CLS]) + x = self.dropout(x) + x = self.dense(x) + x = get_tf_activation(self.config.hidden_act)(x) + x = self.dropout(x) + x = self.out_proj(x) + + return x + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + if getattr(self, "out_proj", None) is not None: + with tf.name_scope(self.out_proj.name): + self.out_proj.build([None, None, self.config.hidden_size]) + + +@add_start_docstrings( + """ + ConvBERT Model transformer with a sequence classification/regression head on top e.g., for GLUE tasks. 
+ """, + CONVBERT_START_DOCSTRING, +) +class TFConvBertForSequenceClassification(TFConvBertPreTrainedModel, TFSequenceClassificationLoss): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.num_labels = config.num_labels + self.convbert = TFConvBertMainLayer(config, name="convbert") + self.classifier = TFConvBertClassificationHead(config, name="classifier") + + @unpack_inputs + @add_start_docstrings_to_model_forward(CONVBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFSequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + labels: tf.Tensor | None = None, + training: bool | None = False, + ) -> tuple | TFSequenceClassifierOutput: + r""" + labels (`tf.Tensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + outputs = self.convbert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + logits = self.classifier(outputs[0], training=training) + loss = None if labels is None else self.hf_compute_loss(labels, logits) + + if not return_dict: + output = (logits,) + outputs[1:] + + return ((loss,) + output) if loss is not None else output + + return TFSequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "convbert", None) is not None: + with tf.name_scope(self.convbert.name): + self.convbert.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(None) + + +@add_start_docstrings( + """ + ConvBERT Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a + softmax) e.g. for RocStories/SWAG tasks. 
+ """, + CONVBERT_START_DOCSTRING, +) +class TFConvBertForMultipleChoice(TFConvBertPreTrainedModel, TFMultipleChoiceLoss): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.convbert = TFConvBertMainLayer(config, name="convbert") + self.sequence_summary = TFSequenceSummary( + config, initializer_range=config.initializer_range, name="sequence_summary" + ) + self.classifier = keras.layers.Dense( + 1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) + self.config = config + + @unpack_inputs + @add_start_docstrings_to_model_forward( + CONVBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") + ) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFMultipleChoiceModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + labels: tf.Tensor | None = None, + training: bool | None = False, + ) -> tuple | TFMultipleChoiceModelOutput: + r""" + labels (`tf.Tensor` of shape `(batch_size,)`, *optional*): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices]` + where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above) + """ + if input_ids is not None: + num_choices = shape_list(input_ids)[1] + seq_length = shape_list(input_ids)[2] + else: + num_choices = shape_list(inputs_embeds)[1] + seq_length = shape_list(inputs_embeds)[2] + + flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None + flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None + flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None + flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None + flat_inputs_embeds = ( + tf.reshape(inputs_embeds, (-1, seq_length, shape_list(inputs_embeds)[3])) + if inputs_embeds is not None + else None + ) + outputs = self.convbert( + flat_input_ids, + flat_attention_mask, + flat_token_type_ids, + flat_position_ids, + head_mask, + flat_inputs_embeds, + output_attentions, + output_hidden_states, + return_dict=return_dict, + training=training, + ) + logits = self.sequence_summary(outputs[0], training=training) + logits = self.classifier(logits) + reshaped_logits = tf.reshape(logits, (-1, num_choices)) + loss = None if labels is None else self.hf_compute_loss(labels, reshaped_logits) + + if not return_dict: + output = (reshaped_logits,) + outputs[1:] + + return ((loss,) + output) if loss is not None else output + + return TFMultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "convbert", None) is not None: + with tf.name_scope(self.convbert.name): + self.convbert.build(None) + if getattr(self, "sequence_summary", None) is not None: + with tf.name_scope(self.sequence_summary.name): + 
self.sequence_summary.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build([None, None, self.config.hidden_size]) + + +@add_start_docstrings( + """ + ConvBERT Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for + Named-Entity-Recognition (NER) tasks. + """, + CONVBERT_START_DOCSTRING, +) +class TFConvBertForTokenClassification(TFConvBertPreTrainedModel, TFTokenClassificationLoss): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.num_labels = config.num_labels + self.convbert = TFConvBertMainLayer(config, name="convbert") + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = keras.layers.Dropout(classifier_dropout) + self.classifier = keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) + self.config = config + + @unpack_inputs + @add_start_docstrings_to_model_forward(CONVBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFTokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + labels: tf.Tensor | None = None, + training: bool | None = False, + ) -> tuple | TFTokenClassifierOutput: + r""" + labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. + """ + outputs = self.convbert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + sequence_output = outputs[0] + sequence_output = self.dropout(sequence_output, training=training) + logits = self.classifier(sequence_output) + loss = None if labels is None else self.hf_compute_loss(labels, logits) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return TFTokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "convbert", None) is not None: + with tf.name_scope(self.convbert.name): + self.convbert.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build([None, None, self.config.hidden_size]) + + +@add_start_docstrings( + """ + ConvBERT Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layer on top of the hidden-states output to compute `span start logits` and `span end logits`). 
+ """, + CONVBERT_START_DOCSTRING, +) +class TFConvBertForQuestionAnswering(TFConvBertPreTrainedModel, TFQuestionAnsweringLoss): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.num_labels = config.num_labels + self.convbert = TFConvBertMainLayer(config, name="convbert") + self.qa_outputs = keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" + ) + self.config = config + + @unpack_inputs + @add_start_docstrings_to_model_forward(CONVBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFQuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + start_positions: tf.Tensor | None = None, + end_positions: tf.Tensor | None = None, + training: bool | None = False, + ) -> tuple | TFQuestionAnsweringModelOutput: + r""" + start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. 
+ """ + outputs = self.convbert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + sequence_output = outputs[0] + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = tf.split(logits, 2, axis=-1) + start_logits = tf.squeeze(start_logits, axis=-1) + end_logits = tf.squeeze(end_logits, axis=-1) + loss = None + + if start_positions is not None and end_positions is not None: + labels = {"start_position": start_positions} + labels["end_position"] = end_positions + loss = self.hf_compute_loss(labels, (start_logits, end_logits)) + + if not return_dict: + output = (start_logits, end_logits) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return TFQuestionAnsweringModelOutput( + loss=loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "convbert", None) is not None: + with tf.name_scope(self.convbert.name): + self.convbert.build(None) + if getattr(self, "qa_outputs", None) is not None: + with tf.name_scope(self.qa_outputs.name): + self.qa_outputs.build([None, None, self.config.hidden_size]) + + +__all__ = [ + "TFConvBertForMaskedLM", + "TFConvBertForMultipleChoice", + "TFConvBertForQuestionAnswering", + "TFConvBertForSequenceClassification", + "TFConvBertForTokenClassification", + "TFConvBertLayer", + "TFConvBertModel", + "TFConvBertPreTrainedModel", +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/convbert/tokenization_convbert.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/convbert/tokenization_convbert.py new file mode 100644 index 0000000000000000000000000000000000000000..b354b0eeae3d7f899311fa5322e9b2df6ad642ba --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/convbert/tokenization_convbert.py @@ -0,0 +1,483 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Tokenization classes for ConvBERT.""" + +import collections +import os +import unicodedata +from typing import Optional + +from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace +from ...utils import logging + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} + + +# Copied from transformers.models.bert.tokenization_bert.load_vocab +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + with open(vocab_file, "r", encoding="utf-8") as reader: + tokens = reader.readlines() + for index, token in enumerate(tokens): + token = token.rstrip("\n") + vocab[token] = index + return vocab + + +# Copied from transformers.models.bert.tokenization_bert.whitespace_tokenize +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a piece of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +# Copied from transformers.models.bert.tokenization_bert.BertTokenizer with bert-base-cased->YituTech/conv-bert-base, ConvBertTokenizer->BertTokenizer, BERT->ConvBERT +class ConvBertTokenizer(PreTrainedTokenizer): + r""" + Construct a ConvBERT tokenizer. Based on WordPiece. + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + File containing the vocabulary. + do_lower_case (`bool`, *optional*, defaults to `True`): + Whether or not to lowercase the input when tokenizing. + do_basic_tokenize (`bool`, *optional*, defaults to `True`): + Whether or not to do basic tokenization before WordPiece. + never_split (`Iterable`, *optional*): + Collection of tokens which will never be split during tokenization. Only has an effect when + `do_basic_tokenize=True` + unk_token (`str`, *optional*, defaults to `"[UNK]"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + sep_token (`str`, *optional*, defaults to `"[SEP]"`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + pad_token (`str`, *optional*, defaults to `"[PAD]"`): + The token used for padding, for example when batching sequences of different lengths. + cls_token (`str`, *optional*, defaults to `"[CLS]"`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + mask_token (`str`, *optional*, defaults to `"[MASK]"`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + tokenize_chinese_chars (`bool`, *optional*, defaults to `True`): + Whether or not to tokenize Chinese characters. + + This should likely be deactivated for Japanese (see this + [issue](https://github.com/huggingface/transformers/issues/328)). + strip_accents (`bool`, *optional*): + Whether or not to strip all accents. If this option is not specified, then it will be determined by the + value for `lowercase` (as in the original ConvBERT). 
+ clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`): + Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like + extra spaces. + """ + + vocab_files_names = VOCAB_FILES_NAMES + + def __init__( + self, + vocab_file, + do_lower_case=True, + do_basic_tokenize=True, + never_split=None, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + tokenize_chinese_chars=True, + strip_accents=None, + clean_up_tokenization_spaces=True, + **kwargs, + ): + if not os.path.isfile(vocab_file): + raise ValueError( + f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained" + " model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`" + ) + self.vocab = load_vocab(vocab_file) + self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()]) + self.do_basic_tokenize = do_basic_tokenize + if do_basic_tokenize: + self.basic_tokenizer = BasicTokenizer( + do_lower_case=do_lower_case, + never_split=never_split, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + ) + + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token)) + + super().__init__( + do_lower_case=do_lower_case, + do_basic_tokenize=do_basic_tokenize, + never_split=never_split, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + **kwargs, + ) + + @property + def do_lower_case(self): + return self.basic_tokenizer.do_lower_case + + @property + def vocab_size(self): + return len(self.vocab) + + def get_vocab(self): + return dict(self.vocab, **self.added_tokens_encoder) + + def _tokenize(self, text, split_special_tokens=False): + split_tokens = [] + if self.do_basic_tokenize: + for token in self.basic_tokenizer.tokenize( + text, never_split=self.all_special_tokens if not split_special_tokens else None + ): + # If the token is part of the never_split set + if token in self.basic_tokenizer.never_split: + split_tokens.append(token) + else: + split_tokens += self.wordpiece_tokenizer.tokenize(token) + else: + split_tokens = self.wordpiece_tokenizer.tokenize(text) + return split_tokens + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.vocab.get(token, self.vocab.get(self.unk_token)) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.ids_to_tokens.get(index, self.unk_token) + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + out_string = " ".join(tokens).replace(" ##", "").strip() + return out_string + + def build_inputs_with_special_tokens( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A ConvBERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. 
+ + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + token_ids_1 + sep + + def get_special_tokens_mask( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False + ) -> list[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + index = 0 + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + else: + vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory + with open(vocab_file, "w", encoding="utf-8") as writer: + for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning( + f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive." + " Please check that the vocabulary is not corrupted!" + ) + index = token_index + writer.write(token + "\n") + index += 1 + return (vocab_file,) + + +# Copied from transformers.models.bert.tokenization_bert.BasicTokenizer +class BasicTokenizer: + """ + Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.). + + Args: + do_lower_case (`bool`, *optional*, defaults to `True`): + Whether or not to lowercase the input when tokenizing. + never_split (`Iterable`, *optional*): + Collection of tokens which will never be split during tokenization. Only has an effect when + `do_basic_tokenize=True` + tokenize_chinese_chars (`bool`, *optional*, defaults to `True`): + Whether or not to tokenize Chinese characters. + + This should likely be deactivated for Japanese (see this + [issue](https://github.com/huggingface/transformers/issues/328)). + strip_accents (`bool`, *optional*): + Whether or not to strip all accents. If this option is not specified, then it will be determined by the + value for `lowercase` (as in the original BERT). + do_split_on_punc (`bool`, *optional*, defaults to `True`): + In some instances we want to skip the basic punctuation splitting so that later tokenization can capture + the full context of the words, such as contractions. 
+ """ + + def __init__( + self, + do_lower_case=True, + never_split=None, + tokenize_chinese_chars=True, + strip_accents=None, + do_split_on_punc=True, + ): + if never_split is None: + never_split = [] + self.do_lower_case = do_lower_case + self.never_split = set(never_split) + self.tokenize_chinese_chars = tokenize_chinese_chars + self.strip_accents = strip_accents + self.do_split_on_punc = do_split_on_punc + + def tokenize(self, text, never_split=None): + """ + Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer. + + Args: + never_split (`List[str]`, *optional*) + Kept for backward compatibility purposes. Now implemented directly at the base class level (see + [`PreTrainedTokenizer.tokenize`]) List of token not to split. + """ + # union() returns a new set by concatenating the two sets. + never_split = self.never_split.union(set(never_split)) if never_split else self.never_split + text = self._clean_text(text) + + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). + if self.tokenize_chinese_chars: + text = self._tokenize_chinese_chars(text) + # prevents treating the same character with different unicode codepoints as different characters + unicode_normalized_text = unicodedata.normalize("NFC", text) + orig_tokens = whitespace_tokenize(unicode_normalized_text) + split_tokens = [] + for token in orig_tokens: + if token not in never_split: + if self.do_lower_case: + token = token.lower() + if self.strip_accents is not False: + token = self._run_strip_accents(token) + elif self.strip_accents: + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token, never_split)) + + output_tokens = whitespace_tokenize(" ".join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text, never_split=None): + """Splits punctuation on a piece of text.""" + if not self.do_split_on_punc or (never_split is not None and text in never_split): + return [text] + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # 
despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. + if ( + (cp >= 0x4E00 and cp <= 0x9FFF) + or (cp >= 0x3400 and cp <= 0x4DBF) + or (cp >= 0x20000 and cp <= 0x2A6DF) + or (cp >= 0x2A700 and cp <= 0x2B73F) + or (cp >= 0x2B740 and cp <= 0x2B81F) + or (cp >= 0x2B820 and cp <= 0x2CEAF) + or (cp >= 0xF900 and cp <= 0xFAFF) + or (cp >= 0x2F800 and cp <= 0x2FA1F) + ): + return True + + return False + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xFFFD or _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + + +# Copied from transformers.models.bert.tokenization_bert.WordpieceTokenizer +class WordpieceTokenizer: + """Runs WordPiece tokenization.""" + + def __init__(self, vocab, unk_token, max_input_chars_per_word=100): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, text): + """ + Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform + tokenization using the given vocabulary. + + For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`. + + Args: + text: A single token or whitespace separated tokens. This should have + already been passed through *BasicTokenizer*. + + Returns: + A list of wordpiece tokens. + """ + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = "".join(chars[start:end]) + if start > 0: + substr = "##" + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens + + +__all__ = ["ConvBertTokenizer"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/convbert/tokenization_convbert_fast.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/convbert/tokenization_convbert_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..e328e7490f4d2ca78a5d2005c2a00227eba33b78 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/convbert/tokenization_convbert_fast.py @@ -0,0 +1,147 @@ +# coding=utf-8 +# Copyright The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
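The greedy longest-match-first loop in `WordpieceTokenizer.tokenize` above can be reproduced in a few lines. The sketch below is standalone (the `greedy_wordpiece` helper and the toy vocabulary are hypothetical, not part of the module) and recovers the `"unaffable"` example from the docstring.

```python
def greedy_wordpiece(token, vocab, unk_token="[UNK]"):
    """Greedy longest-match-first split of a single word, mirroring WordpieceTokenizer."""
    chars = list(token)
    sub_tokens, start = [], 0
    while start < len(chars):
        end, cur_substr = len(chars), None
        while start < end:
            substr = "".join(chars[start:end])
            if start > 0:
                substr = "##" + substr        # continuation pieces carry the "##" prefix
            if substr in vocab:
                cur_substr = substr
                break
            end -= 1
        if cur_substr is None:                # no piece matched: the whole word is unknown
            return [unk_token]
        sub_tokens.append(cur_substr)
        start = end
    return sub_tokens

toy_vocab = {"un", "##aff", "##able"}
print(greedy_wordpiece("unaffable", toy_vocab))  # ['un', '##aff', '##able']
```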
+"""Tokenization classes for ConvBERT.""" + +import json +from typing import Optional + +from tokenizers import normalizers + +from ...tokenization_utils_fast import PreTrainedTokenizerFast +from ...utils import logging +from .tokenization_convbert import ConvBertTokenizer + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} + + +# Copied from transformers.models.bert.tokenization_bert_fast.BertTokenizerFast with bert-base-cased->YituTech/conv-bert-base, Bert->ConvBert, BERT->ConvBERT +class ConvBertTokenizerFast(PreTrainedTokenizerFast): + r""" + Construct a "fast" ConvBERT tokenizer (backed by HuggingFace's *tokenizers* library). Based on WordPiece. + + This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should + refer to this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + File containing the vocabulary. + do_lower_case (`bool`, *optional*, defaults to `True`): + Whether or not to lowercase the input when tokenizing. + unk_token (`str`, *optional*, defaults to `"[UNK]"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + sep_token (`str`, *optional*, defaults to `"[SEP]"`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + pad_token (`str`, *optional*, defaults to `"[PAD]"`): + The token used for padding, for example when batching sequences of different lengths. + cls_token (`str`, *optional*, defaults to `"[CLS]"`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + mask_token (`str`, *optional*, defaults to `"[MASK]"`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + clean_text (`bool`, *optional*, defaults to `True`): + Whether or not to clean the text before tokenization by removing any control characters and replacing all + whitespaces by the classic one. + tokenize_chinese_chars (`bool`, *optional*, defaults to `True`): + Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see [this + issue](https://github.com/huggingface/transformers/issues/328)). + strip_accents (`bool`, *optional*): + Whether or not to strip all accents. If this option is not specified, then it will be determined by the + value for `lowercase` (as in the original ConvBERT). + wordpieces_prefix (`str`, *optional*, defaults to `"##"`): + The prefix for subwords. 
+ """ + + vocab_files_names = VOCAB_FILES_NAMES + slow_tokenizer_class = ConvBertTokenizer + + def __init__( + self, + vocab_file=None, + tokenizer_file=None, + do_lower_case=True, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + tokenize_chinese_chars=True, + strip_accents=None, + **kwargs, + ): + super().__init__( + vocab_file, + tokenizer_file=tokenizer_file, + do_lower_case=do_lower_case, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + **kwargs, + ) + + normalizer_state = json.loads(self.backend_tokenizer.normalizer.__getstate__()) + if ( + normalizer_state.get("lowercase", do_lower_case) != do_lower_case + or normalizer_state.get("strip_accents", strip_accents) != strip_accents + or normalizer_state.get("handle_chinese_chars", tokenize_chinese_chars) != tokenize_chinese_chars + ): + normalizer_class = getattr(normalizers, normalizer_state.pop("type")) + normalizer_state["lowercase"] = do_lower_case + normalizer_state["strip_accents"] = strip_accents + normalizer_state["handle_chinese_chars"] = tokenize_chinese_chars + self.backend_tokenizer.normalizer = normalizer_class(**normalizer_state) + + self.do_lower_case = do_lower_case + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A ConvBERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + + if token_ids_1 is not None: + output += token_ids_1 + [self.sep_token_id] + + return output + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + files = self._tokenizer.model.save(save_directory, name=filename_prefix) + return tuple(files) + + +__all__ = ["ConvBertTokenizerFast"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/cpmant/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/cpmant/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d92eea75693e728f4c1e9afc199d40751ef6af7a --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/cpmant/__init__.py @@ -0,0 +1,28 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_cpmant import * + from .modeling_cpmant import * + from .tokenization_cpmant import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/cpmant/configuration_cpmant.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/cpmant/configuration_cpmant.py new file mode 100644 index 0000000000000000000000000000000000000000..c3368d67af7ab7db1278bcecdd01f6bf0b6ca59c --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/cpmant/configuration_cpmant.py @@ -0,0 +1,122 @@ +# coding=utf-8 +# Copyright 2022 The OpenBMB Team and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""CPMAnt model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class CpmAntConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`CpmAntModel`]. It is used to instantiate an + CPMAnt model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the CPMAnt + [openbmb/cpm-ant-10b](https://huggingface.co/openbmb/cpm-ant-10b) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 30720): + Vocabulary size of the CPMAnt model. Defines the number of different tokens that can be represented by the + `input` passed when calling [`CpmAntModel`]. + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the encoder layers. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads in the Transformer encoder. + dim_head (`int`, *optional*, defaults to 128): + Dimension of attention heads for each attention layer in the Transformer encoder. + dim_ff (`int`, *optional*, defaults to 10240): + Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + num_hidden_layers (`int`, *optional*, defaults to 48): + Number of layers of the Transformer encoder. + dropout_p (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the embeddings, encoder. + position_bias_num_buckets (`int`, *optional*, defaults to 512): + The number of position_bias buckets. + position_bias_max_distance (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with. 
Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the layer normalization layers. + init_std (`float`, *optional*, defaults to 1.0): + Initialize parameters with std = init_std. + prompt_types (`int`, *optional*, defaults to 32): + The type of prompt. + prompt_length (`int`, *optional*, defaults to 32): + The length of prompt. + segment_types (`int`, *optional*, defaults to 32): + The type of segment. + use_cache (`bool`, *optional*, defaults to `True`): + Whether to use cache. + + Example: + + ```python + >>> from transformers import CpmAntModel, CpmAntConfig + + >>> # Initializing a CPMAnt cpm-ant-10b style configuration + >>> configuration = CpmAntConfig() + + >>> # Initializing a model from the cpm-ant-10b style configuration + >>> model = CpmAntModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "cpmant" + + def __init__( + self, + vocab_size: int = 30720, + hidden_size: int = 4096, + num_attention_heads: int = 32, + dim_head: int = 128, + dim_ff: int = 10240, + num_hidden_layers: int = 48, + dropout_p: int = 0.0, + position_bias_num_buckets: int = 512, + position_bias_max_distance: int = 2048, + eps: int = 1e-6, + init_std: float = 1.0, + prompt_types: int = 32, + prompt_length: int = 32, + segment_types: int = 32, + use_cache: bool = True, + **kwargs, + ): + super().__init__(**kwargs) + self.prompt_types = prompt_types + self.prompt_length = prompt_length + self.segment_types = segment_types + self.hidden_size = hidden_size + self.num_attention_heads = num_attention_heads + self.dim_head = dim_head + self.dim_ff = dim_ff + self.num_hidden_layers = num_hidden_layers + self.position_bias_num_buckets = position_bias_num_buckets + self.position_bias_max_distance = position_bias_max_distance + self.dropout_p = dropout_p + self.eps = eps + self.use_cache = use_cache + self.vocab_size = vocab_size + self.init_std = init_std + + +__all__ = ["CpmAntConfig"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/cpmant/modeling_cpmant.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/cpmant/modeling_cpmant.py new file mode 100644 index 0000000000000000000000000000000000000000..15881a64eb3799cc23bf95c5bece4079717af0a8 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/cpmant/modeling_cpmant.py @@ -0,0 +1,807 @@ +# coding=utf-8 +# Copyright 2022 The OpenBMB Team and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
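Because the defaults in `CpmAntConfig` above describe the full cpm-ant-10b checkpoint, a common pattern is to shrink them when experimenting locally. A minimal sketch; the specific sizes below are arbitrary illustrative choices, not an official small configuration:

```python
from transformers import CpmAntConfig, CpmAntModel

# Scaled-down configuration for quick local experiments (arbitrary illustrative sizes).
tiny_config = CpmAntConfig(
    vocab_size=1024,
    hidden_size=128,
    num_attention_heads=4,
    dim_head=32,
    dim_ff=256,
    num_hidden_layers=2,
)
model = CpmAntModel(tiny_config)
print(sum(p.numel() for p in model.parameters()))  # far fewer parameters than the 10B default
```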
+"""PyTorch CPMAnt""" + +import math +from typing import Optional, Union + +import torch +import torch.nn.functional as F +from torch import nn +from torch.nn import CrossEntropyLoss + +from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache +from ...generation import GenerationMixin +from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast +from ...modeling_utils import PreTrainedModel +from ...utils import auto_docstring, logging +from .configuration_cpmant import CpmAntConfig + + +logger = logging.get_logger(__name__) + + +class CpmAntLayerNorm(nn.Module): + """ + We use Root Mean Square (RMS) Layer Normalization, please see https://huggingface.co/papers/1910.07467 for details." + """ + + def __init__(self, config: CpmAntConfig): + super().__init__() + + self.eps = config.eps + self.dim_norm = config.hidden_size + self.weight = nn.Parameter(torch.empty(config.hidden_size)) + + def forward(self, hidden_states: torch.Tensor): + """ + Args: + hidden_states (`torch.Tensor` of shape `(batch, seq_len, dim_in)`) + """ + if hidden_states.size(-1) != self.dim_norm: + raise AssertionError("hidden_states.size(-1) != self.dim_norm") + old_dtype = hidden_states.dtype + variance = hidden_states.to(torch.float32).pow(2).mean(dim=-1, keepdim=True) + hidden_states = (hidden_states * torch.rsqrt(variance + self.eps)).to(old_dtype) * self.weight + return hidden_states + + +class CpmAntAttention(nn.Module): + def __init__(self, config: CpmAntConfig, layer_idx=None): + super().__init__() + self.dim_model = config.hidden_size + self.num_heads = config.num_attention_heads + self.dim_head = config.dim_head + self.layer_idx = layer_idx + + self.project_q = nn.Linear(self.dim_model, self.num_heads * self.dim_head, bias=False) + self.project_k = nn.Linear(self.dim_model, self.num_heads * self.dim_head, bias=False) + self.project_v = nn.Linear(self.dim_model, self.num_heads * self.dim_head, bias=False) + + self.attention_out = nn.Linear(self.num_heads * self.dim_head, self.dim_model, bias=False) + + self.softmax = torch.nn.Softmax(dim=-1) + + if config.dropout_p is not None: + self.dropout = torch.nn.Dropout(p=config.dropout_p) + else: + self.dropout = None + + def forward( + self, + hidden_q: torch.Tensor, + hidden_kv: torch.Tensor, + attention_mask: torch.BoolTensor, + position_bias: torch.Tensor, + output_attentions: Optional[bool] = False, + past_key_values: Optional[Cache] = None, + use_cache: Optional[bool] = None, + cache_position: Optional[torch.Tensor] = None, + ): + """ + Args: + hidden_q (`torch.Tensor`): + Input of transformer block(self-attention block). It can be the raw embedding of a batch of sequences. + hidden_kv (`torch.Tensor` of shape `(batch, len_k, dim_model)`)): + Tensor *key_value* and *query* of shape `(batch, len_k, dim_model)` + attention_mask (`torch.Tensor` of shape `(batch, len_seq, len_seq)`): + Avoid invalid areas to participate in the calculation of self-attention. + position_bias (`torch.Tensor` of shape `(batch, len_seq, len_seq)`): + Provide positional information to self-attention block. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. + past_key_values (`Cache`, *optional*): + Cached past key and value projection states. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). 
+ """ + batch_size = hidden_q.size(0) + len_q = hidden_q.size(1) + len_k = hidden_kv.size(1) + + query = self.project_q(hidden_q) + key = self.project_k(hidden_kv) + value = self.project_v(hidden_kv) + + query = query.view(batch_size, len_q, self.num_heads, self.dim_head).permute(0, 2, 1, 3) + key = key.view(batch_size, len_k, self.num_heads, self.dim_head).permute(0, 2, 1, 3) + value = value.view(batch_size, len_k, self.num_heads, self.dim_head).permute(0, 2, 1, 3) + + if past_key_values is not None: + key, value = past_key_values.update(key, value, self.layer_idx, {"cache_position": cache_position}) + len_k = key.size(-2) + + # (batch_size, num_heads, len_q, dim_head) @ (batch_size, num_heads, dim_head, len_k) -> (batch_size, num_heads, len_q, len_k) + score = torch.matmul(query, key.transpose(-1, -2)) / math.sqrt(self.dim_head) + score = score + position_bias + + score = torch.masked_fill( + score, + attention_mask.view(batch_size, 1, len_q, len_k) == torch.tensor(False), + torch.scalar_tensor(float("-inf"), device=score.device, dtype=score.dtype), + ) + score = self.softmax(score) + + score = torch.masked_fill( + score, + attention_mask.view(batch_size, 1, len_q, len_k) == torch.tensor(False), + torch.scalar_tensor(0, device=score.device, dtype=score.dtype), + ) + if output_attentions: + attn_weights = score + else: + attn_weights = None + + if self.dropout is not None: + score = self.dropout(score) + + # (batch_size, num_heads, len_q, len_k) @ (batch_size, num_heads, len_k, dim_head) -> (batch_size, num_heads, len_q, dim_head) + score = torch.matmul(score, value) + + score = score.view(batch_size, self.num_heads, len_q, self.dim_head).permute(0, 2, 1, 3) + score = score.contiguous().view(batch_size, len_q, self.num_heads * self.dim_head) + + score = self.attention_out(score) + + return score, attn_weights + + +class CpmAntSelfAttentionBlock(nn.Module): + def __init__(self, config: CpmAntConfig, layer_idx=None): + super().__init__() + self.layernorm_before_attention = CpmAntLayerNorm(config) + self.self_attention = CpmAntAttention(config, layer_idx=layer_idx) + if config.dropout_p: + self.dropout = torch.nn.Dropout(config.dropout_p) + else: + self.dropout = None + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + position_bias: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + past_key_values: Optional[Cache] = None, + use_cache: Optional[bool] = None, + cache_position: Optional[torch.Tensor] = None, + ): + """ + Args: + hidden_states (`torch.Tensor` of shape `(batch, len_seq, dim_model)`): + Input of transformer block(self-attention block). It can be the raw embedding of a batch of sequences. + attention_mask (`torch.Tensor` of shape `(batch, len_seq, len_seq)`): + Avoid invalid areas to participate in the calculation of self-attention. + position_bias (`torch.Tensor` of shape `(batch, len_seq, len_seq)`): + Provide positional information to self-attention block. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. + past_key_values (`Cache`, *optional*): + Cached past key and value projection states. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). 
+ """ + outputs = self.layernorm_before_attention(hidden_states) + outputs, attn_weights = self.self_attention( + outputs, + outputs, + attention_mask, + position_bias, + output_attentions, + past_key_values, + use_cache, + cache_position, + ) + + if self.dropout is not None: + outputs = self.dropout(outputs) + hidden_states = hidden_states + outputs + + return hidden_states, attn_weights + + +class CpmAntDenseGatedACT(nn.Module): + def __init__(self, config: CpmAntConfig): + super().__init__() + self.w_0 = nn.Linear(config.hidden_size, config.dim_ff, bias=False) + self.w_1 = nn.Linear(config.hidden_size, config.dim_ff, bias=False) + self.act = torch.nn.GELU() + + def forward(self, hidden_states: torch.Tensor): + """Transform an input tensor from one feature space to another via a nonlinear operation + + Args: + hidden_states (`torch.Tensor` of shape `(batch, seq_len, dim_in)`) + """ + gate_score = self.act(self.w_0(hidden_states)) + hidden_states = self.w_1(hidden_states) + + hidden_states = gate_score * hidden_states + return hidden_states + + +class CpmAntFeedForward(nn.Module): + def __init__(self, config: CpmAntConfig): + super().__init__() + self.w_in = CpmAntDenseGatedACT(config) + if config.dropout_p is not None: + self.dropout = torch.nn.Dropout(config.dropout_p) + else: + self.dropout = None + + self.w_out = nn.Linear(config.dim_ff, config.hidden_size, bias=False) + + def forward(self, hidden_states: torch.Tensor): + """ + Args: + hidden_states (`torch.Tensor` of shape `(batch, seq_len, dim_in)`) + """ + hidden_states = self.w_in(hidden_states) + + if self.dropout is not None: + hidden_states = self.dropout(hidden_states) + + hidden_states = self.w_out(hidden_states) + + return hidden_states + + +class CpmAntFFNBlock(nn.Module): + def __init__(self, config: CpmAntConfig): + super().__init__() + self.layernorm_before_ffn = CpmAntLayerNorm(config) + self.ffn = CpmAntFeedForward(config) + if config.dropout_p: + self.dropout = torch.nn.Dropout(config.dropout_p) + else: + self.dropout = None + + def forward( + self, + hidden_states: torch.Tensor, + ): + """ + Args: + hidden_states (`torch.Tensor` of shape `(batch, len_seq, dim_model)`): + Hidden states before feed forward layer. + """ + ln_outputs = self.layernorm_before_ffn(hidden_states) + outputs = self.ffn(ln_outputs) + if self.dropout is not None: + outputs = self.dropout(outputs) + hidden_states = hidden_states + outputs + return hidden_states + + +class CpmAntTransformerBlock(nn.Module): + def __init__(self, config: CpmAntConfig, layer_idx=None): + super().__init__() + self.self_att = CpmAntSelfAttentionBlock(config, layer_idx=layer_idx) + self.ffn = CpmAntFFNBlock(config) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + position_bias: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + past_key_values: Optional[Cache] = None, + use_cache: Optional[bool] = None, + cache_position: Optional[torch.Tensor] = None, + ): + """ + Args: + hidden_states (`torch.Tensor`): + Input to the layer of shape `(batch, seq_len, dim_model)` + attention_mask (`torch.Tensor`): + Avoid invalid areas to participate in the calculation of shape `(batch, seq_len, seq_len)` + position_bias (`torch.Tensor`): + Provides position information to attention mechanism of shape `(num_heads, seq_len, seq_len)` + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. 
+ past_key_values (`Cache`, *optional*): + Cached past key and value projection states + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + """ + hidden_states, attn_weights = self.self_att( + hidden_states, + attention_mask=attention_mask, + position_bias=position_bias, + output_attentions=output_attentions, + past_key_values=past_key_values, + use_cache=use_cache, + cache_position=cache_position, + ) + + hidden_states = self.ffn(hidden_states) + return hidden_states, attn_weights + + +class CpmAntEncoder(nn.Module): + def __init__(self, config: CpmAntConfig): + super().__init__() + self.num_layers = config.num_hidden_layers + self.layers = nn.ModuleList([CpmAntTransformerBlock(config, layer_idx=i) for i in range(self.num_layers)]) + + self.output_layernorm = CpmAntLayerNorm(config) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + position_bias: torch.Tensor, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + past_key_values: Optional[Cache] = None, + use_cache: Optional[bool] = None, + cache_position: Optional[torch.Tensor] = None, + ): + """ + Args: + hidden_states (`torch.Tensor`): + Input to the layer of shape `(batch, seq_len, dim_model)` + attention_mask (`torch.Tensor`): + Avoid invalid areas to participate in the calculation of shape `(batch, seq_len, seq_len)` + position_bias (`torch.Tensor`): + Provides position information to attention mechanism of shape `(num_heads, seq_len, seq_len)` + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. + past_key_values (`Cache`, *optional*): + Cached past key and value projection states + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). 
+ """ + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + + for i, layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + layer_outputs = layer( + hidden_states, + attention_mask, + position_bias, + output_attentions=output_attentions, + past_key_values=past_key_values, + use_cache=use_cache, + ) + hidden_states, attn_weights = layer_outputs + if output_attentions: + all_self_attns += (attn_weights,) + + hidden_states = self.output_layernorm(hidden_states) + + if output_hidden_states: + all_hidden_states += (hidden_states,) + + return hidden_states, all_hidden_states, all_self_attns + + +# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->CPMAnt +class CpmAntIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class CpmAntSegmentPositionEmbedding(nn.Module): + def __init__(self, config: CpmAntConfig): + super().__init__() + + self.num_heads = config.num_attention_heads + self.num_buckets = config.position_bias_num_buckets + self.max_distance = config.position_bias_max_distance + self.num_segments = config.segment_types + + self.relative_attention_bias = nn.Parameter( + torch.empty( + config.segment_types * config.segment_types + config.position_bias_num_buckets, + config.num_attention_heads, + ) + ) + + def forward( + self, + key_pos: torch.Tensor, + query_pos: torch.Tensor, + key_segment: torch.Tensor, + query_segment: torch.Tensor, + ): + with torch.no_grad(): + batch = key_pos.size(0) + keylen = key_pos.size(1) + querylen = query_pos.size(1) + + if key_pos.size(0) != query_pos.size(0): + raise AssertionError( + f"key_pos.size(0) should be equal to query_pos.size(0), but got {key_pos.size(0)} and {query_pos.size(0)}!" + ) + if keylen != key_segment.size(1) or querylen != query_segment.size(1): + raise AssertionError( + f"keylen should be equal to key_segment.size(1), but got {keylen} and {key_segment.size(1)}!" + ) + if querylen != query_segment.size(1): + raise AssertionError( + f"querylen should be equal to query_segment.size(1), but got {querylen} and {query_segment.size(1)}!" 
+ ) + + key_pos = key_pos.view(batch, -1, keylen) + query_pos = query_pos.view(batch, querylen, -1) + key_segment = key_segment.view(batch, -1, keylen) + query_segment = query_segment.view(batch, querylen, -1) + + relative_position_bucket = self._segment_relative_position_bucket(query_segment, key_segment) + relative_position_bucket = relative_position_bucket + self.num_buckets + + # (batch, len_q, len_k) + absolute_position_bucket = self._position_bucket( + torch.arange(keylen, dtype=torch.int32, device=relative_position_bucket.device)[None, :] + - torch.arange(querylen, dtype=torch.int32, device=relative_position_bucket.device)[:, None], + num_buckets=self.num_buckets, + max_distance=self.max_distance, + ) + relative_position_bucket = torch.where( + (key_segment == query_segment), + absolute_position_bucket[None, :, :], + relative_position_bucket, + ) + + # (batch, len_q, len_k, num_heads) + embeds = F.embedding(relative_position_bucket, self.relative_attention_bias) + # (batch, num_heads, len_q, len_k) + embeds = embeds.permute(0, 3, 1, 2).contiguous() + return embeds + + def _segment_relative_position_bucket(self, query_segment, key_segment): + return query_segment * self.num_segments + key_segment + + def _position_bucket(self, relative_position, num_buckets=32, max_distance=128): + relative_buckets = 0 + # always bidirectional in CPMAnt + num_buckets //= 2 + relative_buckets = (relative_position > 0).to(torch.int32) * num_buckets + relative_position = torch.abs(relative_position) + max_exact = num_buckets // 2 + is_small = relative_position < max_exact + relative_position_if_large = max_exact + ( + torch.log(relative_position.float() / max_exact) + / math.log(max_distance / max_exact) + * (num_buckets - max_exact) + ).to(torch.int32) + relative_position_if_large = torch.min( + relative_position_if_large, + torch.full_like(relative_position_if_large, num_buckets - 1), + ) + relative_buckets += torch.where(is_small, relative_position.to(torch.int32), relative_position_if_large) + return relative_buckets + + +# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->CPMAnt +class CpmAntOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +@auto_docstring +class CpmAntPreTrainedModel(PreTrainedModel): + config: CpmAntConfig + base_model_prefix = "cpmant" + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=self.config.init_std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.init_std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, CpmAntLayerNorm): + module.weight.data.fill_(1.0) + elif isinstance(module, CpmAntSegmentPositionEmbedding): + module.relative_attention_bias.data.normal_(mean=0.0, std=self.config.init_std) + + 
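The `_segment_relative_position_bucket` and `_position_bucket` helpers above follow the familiar T5-style bucketing: offsets smaller than `max_exact` each get their own bucket, larger offsets are binned logarithmically up to `max_distance`, and half of the buckets are reserved for each direction. The following standalone sketch is for intuition only; the function name `position_bucket` and the demo offsets are illustrative and not part of the model code.

```python
import math

import torch


def position_bucket(relative_position: torch.Tensor, num_buckets: int = 32, max_distance: int = 128) -> torch.Tensor:
    # bidirectional: half of the buckets are reserved for positive (key-after-query) offsets
    num_buckets //= 2
    buckets = (relative_position > 0).to(torch.int32) * num_buckets
    relative_position = torch.abs(relative_position)

    # offsets below max_exact map one-to-one onto buckets
    max_exact = num_buckets // 2
    is_small = relative_position < max_exact

    # larger offsets are spread logarithmically over the remaining buckets, capped at num_buckets - 1
    large = max_exact + (
        torch.log(relative_position.float() / max_exact)
        / math.log(max_distance / max_exact)
        * (num_buckets - max_exact)
    ).to(torch.int32)
    large = torch.min(large, torch.full_like(large, num_buckets - 1))

    return buckets + torch.where(is_small, relative_position.to(torch.int32), large)


offsets = torch.tensor([-100, -8, -1, 0, 1, 8, 100])
print(position_bucket(offsets))  # nearby offsets get distinct buckets, distant ones share coarse buckets
```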
+@auto_docstring +class CpmAntModel(CpmAntPreTrainedModel): + def __init__(self, config: CpmAntConfig): + super().__init__(config) + self.encoder = CpmAntEncoder(config) + self.segment_embedding = nn.Embedding(config.segment_types, config.hidden_size) + self.input_embedding = nn.Embedding( + config.vocab_size + config.prompt_types * config.prompt_length, config.hidden_size + ) + self.position_bias = CpmAntSegmentPositionEmbedding(config) + self.prompt_length = config.prompt_length + self.vocab_size = config.vocab_size + + self.post_init() + + def get_input_embeddings(self): + return self.input_embedding + + def set_input_embeddings(self, embeddings, **kwargs): + self.input_embedding = embeddings + + def _prepare_attention_mask(self, input_ids, span, context, length): + batch = input_ids.size(0) + seqlen = input_ids.size(1) + device = input_ids.device + directional_mask_2d = torch.arange(seqlen, device=device) <= torch.arange(seqlen, device=device).view(-1, 1) + attention_mask = context[:, None, :] | ( + context[:, :, None].logical_not() & directional_mask_2d.view(1, seqlen, seqlen) + ) + attention_mask = attention_mask & (span[:, None, :] == span[:, :, None]) + # mask for left padding + mask_1d = ( + torch.tensor(list(range(seqlen - self.prompt_length))[::-1], device=device)[None, :].repeat(batch, 1) + < length[:, None] + ) + mask_1d = torch.cat((torch.ones(batch, self.prompt_length, device=device).bool(), mask_1d), dim=1) + attention_mask = mask_1d.view(batch, seqlen, 1) & mask_1d.view(batch, 1, seqlen) & attention_mask + return attention_mask + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + past_key_values: Optional[Cache] = None, + use_cache: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.Tensor] = None, + **kwargs, + ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPast]: + r""" + input_ids (`torch.Tensor` of shape `(batch_size, seq_len)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`CPMAntTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. 
+ + [What are input IDs?](../glossary#input-ids) + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + use_cache = use_cache if use_cache is not None else self.config.use_cache + + # add prompts ahead + if input_ids.dtype != torch.int32: + input_ids = input_ids.to(torch.int32) + dtype, device = input_ids.dtype, input_ids.device + segment = torch.where(input_ids != 0, 2, 0).to(dtype=dtype, device=device) + length = (segment != 0).sum(-1).to(dtype=dtype, device=device) + input_ids = torch.cat( + ( + torch.arange( + self.prompt_length * 2 + self.vocab_size, + self.prompt_length * 3 + self.vocab_size, + dtype=dtype, + device=device, + ).repeat(input_ids.size(0), 1), + input_ids, + ), + dim=1, + ) + batch, seq_length = input_ids.size() + segment = torch.cat((torch.zeros(batch, self.prompt_length, dtype=dtype, device=device), segment), dim=1) + context = torch.full((batch, seq_length), 1, dtype=dtype, device=device) + position = torch.arange(seq_length, dtype=dtype, device=device).repeat(batch, 1) + span = torch.full((batch, seq_length), 0, dtype=dtype, device=device) + + if use_cache and past_key_values is None: + past_key_values = DynamicCache(config=self.config) + if use_cache and isinstance(past_key_values, tuple): + logger.warning_once( + "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. " + "You should pass an instance of `DynamicCache` instead, e.g. " + "`past_key_values=DynamicCache.from_legacy_cache(past_key_values)`." + ) + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + + past_length = past_key_values.get_seq_length() if past_key_values is not None else 0 + input_ids = input_ids.contiguous() + hidden_states = self.input_embedding(input_ids) + segment_states = self.segment_embedding(segment) + if past_length != 0: + segment_states = segment_states[:, -1:, :] + + hidden_states = hidden_states + segment_states + + attention_mask = self._prepare_attention_mask(input_ids, span, context, length) + position_bias = self.position_bias(position, position, segment, segment) + + attention_mask = attention_mask[:, past_length:, :] + position_bias = position_bias[:, :, past_length:, :] + hidden_states = hidden_states[:, past_length:, :] + + hidden_states, all_hidden_states, all_attentions = self.encoder( + hidden_states, + attention_mask, + position_bias, + output_attentions, + output_hidden_states, + past_key_values, + use_cache, + cache_position, + ) + + if past_length == 0: + hidden_states = hidden_states[:, self.prompt_length :, :] + # drop the prompt + if all_attentions is not None: + new_attentions = () + for attention in all_attentions: + new_attentions += (attention[:, :, self.prompt_length :, self.prompt_length :],) + all_attentions = new_attentions + if all_hidden_states is not None: + new_hidden_states = () + for hidden_state in all_hidden_states: + new_hidden_states += (hidden_state[:, self.prompt_length :, :],) + all_hidden_states = new_hidden_states + + if not return_dict: + return tuple( + v for v in [hidden_states, past_key_values, all_hidden_states, all_attentions] if v is not None + ) + + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=past_key_values, + hidden_states=all_hidden_states, + 
attentions=all_attentions, + ) + + +@auto_docstring( + custom_intro=""" + The CPMAnt Model with a language modeling head on top (linear layer with weights tied to the input embeddings). + """ +) +class CpmAntForCausalLM(CpmAntPreTrainedModel, GenerationMixin): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config: CpmAntConfig): + super().__init__(config) + self.cpmant = CpmAntModel(config) + + # lm_head.weight is tied to cpmant.input_embedding.weight + self.lm_head = nn.Linear( + config.hidden_size, config.vocab_size + config.prompt_types * config.prompt_length, bias=False + ) + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + labels: Optional[torch.Tensor] = None, + return_dict: Optional[bool] = None, + attention_mask: Optional[torch.Tensor] = None, # dummy parameter for text-generation pipeline + cache_position: Optional[torch.Tensor] = None, + **kwargs, + ) -> Union[tuple, CausalLMOutputWithPast]: + r""" + input_ids (`torch.Tensor` of shape `(batch_size, seq_len)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`CPMAntTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + labels (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. + + Example: + + Text Generation with CpmAntForCausalLM. + ```python + >>> from transformers import CPMAntTokenizer, CpmAntForCausalLM + + >>> texts = "今天天气不错," + >>> model = CpmAntForCausalLM.from_pretrained("openbmb/cpm-ant-10b") + >>> tokenizer = CPMAntTokenizer.from_pretrained("openbmb/cpm-ant-10b") + >>> input_ids = tokenizer(texts, return_tensors="pt") + >>> outputs = model.generate(**input_ids) + >>> output_texts = tokenizer.batch_decode(outputs) + >>> print(output_texts) + ['今天天气不错,阳光明媚,我和妈妈一起去超市买东西。\n在超市里,我看到了一个很好玩的玩具,它的名字叫“机器人”。它有一个圆圆的脑袋,两只圆圆的眼睛,还有一个圆圆的'] + ``` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + model_output = self.cpmant( + input_ids, + output_attentions, + output_hidden_states, + past_key_values, + use_cache, + return_dict, + cache_position, + ) + hidden_states = model_output.last_hidden_state if return_dict else model_output[0] + + logits = self.lm_head(hidden_states) + + loss = None + if labels is not None: + loss_func = CrossEntropyLoss() + loss = loss_func(logits.view(-1, logits.size(-1)), labels.view(-1)) + + if not return_dict: + output = (logits,) + model_output[1:] + return ((loss,) + output) if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=model_output.past_key_values, + hidden_states=model_output.hidden_states, + attentions=model_output.attentions, + ) + + def get_input_embeddings(self): + return self.cpmant.input_embedding + + def set_input_embeddings(self, embeddings): + self.cpmant.input_embedding = embeddings + + def _reorder_cache(self, past_key_values, beam_idx): + past_key_values = [list(each) if each is not None else each for each in past_key_values] + for key_value_layer in past_key_values: + key_value_layer[0] = key_value_layer[0][beam_idx] + key_value_layer[1] = key_value_layer[1][beam_idx] + return past_key_values + + +__all__ = 
["CpmAntForCausalLM", "CpmAntModel", "CpmAntPreTrainedModel"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/cpmant/tokenization_cpmant.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/cpmant/tokenization_cpmant.py new file mode 100644 index 0000000000000000000000000000000000000000..38cd9f0c6a25dd5ace5a51afc31be8ff8b10b3eb --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/cpmant/tokenization_cpmant.py @@ -0,0 +1,272 @@ +# coding=utf-8 +# Copyright 2022 The OpenBMB Team and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for CPMAnt.""" + +import collections +import os +from typing import Optional + +from transformers.utils import is_rjieba_available, requires_backends + + +if is_rjieba_available(): + import rjieba + +from ...tokenization_utils import PreTrainedTokenizer +from ...utils import logging + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} + + +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + with open(vocab_file, "r", encoding="utf-8") as reader: + tokens = reader.readlines() + for index, token in enumerate(tokens): + token = token.rstrip("\n") + vocab[token] = index + return vocab + + +class WordpieceTokenizer: + def __init__(self, vocab, unk_token="", max_input_chars_per_word=200): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, token): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + return [self.unk_token] + + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = "".join(chars[start:end]) + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + sub_tokens.append(self.unk_token) + start += 1 + else: + sub_tokens.append(cur_substr) + start = end + + return sub_tokens + + +class CpmAntTokenizer(PreTrainedTokenizer): + """ + Construct a CPMAnt tokenizer. Based on byte-level Byte-Pair-Encoding. + + Args: + vocab_file (`str`): + Path to the vocabulary file. + bod_token (`str`, *optional*, defaults to `""`): + The beginning of document token. + eod_token (`str`, *optional*, defaults to `""`): + The end of document token. + bos_token (`str`, *optional*, defaults to `""`): + The beginning of sequence token. + eos_token (`str`, *optional*, defaults to `""`): + The end of sequence token. + pad_token (`str`, *optional*, defaults to `""`): + The token used for padding. + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. + line_token (`str`, *optional*, defaults to `""`): + The line token. + space_token (`str`, *optional*, defaults to `""`): + The space token. 
+ """ + + vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "attention_mask"] + add_prefix_space = False + + def __init__( + self, + vocab_file, + bod_token="", + eod_token="", + bos_token="", + eos_token="", + pad_token="", + unk_token="", + line_token="", + space_token="", + padding_side="left", + **kwargs, + ): + requires_backends(self, ["rjieba"]) + self.bod_token = bod_token + self.eod_token = eod_token + self.encoder = load_vocab(vocab_file) + self.encoder[" "] = self.encoder[space_token] + self.encoder["\n"] = self.encoder[line_token] + + del self.encoder[space_token] + del self.encoder[line_token] + + self.encoder = collections.OrderedDict(sorted(self.encoder.items(), key=lambda x: x[1])) + self.decoder = {v: k for k, v in self.encoder.items()} + + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.encoder, unk_token=unk_token) + + super().__init__( + bod_token=bod_token, + eod_token=eod_token, + bos_token=bos_token, + eos_token=eos_token, + pad_token=pad_token, + unk_token=unk_token, + line_token=line_token, + space_token=space_token, + padding_side=padding_side, + **kwargs, + ) + + @property + def bod_token_id(self): + return self.encoder[self.bod_token] + + @property + def eod_token_id(self): + return self.encoder[self.eod_token] + + @property + def newline_id(self): + return self.encoder["\n"] + + @property + def vocab_size(self) -> int: + return len(self.encoder) + + def get_vocab(self): + return dict(self.encoder, **self.added_tokens_encoder) + + def _tokenize(self, text): + """Tokenize a string.""" + output_tokens = [] + for x in rjieba.cut(text, False): + output_tokens.extend(self.wordpiece_tokenizer.tokenize(x)) + return output_tokens + + def _decode(self, token_ids, **kwargs): + """Decode ids into a string.""" + token_ids = [i for i in token_ids if i >= 0] + token_ids = [ + x for x in token_ids if x != self.pad_token_id and x != self.eos_token_id and x != self.bos_token_id + ] + return super()._decode(token_ids, **kwargs) + + def check(self, token): + return token in self.encoder + + def convert_tokens_to_string(self, tokens: list[str]) -> str: + return "".join(tokens) + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.encoder.get(token, self.encoder.get(self.unk_token)) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.decoder.get(index, self.unk_token) + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + else: + vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory + index = 0 + if " " in self.encoder: + self.encoder[""] = self.encoder[" "] + del self.encoder[" "] + if "\n" in self.encoder: + self.encoder[""] = self.encoder["\n"] + del self.encoder["\n"] + self.encoder = collections.OrderedDict(sorted(self.encoder.items(), key=lambda x: x[1])) + with open(vocab_file, "w", encoding="utf-8") as writer: + for token, token_index in self.encoder.items(): + if index != token_index: + logger.warning( + f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive." + " Please check that the vocabulary is not corrupted!" 
+ ) + index = token_index + writer.write(token + "\n") + index += 1 + return (vocab_file,) + + def build_inputs_with_special_tokens( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A CPMAnt sequence has the following format: + + - single sequence: `[BOS] Sequence`. + + Args: + token_ids_0 (`list[int]`): The first tokenized sequence that special tokens will be added. + token_ids_1 (`list[int]`): The optional second tokenized sequence that special tokens will be added. + + Returns: + `list[int]`: The model input with special tokens. + """ + if token_ids_1 is None: + return [self.bos_token_id] + token_ids_0 + return [self.bos_token_id] + token_ids_0 + [self.bos_token_id] + token_ids_1 + + def get_special_tokens_mask( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False + ) -> list[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`list[int]`): List of IDs. + token_ids_1 (`list[int]`, *optional*): Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + return [1] + ([0] * len(token_ids_0)) + + +__all__ = ["CpmAntTokenizer"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/ctrl/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/ctrl/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ea62163babef934550a87c17577891614b071642 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/ctrl/__init__.py @@ -0,0 +1,29 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
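+# Note: as in the library's other model packages, this __init__ only imports the submodules for type
+# checking; at runtime the module is swapped for a `_LazyModule`, so the config, model and tokenizer
+# classes are resolved lazily on first attribute access.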
+from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_ctrl import * + from .modeling_ctrl import * + from .modeling_tf_ctrl import * + from .tokenization_ctrl import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/ctrl/configuration_ctrl.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/ctrl/configuration_ctrl.py new file mode 100644 index 0000000000000000000000000000000000000000..7a812f0b55650fab8ab9a4d1b4c6bad4cd9446f4 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/ctrl/configuration_ctrl.py @@ -0,0 +1,116 @@ +# coding=utf-8 +# Copyright 2018 Salesforce and HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Salesforce CTRL configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class CTRLConfig(PretrainedConfig): + """ + This is the configuration class to store the configuration of a [`CTRLModel`] or a [`TFCTRLModel`]. It is used to + instantiate a CTRL model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the + [Salesforce/ctrl](https://huggingface.co/Salesforce/ctrl) architecture from SalesForce. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 246534): + Vocabulary size of the CTRL model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`CTRLModel`] or [`TFCTRLModel`]. + n_positions (`int`, *optional*, defaults to 256): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + n_embd (`int`, *optional*, defaults to 1280): + Dimensionality of the embeddings and hidden states. + dff (`int`, *optional*, defaults to 8192): + Dimensionality of the inner dimension of the feed forward networks (FFN). + n_layer (`int`, *optional*, defaults to 48): + Number of hidden layers in the Transformer encoder. + n_head (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. + resid_pdrop (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + embd_pdrop (`int`, *optional*, defaults to 0.1): + The dropout ratio for the embeddings. 
+ layer_norm_epsilon (`float`, *optional*, defaults to 1e-06): + The epsilon to use in the layer normalization layers + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). + + + Examples: + + ```python + >>> from transformers import CTRLConfig, CTRLModel + + >>> # Initializing a CTRL configuration + >>> configuration = CTRLConfig() + + >>> # Initializing a model (with random weights) from the configuration + >>> model = CTRLModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "ctrl" + keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = { + "max_position_embeddings": "n_positions", + "hidden_size": "n_embd", + "num_attention_heads": "n_head", + "num_hidden_layers": "n_layer", + } + + def __init__( + self, + vocab_size=246534, + n_positions=256, + n_embd=1280, + dff=8192, + n_layer=48, + n_head=16, + resid_pdrop=0.1, + embd_pdrop=0.1, + layer_norm_epsilon=1e-6, + initializer_range=0.02, + use_cache=True, + **kwargs, + ): + self.vocab_size = vocab_size + self.n_positions = n_positions + self.n_embd = n_embd + self.n_layer = n_layer + self.n_head = n_head + self.dff = dff + self.resid_pdrop = resid_pdrop + self.embd_pdrop = embd_pdrop + self.layer_norm_epsilon = layer_norm_epsilon + self.initializer_range = initializer_range + + self.use_cache = use_cache + + super().__init__(**kwargs) + + +__all__ = ["CTRLConfig"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/ctrl/modeling_ctrl.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/ctrl/modeling_ctrl.py new file mode 100644 index 0000000000000000000000000000000000000000..506bed039b17d4dadfce2755366207f3a2e718f3 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/ctrl/modeling_ctrl.py @@ -0,0 +1,787 @@ +# coding=utf-8 +# Copyright 2018 Salesforce and HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""PyTorch CTRL model.""" + +from typing import Optional, Union + +import numpy as np +import torch +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ...cache_utils import Cache, DynamicCache +from ...generation import GenerationMixin +from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutput +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import Conv1D, find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import ( + auto_docstring, + logging, +) +from .configuration_ctrl import CTRLConfig + + +logger = logging.get_logger(__name__) + + +def angle_defn(pos, i, d_model_size): + angle_rates = 1 / torch.pow(10000, (2 * (i // 2)) / d_model_size) + return pos * angle_rates + + +def positional_encoding(position, d_model_size, dtype): + # create the sinusoidal pattern for the positional encoding + angle_rads = angle_defn( + torch.arange(position, dtype=torch.int64).to(dtype).unsqueeze(1), + torch.arange(d_model_size, dtype=torch.int64).to(dtype).unsqueeze(0), + d_model_size, + ) + + sines = torch.sin(angle_rads[:, 0::2]) + cosines = torch.cos(angle_rads[:, 1::2]) + + pos_encoding = torch.cat([sines, cosines], dim=-1) + return pos_encoding + + +def scaled_dot_product_attention(q, k, v, mask, attention_mask=None, head_mask=None): + # calculate attention + matmul_qk = torch.matmul(q, k.permute(0, 1, 3, 2)) + + dk = k.shape[-1] + scaled_attention_logits = matmul_qk / np.sqrt(dk) + + if mask is not None: + nd, ns = scaled_attention_logits.size(-2), scaled_attention_logits.size(-1) + scaled_attention_logits += mask[ns - nd : ns, :ns] * -1e4 + + if attention_mask is not None: + # Apply the attention mask + scaled_attention_logits = scaled_attention_logits + attention_mask + + attention_weights = torch.softmax(scaled_attention_logits, dim=-1) + + # Mask heads if we want to + if head_mask is not None: + attention_weights = attention_weights * head_mask + + output = torch.matmul(attention_weights, v) + + return output, attention_weights + + +class MultiHeadAttention(nn.Module): + def __init__(self, d_model_size, num_heads, layer_idx=None): + super().__init__() + self.num_heads = num_heads + self.d_model_size = d_model_size + self.layer_idx = layer_idx + + self.depth = int(d_model_size / self.num_heads) + + self.Wq = nn.Linear(d_model_size, d_model_size) + self.Wk = nn.Linear(d_model_size, d_model_size) + self.Wv = nn.Linear(d_model_size, d_model_size) + + self.dense = nn.Linear(d_model_size, d_model_size) + self.pruned_heads = set() + + def prune_heads(self, heads): + attention_head_size = self.d_model_size // self.num_heads + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices(heads, self.num_heads, attention_head_size, self.pruned_heads) + + # Prune linear layers + self.Wq = prune_linear_layer(self.Wq, index) + self.Wk = prune_linear_layer(self.Wk, index) + self.Wv = prune_linear_layer(self.Wv, index) + self.dense = prune_linear_layer(self.dense, index, dim=1) + + # Update hyper params + self.num_heads = self.num_heads - len(heads) + self.d_model_size = attention_head_size * self.num_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def split_into_heads(self, x, batch_size): + x = x.reshape(batch_size, -1, self.num_heads, self.depth) + return x.permute([0, 2, 1, 3]) + + def forward( + self, + v, + k, + q, + mask, + layer_past=None, + attention_mask=None, + head_mask=None, + use_cache=False, + output_attentions=False, + 
cache_position=None, + ): + batch_size = q.shape[0] + + q = self.Wq(q) + k = self.Wk(k) + v = self.Wv(v) + + q = self.split_into_heads(q, batch_size) + k = self.split_into_heads(k, batch_size) + v = self.split_into_heads(v, batch_size) + + if layer_past is not None: + k, v = layer_past.update(k, v, self.layer_idx, {"cache_position": cache_position}) + + output = scaled_dot_product_attention(q, k, v, mask, attention_mask, head_mask) + scaled_attention = output[0].permute([0, 2, 1, 3]) + attn = output[1] + original_size_attention = scaled_attention.reshape(batch_size, -1, self.d_model_size) + output = self.dense(original_size_attention) + return output, attn + + +def point_wise_feed_forward_network(d_model_size, dff): + return nn.Sequential(nn.Linear(d_model_size, dff), nn.ReLU(), nn.Linear(dff, d_model_size)) + + +class EncoderLayer(nn.Module): + def __init__(self, d_model_size, num_heads, dff, rate=0.1, layer_idx=None): + super().__init__() + + self.multi_head_attention = MultiHeadAttention(d_model_size, num_heads, layer_idx=layer_idx) + self.ffn = point_wise_feed_forward_network(d_model_size, dff) + + self.layernorm1 = nn.LayerNorm(d_model_size, eps=1e-6) + self.layernorm2 = nn.LayerNorm(d_model_size, eps=1e-6) + + self.dropout1 = nn.Dropout(rate) + self.dropout2 = nn.Dropout(rate) + + def forward( + self, + x, + mask, + layer_past=None, + attention_mask=None, + head_mask=None, + use_cache=False, + output_attentions=False, + cache_position=None, + ): + normed = self.layernorm1(x) + attn_outputs = self.multi_head_attention( + normed, + normed, + normed, + mask, + layer_past=layer_past, + attention_mask=attention_mask, + head_mask=head_mask, + use_cache=use_cache, + output_attentions=output_attentions, + cache_position=cache_position, + ) + attn_output = attn_outputs[0] + attn_output = self.dropout1(attn_output) + out1 = x + attn_output + + out2 = self.layernorm2(out1) + ffn_output = self.ffn(out2) + ffn_output = self.dropout2(ffn_output) + out2 = out1 + ffn_output + + outputs = (out2,) + attn_outputs[1:] + return outputs + + +@auto_docstring +class CTRLPreTrainedModel(PreTrainedModel): + config: CTRLConfig + base_model_prefix = "transformer" + + def _init_weights(self, module): + """Initialize the weights.""" + if isinstance(module, (nn.Linear, Conv1D)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +@auto_docstring +class CTRLModel(CTRLPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.d_model_size = config.n_embd + self.num_layers = config.n_layer + + self.pos_encoding = positional_encoding(config.n_positions, self.d_model_size, torch.float) + + self.w = nn.Embedding(config.vocab_size, config.n_embd) + + self.dropout = nn.Dropout(config.embd_pdrop) + self.h = nn.ModuleList( + [ + EncoderLayer(config.n_embd, config.n_head, config.dff, config.resid_pdrop, layer_idx=i) + for i in range(config.n_layer) + ] + ) + self.layernorm = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon) + + # Initialize weights and apply final processing 
+ self.post_init() + + def get_input_embeddings(self): + return self.w + + def set_input_embeddings(self, new_embeddings): + self.w = new_embeddings + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} + """ + for layer, heads in heads_to_prune.items(): + self.h[layer].multi_head_attention.prune_heads(heads) + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.Tensor] = None, + **kwargs, # NOOP kwargs, for now + ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPast]: + r""" + input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`): + `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values[0].shape[-2]` + (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary. + + If `past_key_values` is used, only input IDs that do not have their past calculated should be passed as + `input_ids`. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and + [`PreTrainedTokenizer.encode`] for details. + + [What are input IDs?](../glossary#input-ids) + + Example: + + ```python + >>> from transformers import AutoTokenizer, CTRLModel + >>> import torch + + >>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl") + >>> model = CTRLModel.from_pretrained("Salesforce/ctrl") + + >>> # CTRL was trained with control codes as the first token + >>> inputs = tokenizer("Opinion My dog is cute", return_tensors="pt") + >>> assert inputs["input_ids"][0, 0].item() in tokenizer.control_codes.values() + + >>> outputs = model(**inputs) + + >>> last_hidden_states = outputs.last_hidden_state + >>> list(last_hidden_states.shape) + [1, 5, 1280] + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + use_cache = use_cache if use_cache is not None else self.config.use_cache + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + batch_size = input_ids.shape[0] + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + batch_size = inputs_embeds.shape[0] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + device = input_ids.device if input_ids is not None else inputs_embeds.device + + if use_cache and past_key_values is None: + past_key_values = DynamicCache(config=self.config) + if use_cache and isinstance(past_key_values, tuple): + logger.warning_once( + 
"Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. " + "You should pass an instance of `DynamicCache` instead, e.g. " + "`past_key_values=DynamicCache.from_legacy_cache(past_key_values)`." + ) + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + + past_length = past_key_values.get_seq_length() if past_key_values is not None else 0 + if position_ids is None: + position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device) + position_ids = position_ids.unsqueeze(0) + + # Attention mask. + if attention_mask is not None: + if batch_size <= 0: + raise ValueError("batch_size has to be defined and > 0") + attention_mask = attention_mask.view(batch_size, -1) + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. + attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and the dtype's smallest value for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + attention_mask = attention_mask.to(dtype=self.dtype) # fp16 compatibility + attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min + + # Prepare head mask if needed + head_mask = self.get_head_mask(head_mask, self.config.n_layer) + + if token_type_ids is not None: + token_type_ids = token_type_ids.view(-1, input_shape[-1]) + token_type_embeds = self.w(token_type_ids) + token_type_embeds *= np.sqrt(self.d_model_size) + else: + token_type_embeds = 0 + + if inputs_embeds is None: + inputs_embeds = self.w(input_ids) + # inputs_embeds = embedded.unsqueeze(0) if len(input_ids.shape)<2 else embedded + seq_len = input_shape[-1] + mask = torch.triu(torch.ones(seq_len + past_length, seq_len + past_length), 1).to(device) + + inputs_embeds *= np.sqrt(self.d_model_size) + + # `self.pos_encoding` won't be sent to the correct device along the model, so we do it manually. 
+ self.pos_encoding = self.pos_encoding.to(device) + pos_embeds = self.pos_encoding[position_ids, :] + + hidden_states = inputs_embeds + pos_embeds + token_type_embeds + + hidden_states = self.dropout(hidden_states) + + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + for i, h in enumerate(self.h): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + outputs = h( + hidden_states, + mask, + layer_past=past_key_values, + attention_mask=attention_mask, + head_mask=head_mask[i], + use_cache=use_cache, + output_attentions=output_attentions, + cache_position=cache_position, + ) + hidden_states = outputs[0] + if output_attentions: + all_attentions += (outputs[1],) + + hidden_states = self.layernorm(hidden_states) + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v for v in [hidden_states, past_key_values, all_hidden_states, all_attentions] if v is not None + ) + + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=past_key_values, + hidden_states=all_hidden_states, + attentions=all_attentions, + ) + + +@auto_docstring( + custom_intro=""" + The CTRL Model transformer with a language modeling head on top (linear layer with weights tied to the input + embeddings). + """ +) +class CTRLLMHeadModel(CTRLPreTrainedModel, GenerationMixin): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.transformer = CTRLModel(config) + self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=True) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.Tensor] = None, + **kwargs, + ) -> Union[tuple[torch.Tensor], CausalLMOutputWithPast]: + r""" + input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`): + `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values[0].shape[-2]` + (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary. + + If `past_key_values` is used, only input IDs that do not have their past calculated should be passed as + `input_ids`. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and + [`PreTrainedTokenizer.encode`] for details. + + [What are input IDs?](../glossary#input-ids) + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. 
you can set + `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` + are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` + + Example: + + ```python + >>> import torch + >>> from transformers import AutoTokenizer, CTRLLMHeadModel + + >>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl") + >>> model = CTRLLMHeadModel.from_pretrained("Salesforce/ctrl") + + >>> # CTRL was trained with control codes as the first token + >>> inputs = tokenizer("Wikipedia The llama is", return_tensors="pt") + >>> assert inputs["input_ids"][0, 0].item() in tokenizer.control_codes.values() + + >>> sequence_ids = model.generate(inputs["input_ids"]) + >>> sequences = tokenizer.batch_decode(sequence_ids) + >>> sequences + ['Wikipedia The llama is a member of the family Bovidae. It is native to the Andes of Peru,'] + + >>> outputs = model(**inputs, labels=inputs["input_ids"]) + >>> round(outputs.loss.item(), 2) + 9.21 + + >>> list(outputs.logits.shape) + [1, 5, 246534] + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + ) + + hidden_states = transformer_outputs[0] + + lm_logits = self.lm_head(hidden_states) + + loss = None + if labels is not None: + loss = self.loss_function( + lm_logits, + labels, + vocab_size=self.config.vocab_size, + **kwargs, + ) + + if not return_dict: + output = (lm_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=lm_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + def prepare_inputs_for_generation(self, input_ids, past_key_values=None, use_cache=None, **kwargs): + # Overwritten -- inputs_embeds not working properly + + # only last tokens for inputs_ids if past is defined in kwargs + if past_key_values is not None: + past_length = past_key_values.get_seq_length() + + # Some generation methods already pass only the last input ID + if input_ids.shape[1] > past_length: + remove_prefix_length = past_length + else: + # Default to old behavior: keep only final ID + remove_prefix_length = input_ids.shape[1] - 1 + + input_ids = input_ids[:, remove_prefix_length:] + + model_inputs = {"input_ids": input_ids, "past_key_values": past_key_values, "use_cache": use_cache} + + # token_type_ids are computed on CTRLModel.forward() + kwargs.pop("token_type_ids", None) + # Forward ALL kwargs that are uninitialized (e.g. `use_cache`). + for key, value in kwargs.items(): + if key not in model_inputs: + print(f"Warning: {key} is not a recognized input.") + model_inputs[key] = value + + return model_inputs + + +@auto_docstring( + custom_intro=""" + The CTRL Model transformer with a sequence classification head on top (linear layer). + [`CTRLForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-2) do. 
Since it does classification on the last token, it requires to know the position of the last + token. If a `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in + each row. If no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot + guess the padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last + value in each row of the batch). + """ +) +class CTRLForSequenceClassification(CTRLPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.transformer = CTRLModel(config) + self.classifier = nn.Linear(config.n_embd, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], SequenceClassifierOutput]: + r""" + input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`): + `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values[0].shape[-2]` + (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary. + + If `past_key_values` is used, only input IDs that do not have their past calculated should be passed as + `input_ids`. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and + [`PreTrainedTokenizer.encode`] for details. + + [What are input IDs?](../glossary#input-ids) + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + + Example of single-label classification: + + ```python + >>> import torch + >>> from transformers import AutoTokenizer, CTRLForSequenceClassification + + >>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl") + >>> model = CTRLForSequenceClassification.from_pretrained("Salesforce/ctrl") + + >>> # CTRL was trained with control codes as the first token + >>> inputs = tokenizer("Opinion My dog is cute", return_tensors="pt") + >>> assert inputs["input_ids"][0, 0].item() in tokenizer.control_codes.values() + + >>> with torch.no_grad(): + ... 
logits = model(**inputs).logits + + >>> predicted_class_id = logits.argmax().item() + >>> model.config.id2label[predicted_class_id] + 'LABEL_0' + ``` + + ```python + >>> import torch + + >>> torch.manual_seed(42) # doctest: +IGNORE_RESULT + >>> # To train a model on `num_labels` classes, you can pass `num_labels=num_labels` to `.from_pretrained(...)` + >>> num_labels = len(model.config.id2label) + >>> model = CTRLForSequenceClassification.from_pretrained("Salesforce/ctrl", num_labels=num_labels) + + >>> labels = torch.tensor(1) + >>> loss = model(**inputs, labels=labels).loss + >>> round(loss.item(), 2) + 0.93 + ``` + + Example of multi-label classification: + + ```python + >>> import torch + >>> from transformers import AutoTokenizer, CTRLForSequenceClassification + + >>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl") + >>> model = CTRLForSequenceClassification.from_pretrained( + ... "Salesforce/ctrl", problem_type="multi_label_classification" + ... ) + + >>> # CTRL was trained with control codes as the first token + >>> inputs = tokenizer("Opinion My dog is cute", return_tensors="pt") + >>> assert inputs["input_ids"][0, 0].item() in tokenizer.control_codes.values() + + >>> with torch.no_grad(): + ... logits = model(**inputs).logits + + >>> predicted_class_id = logits.argmax().item() + >>> model.config.id2label[predicted_class_id] + 'LABEL_0' + ``` + + ```python + >>> # To train a model on `num_labels` classes, you can pass `num_labels=num_labels` to `.from_pretrained(...)` + >>> num_labels = len(model.config.id2label) + >>> model = CTRLForSequenceClassification.from_pretrained("Salesforce/ctrl", num_labels=num_labels) + + >>> num_labels = len(model.config.id2label) + >>> labels = torch.nn.functional.one_hot(torch.tensor([predicted_class_id]), num_classes=num_labels).to( + ... torch.float + ... ) + >>> loss = model(**inputs, labels=labels).loss + >>> loss.backward() # doctest: +IGNORE_RESULT + ```""" + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = transformer_outputs[0] + logits = self.classifier(hidden_states) + + if input_ids is not None: + batch_size, sequence_length = input_ids.shape[:2] + else: + batch_size, sequence_length = inputs_embeds.shape[:2] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + last_non_pad_token = -1 + elif input_ids is not None: + # To handle both left- and right- padding, we take the rightmost token that is not equal to pad_token_id + non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32) + token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32) + last_non_pad_token = (token_indices * non_pad_mask).argmax(-1) + else: + last_non_pad_token = -1 + logger.warning_once( + f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. 
Results may be " + "unexpected if using padding tokens in conjunction with `inputs_embeds.`" + ) + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token] + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[2:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=pooled_logits, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + +__all__ = ["CTRLForSequenceClassification", "CTRLLMHeadModel", "CTRLModel", "CTRLPreTrainedModel"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/ctrl/modeling_tf_ctrl.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/ctrl/modeling_tf_ctrl.py new file mode 100644 index 0000000000000000000000000000000000000000..1dce90147bd854b5072410eb6d6e74f73c0a9fd0 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/ctrl/modeling_tf_ctrl.py @@ -0,0 +1,920 @@ +# coding=utf-8 +# Copyright 2018 Salesforce and HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""TF 2.0 CTRL model.""" + +from __future__ import annotations + +import numpy as np +import tensorflow as tf + +from ...modeling_tf_outputs import TFBaseModelOutputWithPast, TFCausalLMOutputWithPast, TFSequenceClassifierOutput +from ...modeling_tf_utils import ( + TFCausalLanguageModelingLoss, + TFModelInputType, + TFPreTrainedModel, + TFSequenceClassificationLoss, + get_initializer, + keras, + keras_serializable, + unpack_inputs, +) +from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax +from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging +from .configuration_ctrl import CTRLConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "Salesforce/ctrl" +_CONFIG_FOR_DOC = "CTRLConfig" + + +def angle_defn(pos, i, d_model_size): + angle_rates = 1 / np.power(10000, (2 * (i // 2)) / d_model_size) + return pos * angle_rates + + +def positional_encoding(position, d_model_size): + # create the sinusoidal pattern for the positional encoding + angle_rads = angle_defn(np.arange(position)[:, np.newaxis], np.arange(d_model_size)[np.newaxis, :], d_model_size) + + sines = np.sin(angle_rads[:, 0::2]) + cosines = np.cos(angle_rads[:, 1::2]) + pos_encoding = tf.convert_to_tensor(np.concatenate([sines, cosines], axis=-1)) + + return pos_encoding + + +def scaled_dot_product_attention(q, k, v, mask, attention_mask=None, head_mask=None): + # calculate attention + matmul_qk = tf.matmul(q, k, transpose_b=True) + + dk = tf.cast(shape_list(k)[-1], dtype=matmul_qk.dtype) + scaled_attention_logits = matmul_qk / tf.math.sqrt(dk) + + if mask is not None: + scaled_attention_logits += tf.cast(mask * -1e4, dtype=scaled_attention_logits.dtype) + + if attention_mask is not None: + # Apply the attention mask + attention_mask = tf.cast(attention_mask, dtype=scaled_attention_logits.dtype) + scaled_attention_logits = scaled_attention_logits + attention_mask + + attention_weights = stable_softmax(scaled_attention_logits, axis=-1) + + # Mask heads if we want to + if head_mask is not None: + attention_weights = attention_weights * head_mask + + output = tf.matmul(attention_weights, v) + + return output, attention_weights + + +class TFMultiHeadAttention(keras.layers.Layer): + def __init__(self, d_model_size, num_heads, output_attentions=False, **kwargs): + super().__init__(**kwargs) + self.num_heads = num_heads + self.d_model_size = d_model_size + self.output_attentions = output_attentions + + self.depth = int(d_model_size / self.num_heads) + + self.Wq = keras.layers.Dense(d_model_size, name="Wq") + self.Wk = keras.layers.Dense(d_model_size, name="Wk") + self.Wv = keras.layers.Dense(d_model_size, name="Wv") + + self.dense = keras.layers.Dense(d_model_size, name="dense") + + def split_into_heads(self, x, batch_size): + x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth)) + return tf.transpose(x, perm=[0, 2, 1, 3]) + + def call(self, v, k, q, mask, layer_past, attention_mask, head_mask, use_cache, output_attentions, training=False): + batch_size = shape_list(q)[0] + + q = self.Wq(q) + k = self.Wk(k) + v = self.Wv(v) + + q = self.split_into_heads(q, batch_size) + k = self.split_into_heads(k, batch_size) + v = self.split_into_heads(v, batch_size) + + if layer_past is not None: + past_key, past_value = tf.unstack(layer_past, axis=0) + k = tf.concat((past_key, k), axis=-2) + v = tf.concat((past_value, v), axis=-2) + + if use_cache: + present = tf.stack((k, v), axis=0) + else: + present = (None,) + + output = 
scaled_dot_product_attention(q, k, v, mask, attention_mask, head_mask) + scaled_attention = tf.transpose(output[0], perm=[0, 2, 1, 3]) + attn = output[1] + original_size_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model_size)) + output = self.dense(original_size_attention) + outputs = (output, present) + + if output_attentions: + outputs = outputs + (attn,) + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "Wq", None) is not None: + with tf.name_scope(self.Wq.name): + self.Wq.build([None, None, self.d_model_size]) + if getattr(self, "Wk", None) is not None: + with tf.name_scope(self.Wk.name): + self.Wk.build([None, None, self.d_model_size]) + if getattr(self, "Wv", None) is not None: + with tf.name_scope(self.Wv.name): + self.Wv.build([None, None, self.d_model_size]) + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.d_model_size]) + + +class TFPointWiseFeedForwardLayer(keras.layers.Layer): + def __init__(self, d_model_size, dff, **kwargs): + super().__init__(**kwargs) + + self.dense_0 = keras.layers.Dense(dff, activation="relu", name="0") + self.dense_2 = keras.layers.Dense(d_model_size, name="2") + self.d_model_size = d_model_size + self.dff = dff + + def call(self, inputs, trainable=False): + dense_0_output = self.dense_0(inputs) + dense_2_output = self.dense_2(dense_0_output) + + return dense_2_output + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense_0", None) is not None: + with tf.name_scope(self.dense_0.name): + self.dense_0.build([None, None, self.d_model_size]) + if getattr(self, "dense_2", None) is not None: + with tf.name_scope(self.dense_2.name): + self.dense_2.build([None, None, self.dff]) + + +class TFEncoderLayer(keras.layers.Layer): + def __init__( + self, d_model_size, num_heads, dff, rate=0.1, layer_norm_epsilon=1e-6, output_attentions=False, **kwargs + ): + super().__init__(**kwargs) + + self.output_attentions = output_attentions + + self.multi_head_attention = TFMultiHeadAttention( + d_model_size, num_heads, output_attentions=self.output_attentions, name="multi_head_attention" + ) + self.ffn = TFPointWiseFeedForwardLayer(d_model_size, dff, name="ffn") + + self.layernorm1 = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layernorm1") + self.layernorm2 = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layernorm2") + + self.dropout1 = keras.layers.Dropout(rate) + self.dropout2 = keras.layers.Dropout(rate) + self.d_model_size = d_model_size + + def call(self, x, mask, layer_past, attention_mask, head_mask, use_cache, output_attentions, training=False): + normed = self.layernorm1(x) + attn_outputs = self.multi_head_attention( + normed, + normed, + normed, + mask, + layer_past, + attention_mask, + head_mask, + use_cache, + output_attentions, + training=training, + ) + attn_output = attn_outputs[0] + attn_output = self.dropout1(attn_output, training=training) + out1 = x + attn_output + + out2 = self.layernorm2(out1) + ffn_output = self.ffn(out2) + ffn_output = self.dropout2(ffn_output, training=training) + out2 = out1 + ffn_output + + outputs = (out2,) + attn_outputs[1:] + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "multi_head_attention", None) is not None: + with tf.name_scope(self.multi_head_attention.name): + 
self.multi_head_attention.build(None) + if getattr(self, "ffn", None) is not None: + with tf.name_scope(self.ffn.name): + self.ffn.build(None) + if getattr(self, "layernorm1", None) is not None: + with tf.name_scope(self.layernorm1.name): + self.layernorm1.build([None, None, self.d_model_size]) + if getattr(self, "layernorm2", None) is not None: + with tf.name_scope(self.layernorm2.name): + self.layernorm2.build([None, None, self.d_model_size]) + + +@keras_serializable +class TFCTRLMainLayer(keras.layers.Layer): + config_class = CTRLConfig + + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + + self.config = config + self.output_hidden_states = config.output_hidden_states + self.output_attentions = config.output_attentions + self.use_cache = config.use_cache + self.return_dict = config.use_return_dict + + self.d_model_size = config.n_embd + self.num_layers = config.n_layer + + self.pos_encoding = positional_encoding(config.n_positions, self.d_model_size) + + self.w = keras.layers.Embedding( + input_dim=config.vocab_size, + output_dim=config.n_embd, + embeddings_initializer=get_initializer(config.initializer_range), + name="w", + ) + + self.dropout = keras.layers.Dropout(config.embd_pdrop) + self.h = [ + TFEncoderLayer( + config.n_embd, + config.n_head, + config.dff, + config.resid_pdrop, + config.layer_norm_epsilon, + self.output_attentions, + name=f"h_._{i}", + ) + for i in range(config.n_layer) + ] + self.layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="layernorm") + + def get_input_embeddings(self): + return self.w + + def set_input_embeddings(self, new_embeddings): + self.w = new_embeddings + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} + """ + raise NotImplementedError + + @unpack_inputs + def call( + self, + input_ids: TFModelInputType | None = None, + past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + use_cache: bool | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool | None = False, + ) -> tuple | TFBaseModelOutputWithPast: + # If using past key value states, only the last tokens + # should be given as an input + if past_key_values is not None: + if input_ids is not None: + input_ids = input_ids[:, -1:] + if inputs_embeds is not None: + inputs_embeds = inputs_embeds[:, -1:] + if token_type_ids is not None: + token_type_ids = token_type_ids[:, -1:] + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = shape_list(input_ids) + input_ids = tf.reshape(input_ids, [-1, input_shape[-1]]) + elif inputs_embeds is not None: + input_shape = shape_list(inputs_embeds)[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if past_key_values is None: + past_length = 0 + past_key_values = [None] * len(self.h) + else: + past_length = shape_list(past_key_values[0][0])[-2] + if position_ids is None: + position_ids = tf.expand_dims(tf.range(past_length, input_shape[-1] + 
past_length, dtype=tf.int32), axis=0) + position_ids = tf.tile(position_ids, [input_shape[0], 1]) + + # Attention mask. + if attention_mask is not None: + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. + attention_mask = tf.reshape(attention_mask, (input_shape[0], 1, 1, input_shape[1] + past_length)) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + + one_cst = tf.constant(1.0) + ten_thousand_cst = tf.constant(-10000.0) + attention_mask = tf.cast(attention_mask, dtype=one_cst.dtype) + attention_mask = tf.multiply(tf.subtract(one_cst, attention_mask), ten_thousand_cst) + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # head_mask has shape n_layer x batch x n_heads x N x N + if head_mask is not None: + raise NotImplementedError + else: + head_mask = [None] * self.num_layers + + if token_type_ids is not None: + token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]]) + token_type_embeds = self.w(token_type_ids) + token_type_embeds *= tf.math.sqrt(tf.cast(self.d_model_size, dtype=token_type_embeds.dtype)) + else: + token_type_embeds = tf.constant(0.0) + position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]]) + + if inputs_embeds is None: + check_embeddings_within_bounds(input_ids, self.w.input_dim) + inputs_embeds = self.w(input_ids) + seq_len = input_shape[-1] + mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0) + + inputs_embeds *= tf.math.sqrt(tf.cast(self.d_model_size, inputs_embeds.dtype)) + + pos_embeds = tf.gather(self.pos_encoding, position_ids) + pos_embeds = tf.cast(pos_embeds, dtype=token_type_embeds.dtype) + hidden_states = inputs_embeds + pos_embeds + token_type_embeds + + hidden_states = self.dropout(hidden_states, training=training) + + output_shape = input_shape + [shape_list(hidden_states)[-1]] + presents = () if use_cache else None + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + for i, (h, layer_past) in enumerate(zip(self.h, past_key_values)): + if output_hidden_states: + all_hidden_states = all_hidden_states + (tf.reshape(hidden_states, output_shape),) + outputs = h( + hidden_states, + mask, + layer_past, + attention_mask, + head_mask[i], + use_cache, + output_attentions, + training=training, + ) + hidden_states, present = outputs[:2] + + if use_cache: + presents = presents + (present,) + + if output_attentions: + all_attentions = all_attentions + (outputs[2],) + + hidden_states = self.layernorm(hidden_states) + hidden_states = tf.reshape(hidden_states, output_shape) + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if output_attentions: + # let the number of heads free (-1) so we can extract attention even after head pruning + attention_output_shape = input_shape[:-1] + [-1] + shape_list(all_attentions[0])[-2:] + all_attentions = 
tuple(tf.reshape(t, attention_output_shape) for t in all_attentions) + + if not return_dict: + return tuple(v for v in [hidden_states, presents, all_hidden_states, all_attentions] if v is not None) + + return TFBaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=presents, + hidden_states=all_hidden_states, + attentions=all_attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "w", None) is not None: + with tf.name_scope(self.w.name): + self.w.build(None) + if getattr(self, "layernorm", None) is not None: + with tf.name_scope(self.layernorm.name): + self.layernorm.build([None, None, self.config.n_embd]) + if getattr(self, "h", None) is not None: + for layer in self.h: + with tf.name_scope(layer.name): + layer.build(None) + + +class TFCTRLPreTrainedModel(TFPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = CTRLConfig + base_model_prefix = "transformer" + + +CTRL_START_DOCSTRING = r""" + + This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and + behavior. + + + + TensorFlow models and layers in `transformers` accept two formats as input: + + - having all inputs as keyword arguments (like PyTorch models), or + - having all inputs as a list, tuple or dict in the first positional argument. + + The reason the second format is supported is that Keras methods prefer this format when passing inputs to models + and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just + pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second + format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with + the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first + positional argument: + + - a single Tensor with `input_ids` only and nothing else: `model(input_ids)` + - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: + `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])` + - a dictionary with one or several input Tensors associated to the input names given in the docstring: + `model({"input_ids": input_ids, "token_type_ids": token_type_ids})` + + Note that when creating models and layers with + [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry + about any of this, as you can just pass inputs like you would to any other Python function! + + + + Parameters: + config ([`CTRLConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
+""" + +CTRL_INPUTS_DOCSTRING = r""" + Args: + input_ids (`Numpy array` or `tf.Tensor` of shape `(batch_size, input_ids_length)`): + `input_ids_length` = `sequence_length` if `past` is `None` else `past[0].shape[-2]` (`sequence_length` of + input past key value states). + + Indices of input sequence tokens in the vocabulary. + + If `past` is used, only input IDs that do not have their past calculated should be passed as `input_ids`. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and + [`PreTrainedTokenizer.encode`] for details. + + [What are input IDs?](../glossary#input-ids) + past (`list[tf.Tensor]` of length `config.n_layers`): + Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model (see + `past` output below). Can be used to speed up sequential decoding. The token ids which have their past + given to this model should not be passed as input ids as they have already been computed. + attention_mask (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + token_type_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, input_ids_length)`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, input_ids_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`tf.Tensor` or `Numpy array` of shape `(batch_size, input_ids_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past` key value states are returned and can be used to speed up decoding (see `past`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the + config will be used instead. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. This argument can be used only in eager mode, in graph mode the value in the config will be + used instead. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
This argument can be used in + eager mode, in graph mode the value will always be set to True. + training (`bool`, *optional*, defaults to `False`): + Whether or not to use the model in training mode (some modules like dropout modules have different + behaviors between training and evaluation). +""" + + +@add_start_docstrings( + "The bare CTRL Model transformer outputting raw hidden-states without any specific head on top.", + CTRL_START_DOCSTRING, +) +class TFCTRLModel(TFCTRLPreTrainedModel): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.transformer = TFCTRLMainLayer(config, name="transformer") + + @unpack_inputs + @add_start_docstrings_to_model_forward(CTRL_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFBaseModelOutputWithPast, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + use_cache: bool | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool | None = False, + ) -> tuple | TFBaseModelOutputWithPast: + outputs = self.transformer( + input_ids=input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "transformer", None) is not None: + with tf.name_scope(self.transformer.name): + self.transformer.build(None) + + +class TFCTRLBiasLayer(keras.layers.Layer): + """ + Bias as a layer. It is used for serialization purposes: `keras.Model.save_weights` stores on a per-layer basis, + so all weights have to be registered in a layer. + """ + + def __init__(self, shape, initializer, trainable, name, **kwargs): + super().__init__(name=name, **kwargs) + self.shape = shape + self.initializer = initializer + self.trainable = trainable + + def build(self, input_shape): + self.bias = self.add_weight( + name="bias", shape=self.shape, initializer=self.initializer, trainable=self.trainable + ) + super().build(input_shape) + + def call(self, x): + return x + self.bias + + +@add_start_docstrings( + """ + The CTRL Model transformer with a language modeling head on top (linear layer with weights tied to the input + embeddings). 
+ """, + CTRL_START_DOCSTRING, +) +class TFCTRLLMHeadModel(TFCTRLPreTrainedModel, TFCausalLanguageModelingLoss): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.transformer = TFCTRLMainLayer(config, name="transformer") + self.bias_layer = TFCTRLBiasLayer( + name="lm_head", shape=[1, config.vocab_size], initializer="zeros", trainable=True + ) + + def get_output_embeddings(self): + return self.get_input_embeddings() + + def set_output_embeddings(self, value): + self.set_input_embeddings(value) + + def get_bias(self): + return {"lm_head.bias": self.bias_layer.bias} + + def set_bias(self, value): + # Replaces the existing layers containing bias for correct (de)serialization. + vocab_size = value["lm_head.bias"].shape[-1] + self.bias_layer = TFCTRLBiasLayer( + name="final_logits_bias", shape=[1, vocab_size], initializer="zeros", trainable=True + ) + self.bias_layer.build(None) + self.bias_layer.bias.assign(value["lm_head.bias"]) + + # Copied from transformers.models.gpt2.modeling_tf_gpt2.TFGPT2LMHeadModel.prepare_inputs_for_generation + def prepare_inputs_for_generation(self, inputs, past_key_values=None, use_cache=None, **kwargs): + token_type_ids = kwargs.get("token_type_ids") + # only last token for inputs_ids if past is defined in kwargs + if past_key_values: + inputs = tf.expand_dims(inputs[:, -1], -1) + if token_type_ids is not None: + token_type_ids = tf.expand_dims(token_type_ids[:, -1], -1) + + position_ids = kwargs.get("position_ids") + attention_mask = kwargs.get("attention_mask") + + if attention_mask is not None and position_ids is None: + position_ids = tf.math.cumsum(attention_mask, axis=-1, exclusive=True) + if past_key_values: + position_ids = tf.expand_dims(position_ids[:, -1], -1) + + return { + "input_ids": inputs, + "attention_mask": attention_mask, + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": use_cache, + "token_type_ids": token_type_ids, + } + + @unpack_inputs + @add_start_docstrings_to_model_forward(CTRL_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFCausalLMOutputWithPast, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + use_cache: bool | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + labels: np.ndarray | tf.Tensor | None = None, + training: bool | None = False, + ) -> tuple | TFCausalLMOutputWithPast: + r""" + labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the cross entropy classification loss. Indices should be in `[0, ..., + config.vocab_size - 1]`. 
+ """ + transformer_outputs = self.transformer( + input_ids=input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + hidden_states = transformer_outputs[0] + logits = tf.matmul(hidden_states, self.transformer.w.weights, transpose_b=True) + logits = self.bias_layer(logits) + + loss = None + if labels is not None: + # shift labels to the left and cut last logit token + shifted_logits = logits[:, :-1] + labels = labels[:, 1:] + loss = self.hf_compute_loss(labels, shifted_logits) + + if not return_dict: + output = (logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return TFCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "transformer", None) is not None: + with tf.name_scope(self.transformer.name): + self.transformer.build(None) + if getattr(self, "bias_layer", None) is not None: + with tf.name_scope(self.bias_layer.name): + self.bias_layer.build(None) + + +@add_start_docstrings( + """ + The CTRL Model transformer with a sequence classification head on top (linear layer). + + [`TFCTRLForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-1, GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). + """, + CTRL_START_DOCSTRING, +) +class TFCTRLForSequenceClassification(TFCTRLPreTrainedModel, TFSequenceClassificationLoss): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.num_labels = config.num_labels + self.classifier = keras.layers.Dense( + config.num_labels, + kernel_initializer=get_initializer(config.initializer_range), + name="classifier", + use_bias=False, + ) + self.transformer = TFCTRLMainLayer(config, name="transformer") + self.config = config + + def get_output_embeddings(self): + # Remove after transformers v4.32. Fix this model's `test_model_common_attributes` test too. + logger.warning( + "Sequence classification models do not have output embeddings. `.get_output_embeddings` will be removed " + "in transformers v4.32." 
+ ) + return self.transformer.w + + @unpack_inputs + @add_start_docstrings_to_model_forward(CTRL_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFSequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + use_cache: bool | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + labels: np.ndarray | tf.Tensor | None = None, + training: bool | None = False, + ) -> tuple | TFSequenceClassifierOutput: + r""" + labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the cross entropy classification loss. Indices should be in `[0, ..., + config.vocab_size - 1]`. + """ + + transformer_outputs = self.transformer( + input_ids=input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + hidden_states = transformer_outputs[0] + logits = self.classifier(hidden_states) + logits_shape = shape_list(logits) + batch_size = logits_shape[0] + + if self.config.pad_token_id is None: + last_non_pad_token = tf.fill((batch_size,), value=logits_shape[1] - 1) + else: + if input_ids is not None: + token_indices = tf.range(shape_list(input_ids)[-1]) + non_pad_mask = tf.cast(input_ids != self.config.pad_token_id, token_indices.dtype) + last_non_pad_token = tf.reduce_max(token_indices * non_pad_mask, axis=-1) + else: + last_non_pad_token = tf.fill((batch_size,), value=logits_shape[1] - 1) + logger.warning_once( + f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. 
Results may be " + "unexpected if using padding tokens in conjunction with `inputs_embeds.`" + ) + loss = None + + pooled_logits = tf.gather(logits, last_non_pad_token, batch_dims=1, axis=1) + + if labels is not None: + if self.config.pad_token_id is None and logits_shape[0] != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + + loss = self.hf_compute_loss(tf.reshape(labels, [-1]), tf.reshape(pooled_logits, [-1, self.num_labels])) + + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return TFSequenceClassifierOutput( + loss=loss, + logits=pooled_logits, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build([None, None, self.config.n_embd]) + if getattr(self, "transformer", None) is not None: + with tf.name_scope(self.transformer.name): + self.transformer.build(None) + + +__all__ = ["TFCTRLForSequenceClassification", "TFCTRLLMHeadModel", "TFCTRLModel", "TFCTRLPreTrainedModel"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/ctrl/tokenization_ctrl.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/ctrl/tokenization_ctrl.py new file mode 100644 index 0000000000000000000000000000000000000000..5b7935e6404d153e67f89d5c24c4bfd04936a180 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/ctrl/tokenization_ctrl.py @@ -0,0 +1,251 @@ +# coding=utf-8 +# Copyright 2018 Salesforce and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
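# --- Illustrative sketch (editor's note, not part of the library files) -----
# TFCTRLMainLayer above builds its causal mask as
# 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0): band_part with
# num_lower=-1 and num_upper=0 keeps the lower triangle (current and past
# positions), so subtracting from 1 leaves ones exactly on the future
# positions, which then receive the -1e4 penalty inside
# scaled_dot_product_attention. Small demo for seq_len = 3:
import tensorflow as tf

mask = 1 - tf.linalg.band_part(tf.ones((3, 3)), -1, 0)
print(mask.numpy())
# [[0. 1. 1.]
#  [0. 0. 1.]
#  [0. 0. 0.]]
# ---------------------------------------------------------------------------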
+"""Tokenization classes for Salesforce CTRL.""" + +import json +import os +from typing import Optional + +import regex as re + +from ...tokenization_utils import PreTrainedTokenizer +from ...utils import logging + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = { + "vocab_file": "vocab.json", + "merges_file": "merges.txt", +} + + +CONTROL_CODES = { + "Pregnancy": 168629, + "Christianity": 7675, + "Explain": 106423, + "Fitness": 63440, + "Saving": 63163, + "Ask": 27171, + "Ass": 95985, + "Joke": 163509, + "Questions": 45622, + "Thoughts": 49605, + "Retail": 52342, + "Feminism": 164338, + "Writing": 11992, + "Atheism": 192263, + "Netflix": 48616, + "Computing": 39639, + "Opinion": 43213, + "Alone": 44967, + "Funny": 58917, + "Gaming": 40358, + "Human": 4088, + "India": 1331, + "Joker": 77138, + "Diet": 36206, + "Legal": 11859, + "Norman": 4939, + "Tip": 72689, + "Weight": 52343, + "Movies": 46273, + "Running": 23425, + "Science": 2090, + "Horror": 37793, + "Confession": 60572, + "Finance": 12250, + "Politics": 16360, + "Scary": 191985, + "Support": 12654, + "Technologies": 32516, + "Teenage": 66160, + "Event": 32769, + "Learned": 67460, + "Notion": 182770, + "Wikipedia": 37583, + "Books": 6665, + "Extract": 76050, + "Confessions": 102701, + "Conspiracy": 75932, + "Links": 63674, + "Narcissus": 150425, + "Relationship": 54766, + "Relationships": 134796, + "Reviews": 41671, + "News": 4256, + "Translation": 26820, + "multilingual": 128406, +} + + +def get_pairs(word): + """ + Return set of symbol pairs in a word. + + Word is represented as tuple of symbols (symbols being variable-length strings). + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + + pairs = set(pairs) + return pairs + + +class CTRLTokenizer(PreTrainedTokenizer): + """ + Construct a CTRL tokenizer. Based on Byte-Pair-Encoding. + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + Path to the vocabulary file. + merges_file (`str`): + Path to the merges file. + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. 
+ """ + + vocab_files_names = VOCAB_FILES_NAMES + control_codes = CONTROL_CODES + + def __init__(self, vocab_file, merges_file, unk_token="", **kwargs): + with open(vocab_file, encoding="utf-8") as vocab_handle: + self.encoder = json.load(vocab_handle) + self.decoder = {v: k for k, v in self.encoder.items()} + with open(merges_file, encoding="utf-8") as merges_handle: + merges = merges_handle.read().split("\n")[1:-1] + merges = [tuple(merge.split()) for merge in merges] + self.bpe_ranks = dict(zip(merges, range(len(merges)))) + self.cache = {} + super().__init__(unk_token=unk_token, **kwargs) + + @property + def vocab_size(self): + return len(self.encoder) + + def get_vocab(self): + return dict(self.encoder, **self.added_tokens_encoder) + + def bpe(self, token): + if token in self.cache: + return self.cache[token] + word = tuple(token) + word = tuple(list(word[:-1]) + [word[-1] + ""]) + pairs = get_pairs(word) + + if not pairs: + return token + + while True: + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + except ValueError: + new_word.extend(word[i:]) + break + else: + new_word.extend(word[i:j]) + i = j + + if word[i] == first and i < len(word) - 1 and word[i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = "@@ ".join(word) + word = word[:-4] + self.cache[token] = word + return word + + def _tokenize(self, text): + """Tokenize a string.""" + split_tokens = [] + + words = re.findall(r"\S+\n?", text) + + for token in words: + split_tokens.extend(list(self.bpe(token).split(" "))) + return split_tokens + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.encoder.get(token, self.encoder.get(self.unk_token)) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.decoder.get(index, self.unk_token) + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + out_string = " ".join(tokens).replace("@@ ", "").strip() + return out_string + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + merge_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"] + ) + + with open(vocab_file, "w", encoding="utf-8") as f: + f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n") + + index = 0 + with open(merge_file, "w", encoding="utf-8") as writer: + writer.write("#version: 0.2\n") + for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning( + f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive." + " Please check that the tokenizer is not corrupted!" 
+ ) + index = token_index + writer.write(" ".join(bpe_tokens) + "\n") + index += 1 + + return vocab_file, merge_file + + # def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True): + # filtered_tokens = ' '.join(self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)) + # tokens_generated_so_far = re.sub('(@@ )', '', string=filtered_tokens) + # tokens_generated_so_far = re.sub('(@@ ?$)', '', string=tokens_generated_so_far) + # return ''.join(tokens_generated_so_far) + + +__all__ = ["CTRLTokenizer"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/d_fine/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/d_fine/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..879b53709bc673bcf28553a51175f06fa1e362c0 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/d_fine/__init__.py @@ -0,0 +1,29 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_d_fine import * + from .modeling_d_fine import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/d_fine/configuration_d_fine.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/d_fine/configuration_d_fine.py new file mode 100644 index 0000000000000000000000000000000000000000..7484d9a347e534f3ebfff5a0776a0413a5a416dc --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/d_fine/configuration_d_fine.py @@ -0,0 +1,433 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/d_fine/modular_d_fine.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_d_fine.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2025 Baidu Inc and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
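# --- Illustrative sketch (editor's note, not part of the library files) -----
# CTRLTokenizer above exposes the CONTROL_CODES table as the class attribute
# `control_codes`; the docstring example in modeling_ctrl.py relies on this
# when it asserts that the first input id is a control code. A minimal usage
# sketch (assumes the "Salesforce/ctrl" vocabulary can be downloaded):
from transformers import CTRLTokenizer

tokenizer = CTRLTokenizer.from_pretrained("Salesforce/ctrl")
print(tokenizer.control_codes["Opinion"])  # 43213, per the table above
ids = tokenizer("Opinion My dog is cute")["input_ids"]
print(ids[0] in tokenizer.control_codes.values())  # expected: True
# ---------------------------------------------------------------------------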
+from ...configuration_utils import PretrainedConfig +from ...utils import logging +from ...utils.backbone_utils import verify_backbone_config_arguments +from ..auto import CONFIG_MAPPING + + +logger = logging.get_logger(__name__) + + +# TODO: Attribute map assignment logic should be fixed in modular +# as well as super() call parsing because otherwise we cannot re-write args after initialization +class DFineConfig(PretrainedConfig): + """ + This is the configuration class to store the configuration of a [`DFineModel`]. It is used to instantiate a D-FINE + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of D-FINE-X-COCO "[ustc-community/dfine-xlarge-coco"](https://huggingface.co/ustc-community/dfine-xlarge-coco"). + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + initializer_range (`float`, *optional*, defaults to 0.01): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + initializer_bias_prior_prob (`float`, *optional*): + The prior probability used by the bias initializer to initialize biases for `enc_score_head` and `class_embed`. + If `None`, `prior_prob` computed as `prior_prob = 1 / (num_labels + 1)` while initializing model weights. + layer_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the layer normalization layers. + batch_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the batch normalization layers. + backbone_config (`Dict`, *optional*, defaults to `RTDetrResNetConfig()`): + The configuration of the backbone model. + backbone (`str`, *optional*): + Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this + will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone` + is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. + use_pretrained_backbone (`bool`, *optional*, defaults to `False`): + Whether to use pretrained weights for the backbone. + use_timm_backbone (`bool`, *optional*, defaults to `False`): + Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers + library. + freeze_backbone_batch_norms (`bool`, *optional*, defaults to `True`): + Whether to freeze the batch normalization layers in the backbone. + backbone_kwargs (`dict`, *optional*): + Keyword arguments to be passed to AutoBackbone when loading from a checkpoint + e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set. + encoder_hidden_dim (`int`, *optional*, defaults to 256): + Dimension of the layers in hybrid encoder. + encoder_in_channels (`list`, *optional*, defaults to `[512, 1024, 2048]`): + Multi level features input for encoder. + feat_strides (`list[int]`, *optional*, defaults to `[8, 16, 32]`): + Strides used in each feature map. + encoder_layers (`int`, *optional*, defaults to 1): + Total of layers to be used by the encoder. + encoder_ffn_dim (`int`, *optional*, defaults to 1024): + Dimension of the "intermediate" (often named feed-forward) layer in decoder. + encoder_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer encoder. 
+ dropout (`float`, *optional*, defaults to 0.0): + The ratio for all dropout layers. + activation_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for activations inside the fully connected layer. + encode_proj_layers (`list[int]`, *optional*, defaults to `[2]`): + Indexes of the projected layers to be used in the encoder. + positional_encoding_temperature (`int`, *optional*, defaults to 10000): + The temperature parameter used to create the positional encodings. + encoder_activation_function (`str`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + activation_function (`str`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the general layer. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + eval_size (`tuple[int, int]`, *optional*): + Height and width used to computes the effective height and width of the position embeddings after taking + into account the stride. + normalize_before (`bool`, *optional*, defaults to `False`): + Determine whether to apply layer normalization in the transformer encoder layer before self-attention and + feed-forward modules. + hidden_expansion (`float`, *optional*, defaults to 1.0): + Expansion ratio to enlarge the dimension size of RepVGGBlock and CSPRepLayer. + d_model (`int`, *optional*, defaults to 256): + Dimension of the layers exclude hybrid encoder. + num_queries (`int`, *optional*, defaults to 300): + Number of object queries. + decoder_in_channels (`list`, *optional*, defaults to `[256, 256, 256]`): + Multi level features dimension for decoder + decoder_ffn_dim (`int`, *optional*, defaults to 1024): + Dimension of the "intermediate" (often named feed-forward) layer in decoder. + num_feature_levels (`int`, *optional*, defaults to 3): + The number of input feature levels. + decoder_n_points (`int`, *optional*, defaults to 4): + The number of sampled keys in each feature level for each attention head in the decoder. + decoder_layers (`int`, *optional*, defaults to 6): + Number of decoder layers. + decoder_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer decoder. + decoder_activation_function (`str`, *optional*, defaults to `"relu"`): + The non-linear activation function (function or string) in the decoder. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + num_denoising (`int`, *optional*, defaults to 100): + The total number of denoising tasks or queries to be used for contrastive denoising. + label_noise_ratio (`float`, *optional*, defaults to 0.5): + The fraction of denoising labels to which random noise should be added. + box_noise_scale (`float`, *optional*, defaults to 1.0): + Scale or magnitude of noise to be added to the bounding boxes. + learn_initial_query (`bool`, *optional*, defaults to `False`): + Indicates whether the initial query embeddings for the decoder should be learned during training + anchor_image_size (`tuple[int, int]`, *optional*): + Height and width of the input image used during evaluation to generate the bounding box anchors. If None, automatic generate anchor is applied. 
+ with_box_refine (`bool`, *optional*, defaults to `True`): + Whether to apply iterative bounding box refinement, where each decoder layer refines the bounding boxes + based on the predictions from the previous layer. + is_encoder_decoder (`bool`, *optional*, defaults to `True`): + Whether the architecture has an encoder decoder structure. + matcher_alpha (`float`, *optional*, defaults to 0.25): + Parameter alpha used by the Hungarian Matcher. + matcher_gamma (`float`, *optional*, defaults to 2.0): + Parameter gamma used by the Hungarian Matcher. + matcher_class_cost (`float`, *optional*, defaults to 2.0): + The relative weight of the class loss used by the Hungarian Matcher. + matcher_bbox_cost (`float`, *optional*, defaults to 5.0): + The relative weight of the bounding box loss used by the Hungarian Matcher. + matcher_giou_cost (`float`, *optional*, defaults to 2.0): + The relative weight of the giou loss of used by the Hungarian Matcher. + use_focal_loss (`bool`, *optional*, defaults to `True`): + Parameter informing if focal focal should be used. + auxiliary_loss (`bool`, *optional*, defaults to `True`): + Whether auxiliary decoding losses (loss at each decoder layer) are to be used. + focal_loss_alpha (`float`, *optional*, defaults to 0.75): + Parameter alpha used to compute the focal loss. + focal_loss_gamma (`float`, *optional*, defaults to 2.0): + Parameter gamma used to compute the focal loss. + weight_loss_vfl (`float`, *optional*, defaults to 1.0): + Relative weight of the varifocal loss in the object detection loss. + weight_loss_bbox (`float`, *optional*, defaults to 5.0): + Relative weight of the L1 bounding box loss in the object detection loss. + weight_loss_giou (`float`, *optional*, defaults to 2.0): + Relative weight of the generalized IoU loss in the object detection loss. + weight_loss_fgl (`float`, *optional*, defaults to 0.15): + Relative weight of the fine-grained localization loss in the object detection loss. + weight_loss_ddf (`float`, *optional*, defaults to 1.5): + Relative weight of the decoupled distillation focal loss in the object detection loss. + eos_coefficient (`float`, *optional*, defaults to 0.0001): + Relative classification weight of the 'no-object' class in the object detection loss. + eval_idx (`int`, *optional*, defaults to -1): + Index of the decoder layer to use for evaluation. If negative, counts from the end + (e.g., -1 means use the last layer). This allows for early prediction in the decoder + stack while still training later layers. + layer_scale (`float`, *optional*, defaults to `1.0`): + Scaling factor for the hidden dimension in later decoder layers. Used to adjust the + model capacity after the evaluation layer. + max_num_bins (`int`, *optional*, defaults to 32): + Maximum number of bins for the distribution-guided bounding box refinement. + Higher values allow for more fine-grained localization but increase computation. + reg_scale (`float`, *optional*, defaults to 4.0): + Scale factor for the regression distribution. Controls the range and granularity + of the bounding box refinement process. + depth_mult (`float`, *optional*, defaults to 1.0): + Multiplier for the number of blocks in RepNCSPELAN4 layers. Used to scale the model's + depth while maintaining its architecture. + top_prob_values (`int`, *optional*, defaults to 4): + Number of top probability values to consider from each corner's distribution. + lqe_hidden_dim (`int`, *optional*, defaults to 64): + Hidden dimension size for the Location Quality Estimator (LQE) network. 
+ lqe_layers (`int`, *optional*, defaults to 2): + Number of layers in the Location Quality Estimator MLP. + decoder_offset_scale (`float`, *optional*, defaults to 0.5): + Offset scale used in deformable attention. + decoder_method (`str`, *optional*, defaults to `"default"`): + The method to use for the decoder: `"default"` or `"discrete"`. + up (`float`, *optional*, defaults to 0.5): + Controls the upper bounds of the Weighting Function. + """ + + model_type = "d_fine" + layer_types = ["basic", "bottleneck"] + attribute_map = { + "hidden_size": "d_model", + "num_attention_heads": "encoder_attention_heads", + } + + def __init__( + self, + initializer_range=0.01, + initializer_bias_prior_prob=None, + layer_norm_eps=1e-5, + batch_norm_eps=1e-5, + # backbone + backbone_config=None, + backbone=None, + use_pretrained_backbone=False, + use_timm_backbone=False, + freeze_backbone_batch_norms=True, + backbone_kwargs=None, + # encoder HybridEncoder + encoder_hidden_dim=256, + encoder_in_channels=[512, 1024, 2048], + feat_strides=[8, 16, 32], + encoder_layers=1, + encoder_ffn_dim=1024, + encoder_attention_heads=8, + dropout=0.0, + activation_dropout=0.0, + encode_proj_layers=[2], + positional_encoding_temperature=10000, + encoder_activation_function="gelu", + activation_function="silu", + eval_size=None, + normalize_before=False, + hidden_expansion=1.0, + # decoder DFineTransformer + d_model=256, + num_queries=300, + decoder_in_channels=[256, 256, 256], + decoder_ffn_dim=1024, + num_feature_levels=3, + decoder_n_points=4, + decoder_layers=6, + decoder_attention_heads=8, + decoder_activation_function="relu", + attention_dropout=0.0, + num_denoising=100, + label_noise_ratio=0.5, + box_noise_scale=1.0, + learn_initial_query=False, + anchor_image_size=None, + with_box_refine=True, + is_encoder_decoder=True, + # Loss + matcher_alpha=0.25, + matcher_gamma=2.0, + matcher_class_cost=2.0, + matcher_bbox_cost=5.0, + matcher_giou_cost=2.0, + use_focal_loss=True, + auxiliary_loss=True, + focal_loss_alpha=0.75, + focal_loss_gamma=2.0, + weight_loss_vfl=1.0, + weight_loss_bbox=5.0, + weight_loss_giou=2.0, + weight_loss_fgl=0.15, + weight_loss_ddf=1.5, + eos_coefficient=1e-4, + eval_idx=-1, + layer_scale=1, + max_num_bins=32, + reg_scale=4.0, + depth_mult=1.0, + top_prob_values=4, + lqe_hidden_dim=64, + lqe_layers=2, + decoder_offset_scale=0.5, + decoder_method="default", + up=0.5, + **kwargs, + ): + self.initializer_range = initializer_range + self.initializer_bias_prior_prob = initializer_bias_prior_prob + self.layer_norm_eps = layer_norm_eps + self.batch_norm_eps = batch_norm_eps + # backbone + if backbone_config is None and backbone is None: + logger.info( + "`backbone_config` and `backbone` are `None`. Initializing the config with the default `HGNet-V2` backbone." 
+ ) + backbone_model_type = "hgnet_v2" + config_class = CONFIG_MAPPING[backbone_model_type] + # this will map it to RTDetrResNetConfig + # note: we can instead create HGNetV2Config + # and we would need to create HGNetV2Backbone + backbone_config = config_class( + num_channels=3, + embedding_size=64, + hidden_sizes=[256, 512, 1024, 2048], + depths=[3, 4, 6, 3], + layer_type="bottleneck", + hidden_act="relu", + downsample_in_first_stage=False, + downsample_in_bottleneck=False, + out_features=None, + out_indices=[2, 3, 4], + ) + elif isinstance(backbone_config, dict): + backbone_model_type = backbone_config.pop("model_type") + config_class = CONFIG_MAPPING[backbone_model_type] + backbone_config = config_class.from_dict(backbone_config) + + verify_backbone_config_arguments( + use_timm_backbone=use_timm_backbone, + use_pretrained_backbone=use_pretrained_backbone, + backbone=backbone, + backbone_config=backbone_config, + backbone_kwargs=backbone_kwargs, + ) + + self.backbone_config = backbone_config + self.backbone = backbone + self.use_pretrained_backbone = use_pretrained_backbone + self.use_timm_backbone = use_timm_backbone + self.freeze_backbone_batch_norms = freeze_backbone_batch_norms + self.backbone_kwargs = backbone_kwargs + # encoder + self.encoder_hidden_dim = encoder_hidden_dim + self.encoder_in_channels = encoder_in_channels + self.feat_strides = feat_strides + self.encoder_attention_heads = encoder_attention_heads + self.encoder_ffn_dim = encoder_ffn_dim + self.dropout = dropout + self.activation_dropout = activation_dropout + self.encode_proj_layers = encode_proj_layers + self.encoder_layers = encoder_layers + self.positional_encoding_temperature = positional_encoding_temperature + self.eval_size = eval_size + self.normalize_before = normalize_before + self.encoder_activation_function = encoder_activation_function + self.activation_function = activation_function + self.hidden_expansion = hidden_expansion + # decoder + self.d_model = d_model + self.num_queries = num_queries + self.decoder_ffn_dim = decoder_ffn_dim + self.decoder_in_channels = decoder_in_channels + self.num_feature_levels = num_feature_levels + self.decoder_n_points = decoder_n_points + self.decoder_layers = decoder_layers + self.decoder_attention_heads = decoder_attention_heads + self.decoder_activation_function = decoder_activation_function + self.attention_dropout = attention_dropout + self.num_denoising = num_denoising + self.label_noise_ratio = label_noise_ratio + self.box_noise_scale = box_noise_scale + self.learn_initial_query = learn_initial_query + self.anchor_image_size = anchor_image_size + self.auxiliary_loss = auxiliary_loss + self.with_box_refine = with_box_refine + # Loss + self.matcher_alpha = matcher_alpha + self.matcher_gamma = matcher_gamma + self.matcher_class_cost = matcher_class_cost + self.matcher_bbox_cost = matcher_bbox_cost + self.matcher_giou_cost = matcher_giou_cost + self.use_focal_loss = use_focal_loss + self.focal_loss_alpha = focal_loss_alpha + self.focal_loss_gamma = focal_loss_gamma + self.weight_loss_vfl = weight_loss_vfl + self.weight_loss_bbox = weight_loss_bbox + self.weight_loss_giou = weight_loss_giou + self.weight_loss_fgl = weight_loss_fgl + self.weight_loss_ddf = weight_loss_ddf + self.eos_coefficient = eos_coefficient + # add the new attributes with the given values or defaults + self.eval_idx = eval_idx + self.layer_scale = layer_scale + self.max_num_bins = max_num_bins + self.reg_scale = reg_scale + self.depth_mult = depth_mult + self.decoder_offset_scale = 
decoder_offset_scale + self.decoder_method = decoder_method + self.top_prob_values = top_prob_values + self.lqe_hidden_dim = lqe_hidden_dim + self.lqe_layers = lqe_layers + self.up = up + + if isinstance(self.decoder_n_points, list): + if len(self.decoder_n_points) != self.num_feature_levels: + raise ValueError( + f"Length of decoder_n_points list ({len(self.decoder_n_points)}) must match num_feature_levels ({self.num_feature_levels})." + ) + + head_dim = self.d_model // self.decoder_attention_heads + if head_dim * self.decoder_attention_heads != self.d_model: + raise ValueError( + f"Embedded dimension {self.d_model} must be divisible by decoder_attention_heads {self.decoder_attention_heads}" + ) + super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + + @property + def num_attention_heads(self) -> int: + return self.encoder_attention_heads + + @property + def hidden_size(self) -> int: + return self.d_model + + @property + def sub_configs(self): + return ( + {"backbone_config": type(self.backbone_config)} + if getattr(self, "backbone_config", None) is not None + else {} + ) + + @classmethod + def from_backbone_configs(cls, backbone_config: PretrainedConfig, **kwargs): + """Instantiate a [`DFineConfig`] (or a derived class) from a pre-trained backbone model configuration and DETR model + configuration. + + Args: + backbone_config ([`PretrainedConfig`]): + The backbone configuration. + + Returns: + [`DFineConfig`]: An instance of a configuration object + """ + return cls( + backbone_config=backbone_config, + **kwargs, + ) + + +__all__ = ["DFineConfig"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/d_fine/modeling_d_fine.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/d_fine/modeling_d_fine.py new file mode 100644 index 0000000000000000000000000000000000000000..cdc008e3c7bbfc6f9f6c14be256c782c11846feb --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/d_fine/modeling_d_fine.py @@ -0,0 +1,2196 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/d_fine/modular_d_fine.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_d_fine.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2025 Baidu Inc and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
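+# A minimal construction sketch, assuming the `DFineConfig` and `DFineForObjectDetection`
+# classes defined in this model folder are exported by `transformers`; the values shown
+# are simply the documented defaults from configuration_d_fine.py:
+#
+#     from transformers import DFineConfig, DFineForObjectDetection
+#
+#     config = DFineConfig(decoder_layers=6, max_num_bins=32, reg_scale=4.0, up=0.5)
+#     model = DFineForObjectDetection(config)       # randomly initialized weights
+#     assert config.hidden_size == config.d_model   # aliased via `attribute_map`
+#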
+import math +from dataclasses import dataclass +from typing import Any, Optional, Union + +import torch +import torch.nn.functional as F +import torch.nn.init as init +from torch import Tensor, nn + +from ...activations import ACT2CLS, ACT2FN +from ...image_transforms import center_to_corners_format, corners_to_center_format +from ...modeling_outputs import BaseModelOutput +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import compile_compatible_method_lru_cache +from ...utils import ModelOutput, auto_docstring, is_torchdynamo_compiling, torch_int +from ...utils.backbone_utils import load_backbone +from .configuration_d_fine import DFineConfig + + +def multi_scale_deformable_attention_v2( + value: Tensor, + value_spatial_shapes: Tensor, + sampling_locations: Tensor, + attention_weights: Tensor, + num_points_list: list[int], + method="default", +) -> Tensor: + batch_size, _, num_heads, hidden_dim = value.shape + _, num_queries, num_heads, num_levels, num_points = sampling_locations.shape + value_list = ( + value.permute(0, 2, 3, 1) + .flatten(0, 1) + .split([height * width for height, width in value_spatial_shapes], dim=-1) + ) + # sampling_offsets [8, 480, 8, 12, 2] + if method == "default": + sampling_grids = 2 * sampling_locations - 1 + elif method == "discrete": + sampling_grids = sampling_locations + sampling_grids = sampling_grids.permute(0, 2, 1, 3, 4).flatten(0, 1) + sampling_grids = sampling_grids.split(num_points_list, dim=-2) + sampling_value_list = [] + for level_id, (height, width) in enumerate(value_spatial_shapes): + # batch_size, height*width, num_heads, hidden_dim + # -> batch_size, height*width, num_heads*hidden_dim + # -> batch_size, num_heads*hidden_dim, height*width + # -> batch_size*num_heads, hidden_dim, height, width + value_l_ = value_list[level_id].reshape(batch_size * num_heads, hidden_dim, height, width) + # batch_size, num_queries, num_heads, num_points, 2 + # -> batch_size, num_heads, num_queries, num_points, 2 + # -> batch_size*num_heads, num_queries, num_points, 2 + sampling_grid_l_ = sampling_grids[level_id] + # batch_size*num_heads, hidden_dim, num_queries, num_points + if method == "default": + sampling_value_l_ = nn.functional.grid_sample( + value_l_, sampling_grid_l_, mode="bilinear", padding_mode="zeros", align_corners=False + ) + elif method == "discrete": + sampling_coord = (sampling_grid_l_ * torch.tensor([[width, height]], device=value.device) + 0.5).to( + torch.int64 + ) + + # Separate clamping for x and y coordinates + sampling_coord_x = sampling_coord[..., 0].clamp(0, width - 1) + sampling_coord_y = sampling_coord[..., 1].clamp(0, height - 1) + + # Combine the clamped coordinates + sampling_coord = torch.stack([sampling_coord_x, sampling_coord_y], dim=-1) + sampling_coord = sampling_coord.reshape(batch_size * num_heads, num_queries * num_points_list[level_id], 2) + sampling_idx = ( + torch.arange(sampling_coord.shape[0], device=value.device) + .unsqueeze(-1) + .repeat(1, sampling_coord.shape[1]) + ) + sampling_value_l_ = value_l_[sampling_idx, :, sampling_coord[..., 1], sampling_coord[..., 0]] + sampling_value_l_ = sampling_value_l_.permute(0, 2, 1).reshape( + batch_size * num_heads, hidden_dim, num_queries, num_points_list[level_id] + ) + sampling_value_list.append(sampling_value_l_) + # (batch_size, num_queries, num_heads, num_levels, num_points) + # -> (batch_size, num_heads, num_queries, num_levels, num_points) + # -> (batch_size, num_heads, 1, num_queries, num_levels*num_points) + attention_weights = 
attention_weights.permute(0, 2, 1, 3).reshape( + batch_size * num_heads, 1, num_queries, sum(num_points_list) + ) + output = ( + (torch.concat(sampling_value_list, dim=-1) * attention_weights) + .sum(-1) + .view(batch_size, num_heads * hidden_dim, num_queries) + ) + return output.transpose(1, 2).contiguous() + + +class DFineMultiscaleDeformableAttention(nn.Module): + def __init__(self, config: DFineConfig): + """ + D-Fine version of multiscale deformable attention + """ + super().__init__() + self.d_model = config.d_model + self.n_heads = config.decoder_attention_heads + self.n_levels = config.num_feature_levels + self.offset_scale = config.decoder_offset_scale + self.decoder_method = config.decoder_method + self.n_points = config.decoder_n_points + + if isinstance(self.n_points, list): + num_points_list = self.n_points + else: + num_points_list = [self.n_points for _ in range(self.n_levels)] + + self.num_points_list = num_points_list + num_points_scale = [1 / n for n in self.num_points_list for _ in range(n)] + self.register_buffer("num_points_scale", torch.tensor(num_points_scale, dtype=torch.float32)) + + self.total_points = self.n_heads * sum(self.num_points_list) + + self.sampling_offsets = nn.Linear(self.d_model, self.total_points * 2) + self.attention_weights = nn.Linear(self.d_model, self.total_points) + + self.ms_deformable_attn_core = multi_scale_deformable_attention_v2 + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + reference_points=None, + encoder_hidden_states=None, + spatial_shapes=None, + spatial_shapes_list=None, + ) -> tuple[torch.Tensor, torch.Tensor]: + batch_size, num_queries, _ = hidden_states.shape + batch_size, sequence_length, _ = encoder_hidden_states.shape + + if not is_torchdynamo_compiling() and (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() != sequence_length: + raise ValueError( + "Make sure to align the spatial shapes with the sequence length of the encoder hidden states" + ) + + # Reshape for multi-head attention + value = encoder_hidden_states.reshape(batch_size, sequence_length, self.n_heads, self.d_model // self.n_heads) + if attention_mask is not None: + value = value.masked_fill(~attention_mask[..., None], float(0)) + + sampling_offsets: torch.Tensor = self.sampling_offsets(hidden_states) + sampling_offsets = sampling_offsets.reshape( + batch_size, num_queries, self.n_heads, sum(self.num_points_list), 2 + ) + + attention_weights = self.attention_weights(hidden_states).reshape( + batch_size, num_queries, self.n_heads, sum(self.num_points_list) + ) + attention_weights = F.softmax(attention_weights, dim=-1) + + if reference_points.shape[-1] == 2: + offset_normalizer = torch.tensor(spatial_shapes) + offset_normalizer = offset_normalizer.flip([1]).reshape(1, 1, 1, self.n_levels, 1, 2) + sampling_locations = ( + reference_points.reshape(batch_size, sequence_length, 1, self.n_levels, 1, 2) + + sampling_offsets / offset_normalizer + ) + elif reference_points.shape[-1] == 4: + # reference_points [8, 480, None, 1, 4] + # sampling_offsets [8, 480, 8, 12, 2] + num_points_scale = self.num_points_scale.to(dtype=hidden_states.dtype).unsqueeze(-1) + offset = sampling_offsets * num_points_scale * reference_points[:, :, None, :, 2:] * self.offset_scale + sampling_locations = reference_points[:, :, None, :, :2] + offset + else: + raise ValueError( + f"Last dim of reference_points must be 2 or 4, but get {reference_points.shape[-1]} instead." 
+ ) + + output = self.ms_deformable_attn_core( + value, + spatial_shapes_list, + sampling_locations, + attention_weights, + self.num_points_list, + self.decoder_method, + ) + + return output, attention_weights + + +class DFineGate(nn.Module): + def __init__(self, d_model: int): + super().__init__() + self.gate = nn.Linear(2 * d_model, 2 * d_model) + self.norm = nn.LayerNorm(d_model) + + def forward(self, second_residual: torch.Tensor, hidden_states: torch.Tensor) -> torch.Tensor: + gate_input = torch.cat([second_residual, hidden_states], dim=-1) + gates = torch.sigmoid(self.gate(gate_input)) + gate1, gate2 = gates.chunk(2, dim=-1) + hidden_states = self.norm(gate1 * second_residual + gate2 * hidden_states) + return hidden_states + + +class DFineMultiheadAttention(nn.Module): + """ + Multi-headed attention from 'Attention Is All You Need' paper. + + Here, we add position embeddings to the queries and keys (as explained in the Deformable DETR paper). + """ + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + bias: bool = True, + ): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + if self.head_dim * num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {num_heads})." + ) + self.scaling = self.head_dim**-0.5 + + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + + def _reshape(self, tensor: torch.Tensor, seq_len: int, batch_size: int): + return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]): + return tensor if position_embeddings is None else tensor + position_embeddings + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_embeddings: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + batch_size, target_len, embed_dim = hidden_states.size() + # add position embeddings to the hidden states before projecting to queries and keys + if position_embeddings is not None: + hidden_states_original = hidden_states + hidden_states = self.with_pos_embed(hidden_states, position_embeddings) + + # get queries, keys and values + query_states = self.q_proj(hidden_states) * self.scaling + key_states = self._reshape(self.k_proj(hidden_states), -1, batch_size) + value_states = self._reshape(self.v_proj(hidden_states_original), -1, batch_size) + + proj_shape = (batch_size * self.num_heads, -1, self.head_dim) + query_states = self._reshape(query_states, target_len, batch_size).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*proj_shape) + + source_len = key_states.size(1) + + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (batch_size * self.num_heads, target_len, source_len): + raise ValueError( + f"Attention weights should be of size {(batch_size * self.num_heads, target_len, source_len)}, but is" + f" {attn_weights.size()}" + ) + + # expand attention_mask + if 
attention_mask is not None: + # [seq_len, seq_len] -> [batch_size, 1, target_seq_len, source_seq_len] + attention_mask = attention_mask.expand(batch_size, 1, *attention_mask.size()) + + if attention_mask is not None: + if attention_mask.size() != (batch_size, 1, target_len, source_len): + raise ValueError( + f"Attention mask should be of size {(batch_size, 1, target_len, source_len)}, but is" + f" {attention_mask.size()}" + ) + if attention_mask.dtype == torch.bool: + attention_mask = torch.zeros_like(attention_mask, dtype=attn_weights.dtype).masked_fill_( + attention_mask, -torch.inf + ) + attn_weights = attn_weights.view(batch_size, self.num_heads, target_len, source_len) + attention_mask + attn_weights = attn_weights.view(batch_size * self.num_heads, target_len, source_len) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if output_attentions: + # this operation is a bit awkward, but it's required to + # make sure that attn_weights keeps its gradient. + # In order to do so, attn_weights have to reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(batch_size, self.num_heads, target_len, source_len) + attn_weights = attn_weights_reshaped.view(batch_size * self.num_heads, target_len, source_len) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != (batch_size * self.num_heads, target_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(batch_size, self.num_heads, target_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.view(batch_size, self.num_heads, target_len, self.head_dim) + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(batch_size, target_len, embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped + + +class DFineDecoderLayer(nn.Module): + def __init__(self, config: DFineConfig): + super().__init__() + # self-attention + self.self_attn = DFineMultiheadAttention( + embed_dim=config.d_model, + num_heads=config.decoder_attention_heads, + dropout=config.attention_dropout, + ) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.decoder_activation_function] + self.activation_dropout = config.activation_dropout + + self.self_attn_layer_norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_eps) + + # override the encoder attention module with d-fine version + self.encoder_attn = DFineMultiscaleDeformableAttention(config=config) + # feedforward neural networks + self.fc1 = nn.Linear(config.d_model, config.decoder_ffn_dim) + self.fc2 = nn.Linear(config.decoder_ffn_dim, config.d_model) + self.final_layer_norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_eps) + # gate + self.gateway = DFineGate(config.d_model) + + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: Optional[torch.Tensor] = None, + reference_points=None, + spatial_shapes=None, + spatial_shapes_list=None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> tuple[torch.Tensor, Any, Any]: + """ + Args: + hidden_states (`torch.FloatTensor`): + Input to the layer of shape `(seq_len, batch, embed_dim)`. 
+ position_embeddings (`torch.FloatTensor`, *optional*): + Position embeddings that are added to the queries and keys in the self-attention layer. + reference_points (`torch.FloatTensor`, *optional*): + Reference points. + spatial_shapes (`torch.LongTensor`, *optional*): + Spatial shapes. + level_start_index (`torch.LongTensor`, *optional*): + Level start index. + encoder_hidden_states (`torch.FloatTensor`): + cross attention input to the layer of shape `(seq_len, batch, embed_dim)` + encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size + `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative + values. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + # Self Attention + hidden_states_2, self_attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=encoder_attention_mask, + position_embeddings=position_embeddings, + output_attentions=output_attentions, + ) + + hidden_states_2 = nn.functional.dropout(hidden_states_2, p=self.dropout, training=self.training) + hidden_states = hidden_states + hidden_states_2 + hidden_states = self.self_attn_layer_norm(hidden_states) + residual = hidden_states + + # Cross-Attention + cross_attn_weights = None + hidden_states = hidden_states if position_embeddings is None else hidden_states + position_embeddings + hidden_states_2, cross_attn_weights = self.encoder_attn( + hidden_states=hidden_states, + encoder_hidden_states=encoder_hidden_states, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + spatial_shapes_list=spatial_shapes_list, + ) + + hidden_states_2 = nn.functional.dropout(hidden_states_2, p=self.dropout, training=self.training) + hidden_states = self.gateway(residual, hidden_states_2) + + # Fully Connected + hidden_states_2 = self.activation_fn(self.fc1(hidden_states)) + hidden_states_2 = nn.functional.dropout(hidden_states_2, p=self.activation_dropout, training=self.training) + hidden_states_2 = self.fc2(hidden_states_2) + hidden_states_2 = nn.functional.dropout(hidden_states_2, p=self.dropout, training=self.training) + hidden_states = hidden_states + hidden_states_2 + hidden_states = self.final_layer_norm(hidden_states.clamp(min=-65504, max=65504)) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights, cross_attn_weights) + + return outputs + + +@auto_docstring +class DFinePreTrainedModel(PreTrainedModel): + config: DFineConfig + base_model_prefix = "d_fine" + main_input_name = "pixel_values" + _no_split_modules = [r"DFineHybridEncoder", r"DFineDecoderLayer"] + + def _init_weights(self, module): + """Initialize the weights""" + # initialize linear layer bias value according to a given probability value. 
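+ # The bias is chosen so that sigmoid(bias) equals the prior probability p of a query
+ # belonging to any foreground class: solving sigmoid(b) = p gives b = -log((1 - p) / p).
+ # For example, with num_labels = 80 the default prior is p = 1 / 81, so b = -log(80) ≈ -4.38,
+ # and freshly initialized class heads start out predicting mostly background.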
+ if isinstance(module, (DFineForObjectDetection, DFineDecoder)): + if module.class_embed is not None: + for layer in module.class_embed: + prior_prob = self.config.initializer_bias_prior_prob or 1 / (self.config.num_labels + 1) + bias = float(-math.log((1 - prior_prob) / prior_prob)) + nn.init.xavier_uniform_(layer.weight) + nn.init.constant_(layer.bias, bias) + + if module.bbox_embed is not None: + for layer in module.bbox_embed: + nn.init.constant_(layer.layers[-1].weight, 0) + nn.init.constant_(layer.layers[-1].bias, 0) + + if hasattr(module, "reg_scale"): + module.reg_scale.fill_(self.config.reg_scale) + + if hasattr(module, "up"): + module.up.fill_(self.config.up) + + if isinstance(module, DFineMultiscaleDeformableAttention): + nn.init.constant_(module.sampling_offsets.weight.data, 0.0) + default_dtype = torch.get_default_dtype() + thetas = torch.arange(module.n_heads, dtype=torch.int64).to(default_dtype) * ( + 2.0 * math.pi / module.n_heads + ) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = grid_init / grid_init.abs().max(-1, keepdim=True).values + grid_init = grid_init.reshape(module.n_heads, 1, 2).tile([1, sum(module.num_points_list), 1]) + scaling = torch.concat([torch.arange(1, n + 1) for n in module.num_points_list]).reshape(1, -1, 1) + grid_init *= scaling + with torch.no_grad(): + module.sampling_offsets.bias.data[...] = grid_init.flatten() + + nn.init.constant_(module.attention_weights.weight.data, 0.0) + nn.init.constant_(module.attention_weights.bias.data, 0.0) + + if isinstance(module, DFineModel): + prior_prob = self.config.initializer_bias_prior_prob or 1 / (self.config.num_labels + 1) + bias = float(-math.log((1 - prior_prob) / prior_prob)) + nn.init.xavier_uniform_(module.enc_score_head.weight) + nn.init.constant_(module.enc_score_head.bias, bias) + + if isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + + if isinstance(module, DFineGate): + bias = float(-math.log((1 - 0.5) / 0.5)) + init.constant_(module.gate.bias, bias) + init.constant_(module.gate.weight, 0) + + if isinstance(module, DFineLQE): + init.constant_(module.reg_conf.layers[-1].bias, 0) + init.constant_(module.reg_conf.layers[-1].weight, 0) + + if isinstance(module, nn.LayerNorm): + module.weight.data.fill_(1.0) + module.bias.data.zero_() + + if hasattr(module, "weight_embedding") and self.config.learn_initial_query: + nn.init.xavier_uniform_(module.weight_embedding.weight) + if hasattr(module, "denoising_class_embed") and self.config.num_denoising > 0: + nn.init.xavier_uniform_(module.denoising_class_embed.weight) + + +class DFineIntegral(nn.Module): + """ + A static layer that calculates integral results from a distribution. + + This layer computes the target location using the formula: `sum{Pr(n) * W(n)}`, + where Pr(n) is the softmax probability vector representing the discrete + distribution, and W(n) is the non-uniform Weighting Function. + + Args: + max_num_bins (int): Max number of the discrete bins. Default is 32. + It can be adjusted based on the dataset or task requirements. 
+ """ + + def __init__(self, config: DFineConfig): + super().__init__() + self.max_num_bins = config.max_num_bins + + def forward(self, pred_corners: torch.Tensor, project: torch.Tensor) -> torch.Tensor: + batch_size, num_queries, _ = pred_corners.shape + pred_corners = F.softmax(pred_corners.reshape(-1, self.max_num_bins + 1), dim=1) + pred_corners = F.linear(pred_corners, project.to(pred_corners.device)).reshape(-1, 4) + pred_corners = pred_corners.reshape(batch_size, num_queries, -1) + return pred_corners + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for outputs of the DFineDecoder. This class adds two attributes to + BaseModelOutputWithCrossAttentions, namely: + - a stacked tensor of intermediate decoder hidden states (i.e. the output of each decoder layer) + - a stacked tensor of intermediate reference points. + """ +) +class DFineDecoderOutput(ModelOutput): + r""" + intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`): + Stacked intermediate hidden states (output of each layer of the decoder). + intermediate_logits (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, config.num_labels)`): + Stacked intermediate logits (logits of each layer of the decoder). + intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, hidden_size)`): + Stacked intermediate reference points (reference points of each layer of the decoder). + intermediate_predicted_corners (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`): + Stacked intermediate predicted corners (predicted corners of each layer of the decoder). + initial_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`): + Stacked initial reference points (initial reference points of each layer of the decoder). + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax, + used to compute the weighted average in the cross-attention heads. + """ + + last_hidden_state: Optional[torch.FloatTensor] = None + intermediate_hidden_states: Optional[torch.FloatTensor] = None + intermediate_logits: Optional[torch.FloatTensor] = None + intermediate_reference_points: Optional[torch.FloatTensor] = None + intermediate_predicted_corners: Optional[torch.FloatTensor] = None + initial_reference_points: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + cross_attentions: Optional[tuple[torch.FloatTensor]] = None + + +def inverse_sigmoid(x, eps=1e-5): + x = x.clamp(min=0, max=1) + x1 = x.clamp(min=eps) + x2 = (1 - x).clamp(min=eps) + return torch.log(x1 / x2) + + +def weighting_function(max_num_bins: int, up: torch.Tensor, reg_scale: int) -> torch.Tensor: + """ + Generates the non-uniform Weighting Function W(n) for bounding box regression. + + Args: + max_num_bins (int): Max number of the discrete bins. + up (Tensor): Controls upper bounds of the sequence, + where maximum offset is ±up * H / W. + reg_scale (float): Controls the curvature of the Weighting Function. 
+ Larger values result in flatter weights near the central axis W(max_num_bins/2)=0 + and steeper weights at both ends. + Returns: + Tensor: Sequence of Weighting Function. + """ + upper_bound1 = abs(up[0]) * abs(reg_scale) + upper_bound2 = abs(up[0]) * abs(reg_scale) * 2 + step = (upper_bound1 + 1) ** (2 / (max_num_bins - 2)) + left_values = [-((step) ** i) + 1 for i in range(max_num_bins // 2 - 1, 0, -1)] + right_values = [(step) ** i - 1 for i in range(1, max_num_bins // 2)] + values = [-upper_bound2] + left_values + [torch.zeros_like(up[0][None])] + right_values + [upper_bound2] + values = torch.cat(values, 0) + return values + + +def distance2bbox(points, distance: torch.Tensor, reg_scale: float) -> torch.Tensor: + """ + Decodes edge-distances into bounding box coordinates. + + Args: + points (`torch.Tensor`): + (batch_size, num_boxes, 4) or (num_boxes, 4) format, representing [x_center, y_center, width, height] + distance (`torch.Tensor`): + (batch_size, num_boxes, 4) or (num_boxes, 4), representing distances from the point to the left, top, right, and bottom boundaries. + reg_scale (`float`): + Controls the curvature of the Weighting Function. + Returns: + `torch.Tensor`: Bounding boxes in (batch_size, num_boxes, 4) or (num_boxes, 4) format, representing [x_center, y_center, width, height] + """ + reg_scale = abs(reg_scale) + top_left_x = points[..., 0] - (0.5 * reg_scale + distance[..., 0]) * (points[..., 2] / reg_scale) + top_left_y = points[..., 1] - (0.5 * reg_scale + distance[..., 1]) * (points[..., 3] / reg_scale) + bottom_right_x = points[..., 0] + (0.5 * reg_scale + distance[..., 2]) * (points[..., 2] / reg_scale) + bottom_right_y = points[..., 1] + (0.5 * reg_scale + distance[..., 3]) * (points[..., 3] / reg_scale) + + bboxes = torch.stack([top_left_x, top_left_y, bottom_right_x, bottom_right_y], -1) + + return corners_to_center_format(bboxes) + + +class DFineDecoder(DFinePreTrainedModel): + """ + D-FINE Decoder implementing Fine-grained Distribution Refinement (FDR). + + This decoder refines object detection predictions through iterative updates across multiple layers, + utilizing attention mechanisms, location quality estimators, and distribution refinement techniques + to improve bounding box accuracy and robustness. 
+ """ + + def __init__(self, config: DFineConfig): + super().__init__(config) + self.eval_idx = config.eval_idx if config.eval_idx >= 0 else config.decoder_layers + config.eval_idx + + self.dropout = config.dropout + self.layers = nn.ModuleList( + [DFineDecoderLayer(config) for _ in range(config.decoder_layers)] + + [DFineDecoderLayer(config) for _ in range(config.decoder_layers - self.eval_idx - 1)] + ) + self.query_pos_head = DFineMLPPredictionHead(config, 4, 2 * config.d_model, config.d_model, num_layers=2) + + # hack implementation for iterative bounding box refinement and two-stage Deformable DETR + self.bbox_embed = None + self.class_embed = None + self.reg_scale = nn.Parameter(torch.tensor([config.reg_scale]), requires_grad=False) + self.max_num_bins = config.max_num_bins + self.d_model = config.d_model + self.layer_scale = config.layer_scale + self.pre_bbox_head = DFineMLP(config.hidden_size, config.hidden_size, 4, 3) + self.integral = DFineIntegral(config) + self.num_head = config.decoder_attention_heads + self.up = nn.Parameter(torch.tensor([config.up]), requires_grad=False) + self.lqe_layers = nn.ModuleList([DFineLQE(config) for _ in range(config.decoder_layers)]) + + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + encoder_hidden_states: torch.Tensor, + reference_points: torch.Tensor, + inputs_embeds: torch.Tensor, + spatial_shapes, + level_start_index=None, + spatial_shapes_list=None, + output_hidden_states=None, + encoder_attention_mask=None, + memory_mask=None, + output_attentions=None, + return_dict=None, + ) -> DFineDecoderOutput: + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`): + The query embeddings that are passed into the decoder. + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + of the decoder. + encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing cross-attention on padding pixel_values of the encoder. Mask values selected + in `[0, 1]`: + - 1 for pixels that are real (i.e. **not masked**), + - 0 for pixels that are padding (i.e. **masked**). + position_embeddings (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*): + Position embeddings that are added to the queries and keys in each self-attention layer. + reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)` is `as_two_stage` else `(batch_size, num_queries, 2)` or , *optional*): + Reference point in range `[0, 1]`, top-left (0,0), bottom-right (1, 1), including padding area. + spatial_shapes (`torch.FloatTensor` of shape `(num_feature_levels, 2)`): + Spatial shapes of the feature maps. + level_start_index (`torch.LongTensor` of shape `(num_feature_levels)`, *optional*): + Indexes for the start of each feature level. In range `[0, sequence_length]`. + valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`, *optional*): + Ratio of valid area in each feature level. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. 
+ return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if inputs_embeds is not None: + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + intermediate = () + intermediate_reference_points = () + intermediate_logits = () + intermediate_predicted_corners = () + initial_reference_points = () + + output_detach = pred_corners_undetach = 0 + + project = weighting_function(self.max_num_bins, self.up, self.reg_scale) + ref_points_detach = F.sigmoid(reference_points) + + for i, decoder_layer in enumerate(self.layers): + ref_points_input = ref_points_detach.unsqueeze(2) + query_pos_embed = self.query_pos_head(ref_points_detach).clamp(min=-10, max=10) + + if output_hidden_states: + all_hidden_states += (hidden_states,) + + output = decoder_layer( + hidden_states=hidden_states, + position_embeddings=query_pos_embed, + reference_points=ref_points_input, + spatial_shapes=spatial_shapes, + spatial_shapes_list=spatial_shapes_list, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = output[0] + + if i == 0: + # Initial bounding box predictions with inverse sigmoid refinement + new_reference_points = F.sigmoid(self.pre_bbox_head(output[0]) + inverse_sigmoid(ref_points_detach)) + ref_points_initial = new_reference_points.detach() + + # Refine bounding box corners using FDR, integrating previous layer's corrections + if self.bbox_embed is not None: + pred_corners = self.bbox_embed[i](hidden_states + output_detach) + pred_corners_undetach + inter_ref_bbox = distance2bbox( + ref_points_initial, self.integral(pred_corners, project), self.reg_scale + ) + pred_corners_undetach = pred_corners + ref_points_detach = inter_ref_bbox.detach() + + output_detach = hidden_states.detach() + + intermediate += (hidden_states,) + + if self.class_embed is not None and (self.training or i == self.eval_idx): + scores = self.class_embed[i](hidden_states) + # Add initial logits and reference points with pre-bbox head + if i == 0: + intermediate_logits += (scores,) + intermediate_reference_points += (new_reference_points,) + # Lqe does not affect the performance here. 
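+ # DFineLQE (Location Quality Estimator) summarizes the predicted corner distributions
+ # (the top `config.top_prob_values` probabilities of each corner) into a localization
+ # quality estimate and uses it to adjust the classification scores.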
+ scores = self.lqe_layers[i](scores, pred_corners) + intermediate_logits += (scores,) + intermediate_reference_points += (inter_ref_bbox,) + initial_reference_points += (ref_points_initial,) + intermediate_predicted_corners += (pred_corners,) + + if output_attentions: + all_self_attns += (output[1],) + + if encoder_hidden_states is not None: + all_cross_attentions += (output[2],) + + # Keep batch_size as first dimension + intermediate = torch.stack(intermediate) + if self.class_embed is not None and self.bbox_embed is not None: + intermediate_logits = torch.stack(intermediate_logits, dim=1) + intermediate_predicted_corners = torch.stack(intermediate_predicted_corners, dim=1) + initial_reference_points = torch.stack(initial_reference_points, dim=1) + intermediate_reference_points = torch.stack(intermediate_reference_points, dim=1) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + intermediate, + intermediate_logits, + intermediate_reference_points, + intermediate_predicted_corners, + initial_reference_points, + all_hidden_states, + all_self_attns, + all_cross_attentions, + ] + if v is not None + ) + + return DFineDecoderOutput( + last_hidden_state=hidden_states, + intermediate_hidden_states=intermediate, + intermediate_logits=intermediate_logits, + intermediate_reference_points=intermediate_reference_points, + intermediate_predicted_corners=intermediate_predicted_corners, + initial_reference_points=initial_reference_points, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attentions, + ) + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for outputs of the RT-DETR encoder-decoder model. + """ +) +class DFineModelOutput(ModelOutput): + r""" + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the decoder of the model. + intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`): + Stacked intermediate hidden states (output of each layer of the decoder). + intermediate_logits (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, config.num_labels)`): + Stacked intermediate logits (logits of each layer of the decoder). + intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`): + Stacked intermediate reference points (reference points of each layer of the decoder). + intermediate_predicted_corners (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`): + Stacked intermediate predicted corners (predicted corners of each layer of the decoder). + initial_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): + Initial reference points used for the first decoder layer. + init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): + Initial reference points sent through the Transformer decoder. + enc_topk_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`): + Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are + picked as region proposals in the encoder stage. Output of bounding box binary classification (i.e. + foreground and background). 
+ enc_topk_bboxes (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`): + Logits of predicted bounding boxes coordinates in the encoder stage. + enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): + Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are + picked as region proposals in the first stage. Output of bounding box binary classification (i.e. + foreground and background). + enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): + Logits of predicted bounding boxes coordinates in the first stage. + denoising_meta_values (`dict`): + Extra dictionary for the denoising related values. + """ + + last_hidden_state: Optional[torch.FloatTensor] = None + intermediate_hidden_states: Optional[torch.FloatTensor] = None + intermediate_logits: Optional[torch.FloatTensor] = None + intermediate_reference_points: Optional[torch.FloatTensor] = None + intermediate_predicted_corners: Optional[torch.FloatTensor] = None + initial_reference_points: Optional[torch.FloatTensor] = None + decoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[tuple[torch.FloatTensor]] = None + cross_attentions: Optional[tuple[torch.FloatTensor]] = None + encoder_last_hidden_state: Optional[torch.FloatTensor] = None + encoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None + encoder_attentions: Optional[tuple[torch.FloatTensor]] = None + init_reference_points: Optional[torch.FloatTensor] = None + enc_topk_logits: Optional[torch.FloatTensor] = None + enc_topk_bboxes: Optional[torch.FloatTensor] = None + enc_outputs_class: Optional[torch.FloatTensor] = None + enc_outputs_coord_logits: Optional[torch.FloatTensor] = None + denoising_meta_values: Optional[dict] = None + + +class DFineFrozenBatchNorm2d(nn.Module): + """ + BatchNorm2d where the batch statistics and the affine parameters are fixed. + + Copy-paste from torchvision.misc.ops with added eps before rqsrt, without which any other models than + torchvision.models.resnet[18,34,50,101] produce nans. + """ + + def __init__(self, n): + super().__init__() + self.register_buffer("weight", torch.ones(n)) + self.register_buffer("bias", torch.zeros(n)) + self.register_buffer("running_mean", torch.zeros(n)) + self.register_buffer("running_var", torch.ones(n)) + + def _load_from_state_dict( + self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ): + num_batches_tracked_key = prefix + "num_batches_tracked" + if num_batches_tracked_key in state_dict: + del state_dict[num_batches_tracked_key] + + super()._load_from_state_dict( + state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ) + + def forward(self, x): + # move reshapes to the beginning + # to make it user-friendly + weight = self.weight.reshape(1, -1, 1, 1) + bias = self.bias.reshape(1, -1, 1, 1) + running_var = self.running_var.reshape(1, -1, 1, 1) + running_mean = self.running_mean.reshape(1, -1, 1, 1) + epsilon = 1e-5 + scale = weight * (running_var + epsilon).rsqrt() + bias = bias - running_mean * scale + return x * scale + bias + + +def replace_batch_norm(model): + r""" + Recursively replace all `torch.nn.BatchNorm2d` with `DFineFrozenBatchNorm2d`. 
+ + Args: + model (torch.nn.Module): + input model + """ + for name, module in model.named_children(): + if isinstance(module, nn.BatchNorm2d): + new_module = DFineFrozenBatchNorm2d(module.num_features) + + if module.weight.device != torch.device("meta"): + new_module.weight.data.copy_(module.weight) + new_module.bias.data.copy_(module.bias) + new_module.running_mean.data.copy_(module.running_mean) + new_module.running_var.data.copy_(module.running_var) + + model._modules[name] = new_module + + if len(list(module.children())) > 0: + replace_batch_norm(module) + + +class DFineConvEncoder(nn.Module): + """ + Convolutional backbone using the modeling_d_fine_resnet.py. + + nn.BatchNorm2d layers are replaced by DFineFrozenBatchNorm2d as defined above. + https://github.com/lyuwenyu/RT-DETR/blob/main/DFine_pytorch/src/nn/backbone/presnet.py#L142 + """ + + def __init__(self, config): + super().__init__() + + backbone = load_backbone(config) + + if config.freeze_backbone_batch_norms: + # replace batch norm by frozen batch norm + with torch.no_grad(): + replace_batch_norm(backbone) + self.model = backbone + self.intermediate_channel_sizes = self.model.channels + + def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor): + # send pixel_values through the model to get list of feature maps + features = self.model(pixel_values).feature_maps + + out = [] + for feature_map in features: + # downsample pixel_mask to match shape of corresponding feature_map + mask = nn.functional.interpolate(pixel_mask[None].float(), size=feature_map.shape[-2:]).to(torch.bool)[0] + out.append((feature_map, mask)) + return out + + +def get_contrastive_denoising_training_group( + targets, + num_classes, + num_queries, + class_embed, + num_denoising_queries=100, + label_noise_ratio=0.5, + box_noise_scale=1.0, +): + """ + Creates a contrastive denoising training group using ground-truth samples. It adds noise to labels and boxes. + + Args: + targets (`list[dict]`): + The target objects, each containing 'class_labels' and 'boxes' for objects in an image. + num_classes (`int`): + Total number of classes in the dataset. + num_queries (`int`): + Number of query slots in the transformer. + class_embed (`callable`): + A function or a model layer to embed class labels. + num_denoising_queries (`int`, *optional*, defaults to 100): + Number of denoising queries. + label_noise_ratio (`float`, *optional*, defaults to 0.5): + Ratio of noise applied to labels. + box_noise_scale (`float`, *optional*, defaults to 1.0): + Scale of noise applied to bounding boxes. + Returns: + `tuple` comprising various elements: + - **input_query_class** (`torch.FloatTensor`) -- + Class queries with applied label noise. + - **input_query_bbox** (`torch.FloatTensor`) -- + Bounding box queries with applied box noise. + - **attn_mask** (`torch.FloatTensor`) -- + Attention mask for separating denoising and reconstruction queries. + - **denoising_meta_values** (`dict`) -- + Metadata including denoising positive indices, number of groups, and split sizes. 
+ """ + + if num_denoising_queries <= 0: + return None, None, None, None + + num_ground_truths = [len(t["class_labels"]) for t in targets] + device = targets[0]["class_labels"].device + + max_gt_num = max(num_ground_truths) + if max_gt_num == 0: + return None, None, None, None + + num_groups_denoising_queries = num_denoising_queries // max_gt_num + num_groups_denoising_queries = 1 if num_groups_denoising_queries == 0 else num_groups_denoising_queries + # pad gt to max_num of a batch + batch_size = len(num_ground_truths) + + input_query_class = torch.full([batch_size, max_gt_num], num_classes, dtype=torch.int32, device=device) + input_query_bbox = torch.zeros([batch_size, max_gt_num, 4], device=device) + pad_gt_mask = torch.zeros([batch_size, max_gt_num], dtype=torch.bool, device=device) + + for i in range(batch_size): + num_gt = num_ground_truths[i] + if num_gt > 0: + input_query_class[i, :num_gt] = targets[i]["class_labels"] + input_query_bbox[i, :num_gt] = targets[i]["boxes"] + pad_gt_mask[i, :num_gt] = 1 + # each group has positive and negative queries. + input_query_class = input_query_class.tile([1, 2 * num_groups_denoising_queries]) + input_query_bbox = input_query_bbox.tile([1, 2 * num_groups_denoising_queries, 1]) + pad_gt_mask = pad_gt_mask.tile([1, 2 * num_groups_denoising_queries]) + # positive and negative mask + negative_gt_mask = torch.zeros([batch_size, max_gt_num * 2, 1], device=device) + negative_gt_mask[:, max_gt_num:] = 1 + negative_gt_mask = negative_gt_mask.tile([1, num_groups_denoising_queries, 1]) + positive_gt_mask = 1 - negative_gt_mask + # contrastive denoising training positive index + positive_gt_mask = positive_gt_mask.squeeze(-1) * pad_gt_mask + denoise_positive_idx = torch.nonzero(positive_gt_mask)[:, 1] + denoise_positive_idx = torch.split( + denoise_positive_idx, [n * num_groups_denoising_queries for n in num_ground_truths] + ) + # total denoising queries + num_denoising_queries = torch_int(max_gt_num * 2 * num_groups_denoising_queries) + + if label_noise_ratio > 0: + mask = torch.rand_like(input_query_class, dtype=torch.float) < (label_noise_ratio * 0.5) + # randomly put a new one here + new_label = torch.randint_like(mask, 0, num_classes, dtype=input_query_class.dtype) + input_query_class = torch.where(mask & pad_gt_mask, new_label, input_query_class) + + if box_noise_scale > 0: + known_bbox = center_to_corners_format(input_query_bbox) + diff = torch.tile(input_query_bbox[..., 2:] * 0.5, [1, 1, 2]) * box_noise_scale + rand_sign = torch.randint_like(input_query_bbox, 0, 2) * 2.0 - 1.0 + rand_part = torch.rand_like(input_query_bbox) + rand_part = (rand_part + 1.0) * negative_gt_mask + rand_part * (1 - negative_gt_mask) + rand_part *= rand_sign + known_bbox += rand_part * diff + known_bbox.clip_(min=0.0, max=1.0) + input_query_bbox = corners_to_center_format(known_bbox) + input_query_bbox = inverse_sigmoid(input_query_bbox) + + input_query_class = class_embed(input_query_class) + + target_size = num_denoising_queries + num_queries + attn_mask = torch.full([target_size, target_size], 0, dtype=torch.float, device=device) + # match query cannot see the reconstruction + attn_mask[num_denoising_queries:, :num_denoising_queries] = -torch.inf + + # reconstructions cannot see each other + for i in range(num_groups_denoising_queries): + idx_block_start = max_gt_num * 2 * i + idx_block_end = max_gt_num * 2 * (i + 1) + attn_mask[idx_block_start:idx_block_end, :idx_block_start] = -torch.inf + attn_mask[idx_block_start:idx_block_end, idx_block_end:num_denoising_queries] 
= -torch.inf + + denoising_meta_values = { + "dn_positive_idx": denoise_positive_idx, + "dn_num_group": num_groups_denoising_queries, + "dn_num_split": [num_denoising_queries, num_queries], + } + + return input_query_class, input_query_bbox, attn_mask, denoising_meta_values + + +@auto_docstring( + custom_intro=""" + RT-DETR Model (consisting of a backbone and encoder-decoder) outputting raw hidden states without any head on top. + """ +) +class DFineModel(DFinePreTrainedModel): + def __init__(self, config: DFineConfig): + super().__init__(config) + + # Create backbone + self.backbone = DFineConvEncoder(config) + intermediate_channel_sizes = self.backbone.intermediate_channel_sizes + num_backbone_outs = len(config.decoder_in_channels) + encoder_input_proj_list = [] + for _ in range(num_backbone_outs): + in_channels = intermediate_channel_sizes[_] + encoder_input_proj_list.append( + nn.Sequential( + nn.Conv2d(in_channels, config.encoder_hidden_dim, kernel_size=1, bias=False), + nn.BatchNorm2d(config.encoder_hidden_dim), + ) + ) + self.encoder_input_proj = nn.ModuleList(encoder_input_proj_list) + self.encoder = DFineHybridEncoder(config=config) + + # denoising part + if config.num_denoising > 0: + self.denoising_class_embed = nn.Embedding( + config.num_labels + 1, config.d_model, padding_idx=config.num_labels + ) + + # decoder embedding + if config.learn_initial_query: + self.weight_embedding = nn.Embedding(config.num_queries, config.d_model) + + # encoder head + self.enc_output = nn.Sequential( + nn.Linear(config.d_model, config.d_model), + nn.LayerNorm(config.d_model, eps=config.layer_norm_eps), + ) + self.enc_score_head = nn.Linear(config.d_model, config.num_labels) + self.enc_bbox_head = DFineMLPPredictionHead(config, config.d_model, config.d_model, 4, num_layers=3) + + # init encoder output anchors and valid_mask + if config.anchor_image_size: + self.anchors, self.valid_mask = self.generate_anchors(dtype=self.dtype) + num_backbone_outs = len(config.decoder_in_channels) + decoder_input_proj_list = [] + for _ in range(num_backbone_outs): + in_channels = config.decoder_in_channels[_] + decoder_input_proj_list.append( + nn.Sequential( + nn.Conv2d(in_channels, config.d_model, kernel_size=1, bias=False), + nn.BatchNorm2d(config.d_model, config.batch_norm_eps), + ) + ) + for _ in range(config.num_feature_levels - num_backbone_outs): + decoder_input_proj_list.append( + nn.Sequential( + nn.Conv2d(in_channels, config.d_model, kernel_size=3, stride=2, padding=1, bias=False), + nn.BatchNorm2d(config.d_model, config.batch_norm_eps), + ) + ) + in_channels = config.d_model + self.decoder = DFineDecoder(config) + decoder_input_proj = [] + in_channels = config.decoder_in_channels[-1] + for _ in range(num_backbone_outs): + if config.hidden_size == config.decoder_in_channels[-1]: + decoder_input_proj.append(nn.Identity()) + else: + conv = nn.Conv2d(in_channels, config.d_model, kernel_size=1, bias=False) + batchnorm = nn.BatchNorm2d(config.d_model, config.batch_norm_eps) + decoder_input_proj.append(nn.Sequential(conv, batchnorm)) + for _ in range(config.num_feature_levels - num_backbone_outs): + if config.hidden_size == config.decoder_in_channels[-1]: + decoder_input_proj.append(nn.Identity()) + else: + conv = nn.Conv2d(in_channels, config.d_model, kernel_size=3, stride=2, padding=1, bias=False) + batchnorm = nn.BatchNorm2d(config.d_model, config.batch_norm_eps) + decoder_input_proj.append(nn.Sequential(conv, batchnorm)) + self.decoder_input_proj = nn.ModuleList(decoder_input_proj) + + self.post_init() + + 
def get_encoder(self): + return self.encoder + + def freeze_backbone(self): + for param in self.backbone.parameters(): + param.requires_grad_(False) + + def unfreeze_backbone(self): + for param in self.backbone.parameters(): + param.requires_grad_(True) + + @compile_compatible_method_lru_cache(maxsize=32) + def generate_anchors(self, spatial_shapes=None, grid_size=0.05, device="cpu", dtype=torch.float32): + if spatial_shapes is None: + spatial_shapes = [ + [int(self.config.anchor_image_size[0] / s), int(self.config.anchor_image_size[1] / s)] + for s in self.config.feat_strides + ] + anchors = [] + for level, (height, width) in enumerate(spatial_shapes): + grid_y, grid_x = torch.meshgrid( + torch.arange(end=height, device=device).to(dtype), + torch.arange(end=width, device=device).to(dtype), + indexing="ij", + ) + grid_xy = torch.stack([grid_x, grid_y], -1) + grid_xy = grid_xy.unsqueeze(0) + 0.5 + grid_xy[..., 0] /= width + grid_xy[..., 1] /= height + wh = torch.ones_like(grid_xy) * grid_size * (2.0**level) + anchors.append(torch.concat([grid_xy, wh], -1).reshape(-1, height * width, 4)) + # define the valid range for anchor coordinates + eps = 1e-2 + anchors = torch.concat(anchors, 1) + valid_mask = ((anchors > eps) * (anchors < 1 - eps)).all(-1, keepdim=True) + anchors = torch.log(anchors / (1 - anchors)) + anchors = torch.where(valid_mask, anchors, torch.tensor(torch.finfo(dtype).max, dtype=dtype, device=device)) + + return anchors, valid_mask + + @auto_docstring + def forward( + self, + pixel_values: torch.FloatTensor, + pixel_mask: Optional[torch.LongTensor] = None, + encoder_outputs: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[list[dict]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.FloatTensor], DFineModelOutput]: + r""" + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you + can choose to directly pass a flattened representation of an image. + decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*): + Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an + embedded representation. + labels (`list[Dict]` of len `(batch_size,)`, *optional*): + Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the + following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch + respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes + in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`. 
+ + Examples: + + ```python + >>> from transformers import AutoImageProcessor, DFineModel + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("PekingU/DFine_r50vd") + >>> model = DFineModel.from_pretrained("PekingU/DFine_r50vd") + + >>> inputs = image_processor(images=image, return_tensors="pt") + + >>> outputs = model(**inputs) + + >>> last_hidden_states = outputs.last_hidden_state + >>> list(last_hidden_states.shape) + [1, 300, 256] + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + batch_size, num_channels, height, width = pixel_values.shape + device = pixel_values.device + + if pixel_mask is None: + pixel_mask = torch.ones(((batch_size, height, width)), device=device) + + features = self.backbone(pixel_values, pixel_mask) + + proj_feats = [self.encoder_input_proj[level](source) for level, (source, mask) in enumerate(features)] + + if encoder_outputs is None: + encoder_outputs = self.encoder( + proj_feats, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if output_hidden_states else None, + attentions=encoder_outputs[2] + if len(encoder_outputs) > 2 + else encoder_outputs[1] + if output_attentions + else None, + ) + + # Equivalent to def _get_encoder_input + # https://github.com/lyuwenyu/RT-DETR/blob/94f5e16708329d2f2716426868ec89aa774af016/DFine_pytorch/src/zoo/DFine/DFine_decoder.py#L412 + sources = [] + for level, source in enumerate(encoder_outputs[0]): + sources.append(self.decoder_input_proj[level](source)) + + # Lowest resolution feature maps are obtained via 3x3 stride 2 convolutions on the final stage + if self.config.num_feature_levels > len(sources): + _len_sources = len(sources) + sources.append(self.decoder_input_proj[_len_sources](encoder_outputs[0])[-1]) + for i in range(_len_sources + 1, self.config.num_feature_levels): + sources.append(self.decoder_input_proj[i](encoder_outputs[0][-1])) + + # Prepare encoder inputs (by flattening) + source_flatten = [] + spatial_shapes_list = [] + spatial_shapes = torch.empty((len(sources), 2), device=device, dtype=torch.long) + for level, source in enumerate(sources): + height, width = source.shape[-2:] + spatial_shapes[level, 0] = height + spatial_shapes[level, 1] = width + spatial_shapes_list.append((height, width)) + source = source.flatten(2).transpose(1, 2) + source_flatten.append(source) + source_flatten = torch.cat(source_flatten, 1) + level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1])) + + # prepare denoising training + if self.training and self.config.num_denoising > 0 and labels is not None: + ( + denoising_class, + denoising_bbox_unact, + attention_mask, + denoising_meta_values, + ) = get_contrastive_denoising_training_group( + targets=labels, + 
num_classes=self.config.num_labels, + num_queries=self.config.num_queries, + class_embed=self.denoising_class_embed, + num_denoising_queries=self.config.num_denoising, + label_noise_ratio=self.config.label_noise_ratio, + box_noise_scale=self.config.box_noise_scale, + ) + else: + denoising_class, denoising_bbox_unact, attention_mask, denoising_meta_values = None, None, None, None + + batch_size = len(source_flatten) + device = source_flatten.device + dtype = source_flatten.dtype + + # prepare input for decoder + if self.training or self.config.anchor_image_size is None: + # Pass spatial_shapes as tuple to make it hashable and make sure + # lru_cache is working for generate_anchors() + spatial_shapes_tuple = tuple(spatial_shapes_list) + anchors, valid_mask = self.generate_anchors(spatial_shapes_tuple, device=device, dtype=dtype) + else: + anchors, valid_mask = self.anchors, self.valid_mask + anchors, valid_mask = anchors.to(device, dtype), valid_mask.to(device, dtype) + + # use the valid_mask to selectively retain values in the feature map where the mask is `True` + memory = valid_mask.to(source_flatten.dtype) * source_flatten + + output_memory = self.enc_output(memory) + + enc_outputs_class = self.enc_score_head(output_memory) + enc_outputs_coord_logits = self.enc_bbox_head(output_memory) + anchors + + _, topk_ind = torch.topk(enc_outputs_class.max(-1).values, self.config.num_queries, dim=1) + + reference_points_unact = enc_outputs_coord_logits.gather( + dim=1, index=topk_ind.unsqueeze(-1).repeat(1, 1, enc_outputs_coord_logits.shape[-1]) + ) + + enc_topk_bboxes = F.sigmoid(reference_points_unact) + if denoising_bbox_unact is not None: + reference_points_unact = torch.concat([denoising_bbox_unact, reference_points_unact], 1) + + enc_topk_logits = enc_outputs_class.gather( + dim=1, index=topk_ind.unsqueeze(-1).repeat(1, 1, enc_outputs_class.shape[-1]) + ) + + # extract region features + if self.config.learn_initial_query: + target = self.weight_embedding.tile([batch_size, 1, 1]) + else: + target = output_memory.gather(dim=1, index=topk_ind.unsqueeze(-1).repeat(1, 1, output_memory.shape[-1])) + target = target.detach() + + if denoising_class is not None: + target = torch.concat([denoising_class, target], 1) + + init_reference_points = reference_points_unact.detach() + + # decoder + decoder_outputs = self.decoder( + inputs_embeds=target, + encoder_hidden_states=source_flatten, + encoder_attention_mask=attention_mask, + reference_points=init_reference_points, + spatial_shapes=spatial_shapes, + spatial_shapes_list=spatial_shapes_list, + level_start_index=level_start_index, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + enc_outputs = tuple( + value + for value in [enc_topk_logits, enc_topk_bboxes, enc_outputs_class, enc_outputs_coord_logits] + if value is not None + ) + dn_outputs = tuple(value if value is not None else None for value in [denoising_meta_values]) + tuple_outputs = decoder_outputs + encoder_outputs + (init_reference_points,) + enc_outputs + dn_outputs + + return tuple_outputs + + return DFineModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + intermediate_hidden_states=decoder_outputs.intermediate_hidden_states, + intermediate_logits=decoder_outputs.intermediate_logits, + intermediate_reference_points=decoder_outputs.intermediate_reference_points, + intermediate_predicted_corners=decoder_outputs.intermediate_predicted_corners, + 
initial_reference_points=decoder_outputs.initial_reference_points, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + init_reference_points=init_reference_points, + enc_topk_logits=enc_topk_logits, + enc_topk_bboxes=enc_topk_bboxes, + enc_outputs_class=enc_outputs_class, + enc_outputs_coord_logits=enc_outputs_coord_logits, + denoising_meta_values=denoising_meta_values, + ) + + +@dataclass +@auto_docstring( + custom_intro=""" + Output type of [`DFineForObjectDetection`]. + """ +) +class DFineObjectDetectionOutput(ModelOutput): + r""" + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided): + Total loss as a linear combination of a negative log-likelihood (cross-entropy) for class prediction and a + bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized + scale-invariant IoU loss. + loss_dict (`Dict`, *optional*): + A dictionary containing the individual losses. Useful for logging. + logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`): + Classification logits (including no-object) for all queries. + pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): + Normalized box coordinates for all queries, represented as (center_x, center_y, width, height). These + values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding + possible padding). You can use [`~DFineImageProcessor.post_process_object_detection`] to retrieve the + unnormalized (absolute) bounding boxes. + auxiliary_outputs (`list[Dict]`, *optional*): + Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`) + and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and + `pred_boxes`) for each decoder layer. + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the decoder of the model. + intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`): + Stacked intermediate hidden states (output of each layer of the decoder). + intermediate_logits (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, config.num_labels)`): + Stacked intermediate logits (logits of each layer of the decoder). + intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`): + Stacked intermediate reference points (reference points of each layer of the decoder). + intermediate_predicted_corners (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`): + Stacked intermediate predicted corners (predicted corners of each layer of the decoder). + initial_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`): + Stacked initial reference points (initial reference points of each layer of the decoder). + init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): + Initial reference points sent through the Transformer decoder. 
+ enc_topk_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): + Classification logits of the top-k encoder proposals selected to initialize the decoder queries. + enc_topk_bboxes (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): + Normalized bounding box coordinates (center_x, center_y, width, height) of the top-k encoder proposals. + enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): + Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are + picked as region proposals in the first stage. Output of bounding box binary classification (i.e. + foreground and background). + enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): + Logits of predicted bounding boxes coordinates in the first stage. + denoising_meta_values (`dict`): + Extra dictionary for the denoising-related values. + """ + + loss: Optional[torch.FloatTensor] = None + loss_dict: Optional[dict] = None + logits: Optional[torch.FloatTensor] = None + pred_boxes: Optional[torch.FloatTensor] = None + auxiliary_outputs: Optional[list[dict]] = None + last_hidden_state: Optional[torch.FloatTensor] = None + intermediate_hidden_states: Optional[torch.FloatTensor] = None + intermediate_logits: Optional[torch.FloatTensor] = None + intermediate_reference_points: Optional[torch.FloatTensor] = None + intermediate_predicted_corners: Optional[torch.FloatTensor] = None + initial_reference_points: Optional[torch.FloatTensor] = None + decoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[tuple[torch.FloatTensor]] = None + cross_attentions: Optional[tuple[torch.FloatTensor]] = None + encoder_last_hidden_state: Optional[torch.FloatTensor] = None + encoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None + encoder_attentions: Optional[tuple[torch.FloatTensor]] = None + init_reference_points: Optional[tuple[torch.FloatTensor]] = None + enc_topk_logits: Optional[torch.FloatTensor] = None + enc_topk_bboxes: Optional[torch.FloatTensor] = None + enc_outputs_class: Optional[torch.FloatTensor] = None + enc_outputs_coord_logits: Optional[torch.FloatTensor] = None + denoising_meta_values: Optional[dict] = None + + +@auto_docstring( + custom_intro=""" + RT-DETR Model (consisting of a backbone and encoder-decoder) outputting bounding boxes and logits to be further + decoded into scores and classes. 
+ """ +) +class DFineForObjectDetection(DFinePreTrainedModel): + # When using clones, all layers > 0 will be clones, but layer 0 *is* required + _tied_weights_keys = ["bbox_embed", "class_embed"] + # We can't initialize the model on meta device as some weights are modified during the initialization + _no_split_modules = None + + def __init__(self, config: DFineConfig): + super().__init__(config) + + # D-FINE encoder-decoder model + self.eval_idx = config.eval_idx if config.eval_idx >= 0 else config.decoder_layers + config.eval_idx + self.model = DFineModel(config) + scaled_dim = round(config.layer_scale * config.hidden_size) + num_pred = config.decoder_layers + self.class_embed = nn.ModuleList([nn.Linear(config.d_model, config.num_labels) for _ in range(num_pred)]) + self.bbox_embed = nn.ModuleList( + [ + DFineMLP(config.hidden_size, config.hidden_size, 4 * (config.max_num_bins + 1), 3) + for _ in range(self.eval_idx + 1) + ] + + [ + DFineMLP(scaled_dim, scaled_dim, 4 * (config.max_num_bins + 1), 3) + for _ in range(config.decoder_layers - self.eval_idx - 1) + ] + ) + + # here self.model.decoder.bbox_embed is null, but not self.bbox_embed + self.model.decoder.class_embed = self.class_embed + self.model.decoder.bbox_embed = self.bbox_embed + + # Initialize weights and apply final processing + self.post_init() + + @torch.jit.unused + def _set_aux_loss(self, outputs_class, outputs_coord): + # this is a workaround to make torchscript happy, as torchscript + # doesn't support dictionary with non-homogeneous values, such + # as a dict having both a Tensor and a list. + return [{"logits": a, "pred_boxes": b} for a, b in zip(outputs_class, outputs_coord)] + + @auto_docstring + def forward( + self, + pixel_values: torch.FloatTensor, + pixel_mask: Optional[torch.LongTensor] = None, + encoder_outputs: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[list[dict]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ) -> Union[tuple[torch.FloatTensor], DFineObjectDetectionOutput]: + r""" + Example: + + ```python + >>> import torch + >>> from transformers.image_utils import load_image + >>> from transformers import AutoImageProcessor, DFineForObjectDetection + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = load_image(url) + + >>> image_processor = AutoImageProcessor.from_pretrained("ustc-community/dfine-xlarge-coco") + >>> model = DFineForObjectDetection.from_pretrained("ustc-community/dfine-xlarge-coco") + + >>> # prepare image for the model + >>> inputs = image_processor(images=image, return_tensors="pt") + + >>> # forward pass + >>> outputs = model(**inputs) + + >>> logits = outputs.logits + >>> list(logits.shape) + [1, 300, 80] + + >>> boxes = outputs.pred_boxes + >>> list(boxes.shape) + [1, 300, 4] + + >>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax) + >>> target_sizes = torch.tensor([image.size[::-1]]) + >>> results = image_processor.post_process_object_detection(outputs, threshold=0.9, target_sizes=target_sizes) + >>> result = results[0] # first image in batch + + >>> for score, label, box in zip(result["scores"], result["labels"], result["boxes"]): + ... box = [round(i, 2) for i in box.tolist()] + ... print( + ... f"Detected {model.config.id2label[label.item()]} with confidence " + ... 
f"{round(score.item(), 3)} at location {box}" + ... ) + Detected cat with confidence 0.958 at location [344.49, 23.4, 639.84, 374.27] + Detected cat with confidence 0.956 at location [11.71, 53.52, 316.64, 472.33] + Detected remote with confidence 0.947 at location [40.46, 73.7, 175.62, 117.57] + Detected sofa with confidence 0.918 at location [0.59, 1.88, 640.25, 474.74] + ``` + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.model( + pixel_values, + pixel_mask=pixel_mask, + encoder_outputs=encoder_outputs, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + labels=labels, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + denoising_meta_values = ( + outputs.denoising_meta_values if return_dict else outputs[-1] if self.training else None + ) + + outputs_class = outputs.intermediate_logits if return_dict else outputs[2] + outputs_coord = outputs.intermediate_reference_points if return_dict else outputs[3] + predicted_corners = outputs.intermediate_predicted_corners if return_dict else outputs[4] + initial_reference_points = outputs.initial_reference_points if return_dict else outputs[5] + + logits = outputs_class[:, -1] + pred_boxes = outputs_coord[:, -1] + + loss, loss_dict, auxiliary_outputs, enc_topk_logits, enc_topk_bboxes = None, None, None, None, None + if labels is not None: + enc_topk_logits = outputs.enc_topk_logits if return_dict else outputs[-5] + enc_topk_bboxes = outputs.enc_topk_bboxes if return_dict else outputs[-4] + loss, loss_dict, auxiliary_outputs = self.loss_function( + logits, + labels, + self.device, + pred_boxes, + self.config, + outputs_class, + outputs_coord, + enc_topk_logits=enc_topk_logits, + enc_topk_bboxes=enc_topk_bboxes, + denoising_meta_values=denoising_meta_values, + predicted_corners=predicted_corners, + initial_reference_points=initial_reference_points, + **kwargs, + ) + + if not return_dict: + if auxiliary_outputs is not None: + output = (logits, pred_boxes) + (auxiliary_outputs,) + outputs + else: + output = (logits, pred_boxes) + outputs + return ((loss, loss_dict) + output) if loss is not None else output + + return DFineObjectDetectionOutput( + loss=loss, + loss_dict=loss_dict, + logits=logits, + pred_boxes=pred_boxes, + auxiliary_outputs=auxiliary_outputs, + last_hidden_state=outputs.last_hidden_state, + intermediate_hidden_states=outputs.intermediate_hidden_states, + intermediate_logits=outputs.intermediate_logits, + intermediate_reference_points=outputs.intermediate_reference_points, + intermediate_predicted_corners=outputs.intermediate_predicted_corners, + initial_reference_points=outputs.initial_reference_points, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + init_reference_points=outputs.init_reference_points, + enc_topk_logits=outputs.enc_topk_logits, + enc_topk_bboxes=outputs.enc_topk_bboxes, + enc_outputs_class=outputs.enc_outputs_class, + 
enc_outputs_coord_logits=outputs.enc_outputs_coord_logits, + denoising_meta_values=outputs.denoising_meta_values, + ) + + +# taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py +class DFineMLPPredictionHead(nn.Module): + """ + Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates, + height and width of a bounding box w.r.t. an image. + + Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py + Origin from https://github.com/lyuwenyu/RT-DETR/blob/94f5e16708329d2f2716426868ec89aa774af016/DFine_paddle/ppdet/modeling/transformers/utils.py#L453 + + """ + + def __init__(self, config, input_dim, d_model, output_dim, num_layers): + super().__init__() + self.num_layers = num_layers + h = [d_model] * (num_layers - 1) + self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x + + +class DFineMLP(nn.Module): + def __init__(self, input_dim: int, hidden_dim: int, output_dim: int, num_layers: int, act: str = "relu"): + super().__init__() + self.num_layers = num_layers + hidden_dims = [hidden_dim] * (num_layers - 1) + input_dims = [input_dim] + hidden_dims + output_dims = hidden_dims + [output_dim] + self.layers = nn.ModuleList(nn.Linear(in_dim, out_dim) for in_dim, out_dim in zip(input_dims, output_dims)) + self.act = ACT2CLS[act]() + + def forward(self, stat_features: torch.Tensor) -> torch.Tensor: + for i, layer in enumerate(self.layers): + stat_features = self.act(layer(stat_features)) if i < self.num_layers - 1 else layer(stat_features) + return stat_features + + +class DFineLQE(nn.Module): + def __init__(self, config: DFineConfig): + super().__init__() + self.top_prob_values = config.top_prob_values + self.max_num_bins = config.max_num_bins + self.reg_conf = DFineMLP(4 * (self.top_prob_values + 1), config.lqe_hidden_dim, 1, config.lqe_layers) + + def forward(self, scores: torch.Tensor, pred_corners: torch.Tensor) -> torch.Tensor: + batch_size, length, _ = pred_corners.size() + prob = F.softmax(pred_corners.reshape(batch_size, length, 4, self.max_num_bins + 1), dim=-1) + prob_topk, _ = prob.topk(self.top_prob_values, dim=-1) + stat = torch.cat([prob_topk, prob_topk.mean(dim=-1, keepdim=True)], dim=-1) + quality_score = self.reg_conf(stat.reshape(batch_size, length, -1)) + scores = scores + quality_score + return scores + + +class DFineConvNormLayer(nn.Module): + def __init__( + self, + config: DFineConfig, + in_channels: int, + out_channels: int, + kernel_size: int, + stride: int, + groups: int = 1, + padding: Optional[int] = None, + activation: Optional[str] = None, + ): + super().__init__() + self.conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size, + stride, + groups=groups, + padding=(kernel_size - 1) // 2 if padding is None else padding, + bias=False, + ) + self.norm = nn.BatchNorm2d(out_channels, config.batch_norm_eps) + self.activation = nn.Identity() if activation is None else ACT2CLS[activation]() + + def forward(self, hidden_state): + hidden_state = self.conv(hidden_state) + hidden_state = self.norm(hidden_state) + hidden_state = self.activation(hidden_state) + return hidden_state + + +class DFineRepVggBlock(nn.Module): + """ + RepVGG architecture block introduced by the work "RepVGG: Making VGG-style ConvNets Great Again". 
+ """ + + def __init__(self, config: DFineConfig, in_channels: int, out_channels: int): + super().__init__() + + activation = config.activation_function + hidden_channels = in_channels + self.conv1 = DFineConvNormLayer(config, hidden_channels, out_channels, 3, 1, padding=1) + self.conv2 = DFineConvNormLayer(config, hidden_channels, out_channels, 1, 1, padding=0) + self.activation = nn.Identity() if activation is None else ACT2CLS[activation]() + + def forward(self, x): + y = self.conv1(x) + self.conv2(x) + return self.activation(y) + + +class DFineCSPRepLayer(nn.Module): + """ + Cross Stage Partial (CSP) network layer with RepVGG blocks. + """ + + def __init__( + self, config: DFineConfig, in_channels: int, out_channels: int, num_blocks: int, expansion: float = 1.0 + ): + super().__init__() + activation = config.activation_function + + hidden_channels = int(out_channels * expansion) + self.conv1 = DFineConvNormLayer(config, in_channels, hidden_channels, 1, 1, activation=activation) + self.conv2 = DFineConvNormLayer(config, in_channels, hidden_channels, 1, 1, activation=activation) + self.bottlenecks = nn.ModuleList( + [DFineRepVggBlock(config, hidden_channels, hidden_channels) for _ in range(num_blocks)] + ) + if hidden_channels != out_channels: + self.conv3 = DFineConvNormLayer(config, hidden_channels, out_channels, 1, 1, activation=activation) + else: + self.conv3 = nn.Identity() + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + hidden_state_1 = self.conv1(hidden_state) + for bottleneck in self.bottlenecks: + hidden_state_1 = bottleneck(hidden_state_1) + hidden_state_2 = self.conv2(hidden_state) + hidden_state_3 = self.conv3(hidden_state_1 + hidden_state_2) + return hidden_state_3 + + +class DFineRepNCSPELAN4(nn.Module): + def __init__(self, config: DFineConfig, act: str = "silu", numb_blocks: int = 3): + super().__init__() + conv1_dim = config.encoder_hidden_dim * 2 + conv2_dim = config.encoder_hidden_dim + conv3_dim = config.encoder_hidden_dim * 2 + conv4_dim = round(config.hidden_expansion * config.encoder_hidden_dim // 2) + self.conv_dim = conv3_dim // 2 + self.conv1 = DFineConvNormLayer(config, conv1_dim, conv3_dim, 1, 1, activation=act) + self.csp_rep1 = DFineCSPRepLayer(config, conv3_dim // 2, conv4_dim, num_blocks=numb_blocks) + self.conv2 = DFineConvNormLayer(config, conv4_dim, conv4_dim, 3, 1, activation=act) + self.csp_rep2 = DFineCSPRepLayer(config, conv4_dim, conv4_dim, num_blocks=numb_blocks) + self.conv3 = DFineConvNormLayer(config, conv4_dim, conv4_dim, 3, 1, activation=act) + self.conv4 = DFineConvNormLayer(config, conv3_dim + (2 * conv4_dim), conv2_dim, 1, 1, activation=act) + + def forward(self, input_features: torch.Tensor) -> torch.Tensor: + # Split initial features into two branches after first convolution + split_features = list(self.conv1(input_features).split((self.conv_dim, self.conv_dim), 1)) + + # Process branches sequentially + branch1 = self.csp_rep1(split_features[-1]) + branch1 = self.conv2(branch1) + branch2 = self.csp_rep2(branch1) + branch2 = self.conv3(branch2) + + split_features.extend([branch1, branch2]) + merged_features = torch.cat(split_features, 1) + merged_features = self.conv4(merged_features) + return merged_features + + +class DFineSCDown(nn.Module): + def __init__(self, config: DFineConfig, kernel_size: int, stride: int): + super().__init__() + self.conv1 = DFineConvNormLayer(config, config.encoder_hidden_dim, config.encoder_hidden_dim, 1, 1) + self.conv2 = DFineConvNormLayer( + config, + config.encoder_hidden_dim, + 
config.encoder_hidden_dim, + kernel_size, + stride, + config.encoder_hidden_dim, + ) + + def forward(self, input_features: torch.Tensor) -> torch.Tensor: + input_features = self.conv1(input_features) + input_features = self.conv2(input_features) + return input_features + + +class DFineEncoderLayer(nn.Module): + def __init__(self, config: DFineConfig): + super().__init__() + self.normalize_before = config.normalize_before + + # self-attention + self.self_attn = DFineMultiheadAttention( + embed_dim=config.encoder_hidden_dim, + num_heads=config.num_attention_heads, + dropout=config.dropout, + ) + self.self_attn_layer_norm = nn.LayerNorm(config.encoder_hidden_dim, eps=config.layer_norm_eps) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.encoder_activation_function] + self.activation_dropout = config.activation_dropout + self.fc1 = nn.Linear(config.encoder_hidden_dim, config.encoder_ffn_dim) + self.fc2 = nn.Linear(config.encoder_ffn_dim, config.encoder_hidden_dim) + self.final_layer_norm = nn.LayerNorm(config.encoder_hidden_dim, eps=config.layer_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + position_embeddings: Optional[torch.Tensor] = None, + output_attentions: bool = False, + **kwargs, + ): + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative + values. + position_embeddings (`torch.FloatTensor`, *optional*): + Object queries (also called content embeddings), to be added to the hidden states. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. 
+ """ + residual = hidden_states + if self.normalize_before: + hidden_states = self.self_attn_layer_norm(hidden_states) + + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_embeddings=position_embeddings, + output_attentions=output_attentions, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + if not self.normalize_before: + hidden_states = self.self_attn_layer_norm(hidden_states) + + if self.normalize_before: + hidden_states = self.final_layer_norm(hidden_states) + residual = hidden_states + + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + + hidden_states = self.fc2(hidden_states) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + hidden_states = residual + hidden_states + if not self.normalize_before: + hidden_states = self.final_layer_norm(hidden_states) + + if self.training: + if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class DFineEncoder(nn.Module): + def __init__(self, config: DFineConfig): + super().__init__() + + self.layers = nn.ModuleList([DFineEncoderLayer(config) for _ in range(config.encoder_layers)]) + + def forward(self, src, src_mask=None, pos_embed=None, output_attentions: bool = False) -> torch.Tensor: + hidden_states = src + for layer in self.layers: + hidden_states = layer( + hidden_states, + attention_mask=src_mask, + position_embeddings=pos_embed, + output_attentions=output_attentions, + ) + return hidden_states + + +class DFineHybridEncoder(nn.Module): + """ + Decoder consisting of a projection layer, a set of `DFineEncoder`, a top-down Feature Pyramid Network + (FPN) and a bottom-up Path Aggregation Network (PAN). 
More details on the paper: https://huggingface.co/papers/2304.08069 + + Args: + config: DFineConfig + """ + + def __init__(self, config: DFineConfig): + super().__init__() + self.config = config + self.in_channels = config.encoder_in_channels + self.num_fpn_stages = len(self.in_channels) - 1 + self.feat_strides = config.feat_strides + self.encoder_hidden_dim = config.encoder_hidden_dim + self.encode_proj_layers = config.encode_proj_layers + self.positional_encoding_temperature = config.positional_encoding_temperature + self.eval_size = config.eval_size + self.out_channels = [self.encoder_hidden_dim for _ in self.in_channels] + self.out_strides = self.feat_strides + + # encoder transformer + self.encoder = nn.ModuleList([DFineEncoder(config) for _ in range(len(self.encode_proj_layers))]) + # top-down fpn + self.lateral_convs = nn.ModuleList() + self.fpn_blocks = nn.ModuleList() + for _ in range(len(self.in_channels) - 1, 0, -1): + lateral_layer = DFineConvNormLayer(config, self.encoder_hidden_dim, self.encoder_hidden_dim, 1, 1) + self.lateral_convs.append(lateral_layer) + num_blocks = round(3 * config.depth_mult) + fpn_layer = DFineRepNCSPELAN4(config, numb_blocks=num_blocks) + self.fpn_blocks.append(fpn_layer) + + # bottom-up pan + self.downsample_convs = nn.ModuleList() + self.pan_blocks = nn.ModuleList() + for _ in range(len(self.in_channels) - 1): + self.downsample_convs.append(DFineSCDown(config, 3, 2)) + num_blocks = round(3 * config.depth_mult) + self.pan_blocks.append(DFineRepNCSPELAN4(config, numb_blocks=num_blocks)) + + @staticmethod + def build_2d_sincos_position_embedding( + width, height, embed_dim=256, temperature=10000.0, device="cpu", dtype=torch.float32 + ): + grid_w = torch.arange(torch_int(width), device=device).to(dtype) + grid_h = torch.arange(torch_int(height), device=device).to(dtype) + grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing="ij") + if embed_dim % 4 != 0: + raise ValueError("Embed dimension must be divisible by 4 for 2D sin-cos position embedding") + pos_dim = embed_dim // 4 + omega = torch.arange(pos_dim, device=device).to(dtype) / pos_dim + omega = 1.0 / (temperature**omega) + + out_w = grid_w.flatten()[..., None] @ omega[None] + out_h = grid_h.flatten()[..., None] @ omega[None] + + return torch.concat([out_w.sin(), out_w.cos(), out_h.sin(), out_h.cos()], dim=1)[None, :, :] + + def forward( + self, + inputs_embeds=None, + attention_mask=None, + position_embeddings=None, + spatial_shapes=None, + level_start_index=None, + valid_ratios=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Flattened feature map (output of the backbone + projection layer) that is passed to the encoder. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding pixel features. Mask values selected in `[0, 1]`: + - 1 for pixel features that are real (i.e. **not masked**), + - 0 for pixel features that are padding (i.e. **masked**). + [What are attention masks?](../glossary#attention-mask) + position_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Position embeddings that are added to the queries and keys in each self-attention layer. + spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`): + Spatial shapes of each feature map. 
+ level_start_index (`torch.LongTensor` of shape `(num_feature_levels)`): + Starting index of each feature map. + valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`): + Ratio of valid area in each feature level. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + hidden_states = inputs_embeds + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + # encoder + if self.config.encoder_layers > 0: + for i, enc_ind in enumerate(self.encode_proj_layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states[enc_ind],) + height, width = hidden_states[enc_ind].shape[2:] + # flatten [batch, channel, height, width] to [batch, height*width, channel] + src_flatten = hidden_states[enc_ind].flatten(2).permute(0, 2, 1) + if self.training or self.eval_size is None: + pos_embed = self.build_2d_sincos_position_embedding( + width, + height, + self.encoder_hidden_dim, + self.positional_encoding_temperature, + device=src_flatten.device, + dtype=src_flatten.dtype, + ) + else: + pos_embed = None + + layer_outputs = self.encoder[i]( + src_flatten, + pos_embed=pos_embed, + output_attentions=output_attentions, + ) + hidden_states[enc_ind] = ( + layer_outputs[0].permute(0, 2, 1).reshape(-1, self.encoder_hidden_dim, height, width).contiguous() + ) + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states[enc_ind],) + + # top-down FPN + fpn_feature_maps = [hidden_states[-1]] + for idx, (lateral_conv, fpn_block) in enumerate(zip(self.lateral_convs, self.fpn_blocks)): + backbone_feature_map = hidden_states[self.num_fpn_stages - idx - 1] + top_fpn_feature_map = fpn_feature_maps[-1] + # apply lateral block + top_fpn_feature_map = lateral_conv(top_fpn_feature_map) + fpn_feature_maps[-1] = top_fpn_feature_map + # apply fpn block + top_fpn_feature_map = F.interpolate(top_fpn_feature_map, scale_factor=2.0, mode="nearest") + fused_feature_map = torch.concat([top_fpn_feature_map, backbone_feature_map], dim=1) + new_fpn_feature_map = fpn_block(fused_feature_map) + fpn_feature_maps.append(new_fpn_feature_map) + + fpn_feature_maps = fpn_feature_maps[::-1] + + # bottom-up PAN + pan_feature_maps = [fpn_feature_maps[0]] + for idx, (downsample_conv, pan_block) in enumerate(zip(self.downsample_convs, self.pan_blocks)): + top_pan_feature_map = pan_feature_maps[-1] + fpn_feature_map = fpn_feature_maps[idx + 1] + downsampled_feature_map = downsample_conv(top_pan_feature_map) + fused_feature_map = torch.concat([downsampled_feature_map, fpn_feature_map], dim=1) + new_pan_feature_map = pan_block(fused_feature_map) + pan_feature_maps.append(new_pan_feature_map) + + if not return_dict: + return 
tuple(v for v in [pan_feature_maps, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=pan_feature_maps, hidden_states=encoder_states, attentions=all_attentions + ) + + +__all__ = ["DFineModel", "DFinePreTrainedModel", "DFineForObjectDetection"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/d_fine/modular_d_fine.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/d_fine/modular_d_fine.py new file mode 100644 index 0000000000000000000000000000000000000000..9a41fb23308eefb77878b3a3f6bee4a25992eb10 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/d_fine/modular_d_fine.py @@ -0,0 +1,1229 @@ +# coding=utf-8 +# Copyright 2025 Baidu Inc and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import math +from typing import Any, Optional + +import torch +import torch.nn.functional as F +import torch.nn.init as init +from torch import nn + +from ...activations import ACT2CLS +from ...configuration_utils import PretrainedConfig +from ...image_transforms import corners_to_center_format +from ...utils import is_torchdynamo_compiling, logging +from ...utils.backbone_utils import verify_backbone_config_arguments +from ..auto import CONFIG_MAPPING +from ..rt_detr.modeling_rt_detr import ( + RTDetrConvNormLayer, + RTDetrDecoder, + RTDetrDecoderLayer, + RTDetrDecoderOutput, + RTDetrEncoder, + RTDetrForObjectDetection, + RTDetrHybridEncoder, + RTDetrMLPPredictionHead, + RTDetrModel, + RTDetrPreTrainedModel, + RTDetrRepVggBlock, + inverse_sigmoid, +) +from ..rt_detr_v2.modeling_rt_detr_v2 import multi_scale_deformable_attention_v2 + + +logger = logging.get_logger(__name__) + + +# TODO: Attribute map assignment logic should be fixed in modular +# as well as super() call parsing because otherwise we cannot re-write args after initialization +class DFineConfig(PretrainedConfig): + """ + This is the configuration class to store the configuration of a [`DFineModel`]. It is used to instantiate a D-FINE + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of D-FINE-X-COCO "[ustc-community/dfine-xlarge-coco"](https://huggingface.co/ustc-community/dfine-xlarge-coco"). + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + initializer_range (`float`, *optional*, defaults to 0.01): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + initializer_bias_prior_prob (`float`, *optional*): + The prior probability used by the bias initializer to initialize biases for `enc_score_head` and `class_embed`. + If `None`, `prior_prob` computed as `prior_prob = 1 / (num_labels + 1)` while initializing model weights. 
+ layer_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the layer normalization layers. + batch_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the batch normalization layers. + backbone_config (`Dict`, *optional*, defaults to `RTDetrResNetConfig()`): + The configuration of the backbone model. + backbone (`str`, *optional*): + Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this + will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone` + is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. + use_pretrained_backbone (`bool`, *optional*, defaults to `False`): + Whether to use pretrained weights for the backbone. + use_timm_backbone (`bool`, *optional*, defaults to `False`): + Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers + library. + freeze_backbone_batch_norms (`bool`, *optional*, defaults to `True`): + Whether to freeze the batch normalization layers in the backbone. + backbone_kwargs (`dict`, *optional*): + Keyword arguments to be passed to AutoBackbone when loading from a checkpoint + e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set. + encoder_hidden_dim (`int`, *optional*, defaults to 256): + Dimension of the layers in hybrid encoder. + encoder_in_channels (`list`, *optional*, defaults to `[512, 1024, 2048]`): + Multi level features input for encoder. + feat_strides (`list[int]`, *optional*, defaults to `[8, 16, 32]`): + Strides used in each feature map. + encoder_layers (`int`, *optional*, defaults to 1): + Total of layers to be used by the encoder. + encoder_ffn_dim (`int`, *optional*, defaults to 1024): + Dimension of the "intermediate" (often named feed-forward) layer in decoder. + encoder_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer encoder. + dropout (`float`, *optional*, defaults to 0.0): + The ratio for all dropout layers. + activation_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for activations inside the fully connected layer. + encode_proj_layers (`list[int]`, *optional*, defaults to `[2]`): + Indexes of the projected layers to be used in the encoder. + positional_encoding_temperature (`int`, *optional*, defaults to 10000): + The temperature parameter used to create the positional encodings. + encoder_activation_function (`str`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + activation_function (`str`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the general layer. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + eval_size (`tuple[int, int]`, *optional*): + Height and width used to computes the effective height and width of the position embeddings after taking + into account the stride. + normalize_before (`bool`, *optional*, defaults to `False`): + Determine whether to apply layer normalization in the transformer encoder layer before self-attention and + feed-forward modules. + hidden_expansion (`float`, *optional*, defaults to 1.0): + Expansion ratio to enlarge the dimension size of RepVGGBlock and CSPRepLayer. 
+ d_model (`int`, *optional*, defaults to 256): + Dimension of the layers excluding the hybrid encoder. + num_queries (`int`, *optional*, defaults to 300): + Number of object queries. + decoder_in_channels (`list`, *optional*, defaults to `[256, 256, 256]`): + Multi level features dimension for the decoder. + decoder_ffn_dim (`int`, *optional*, defaults to 1024): + Dimension of the "intermediate" (often named feed-forward) layer in decoder. + num_feature_levels (`int`, *optional*, defaults to 3): + The number of input feature levels. + decoder_n_points (`int`, *optional*, defaults to 4): + The number of sampled keys in each feature level for each attention head in the decoder. + decoder_layers (`int`, *optional*, defaults to 6): + Number of decoder layers. + decoder_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer decoder. + decoder_activation_function (`str`, *optional*, defaults to `"relu"`): + The non-linear activation function (function or string) in the decoder. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + num_denoising (`int`, *optional*, defaults to 100): + The total number of denoising tasks or queries to be used for contrastive denoising. + label_noise_ratio (`float`, *optional*, defaults to 0.5): + The fraction of denoising labels to which random noise should be added. + box_noise_scale (`float`, *optional*, defaults to 1.0): + Scale or magnitude of noise to be added to the bounding boxes. + learn_initial_query (`bool`, *optional*, defaults to `False`): + Indicates whether the initial query embeddings for the decoder should be learned during training. + anchor_image_size (`tuple[int, int]`, *optional*): + Height and width of the input image used during evaluation to generate the bounding box anchors. If `None`, anchors are generated automatically. + with_box_refine (`bool`, *optional*, defaults to `True`): + Whether to apply iterative bounding box refinement, where each decoder layer refines the bounding boxes + based on the predictions from the previous layer. + is_encoder_decoder (`bool`, *optional*, defaults to `True`): + Whether the architecture has an encoder-decoder structure. + matcher_alpha (`float`, *optional*, defaults to 0.25): + Parameter alpha used by the Hungarian Matcher. + matcher_gamma (`float`, *optional*, defaults to 2.0): + Parameter gamma used by the Hungarian Matcher. + matcher_class_cost (`float`, *optional*, defaults to 2.0): + The relative weight of the class loss used by the Hungarian Matcher. + matcher_bbox_cost (`float`, *optional*, defaults to 5.0): + The relative weight of the bounding box loss used by the Hungarian Matcher. + matcher_giou_cost (`float`, *optional*, defaults to 2.0): + The relative weight of the giou loss used by the Hungarian Matcher. + use_focal_loss (`bool`, *optional*, defaults to `True`): + Parameter informing if focal loss should be used. + auxiliary_loss (`bool`, *optional*, defaults to `True`): + Whether auxiliary decoding losses (loss at each decoder layer) are to be used. + focal_loss_alpha (`float`, *optional*, defaults to 0.75): + Parameter alpha used to compute the focal loss. + focal_loss_gamma (`float`, *optional*, defaults to 2.0): + Parameter gamma used to compute the focal loss. + weight_loss_vfl (`float`, *optional*, defaults to 1.0): + Relative weight of the varifocal loss in the object detection loss. 
+ weight_loss_bbox (`float`, *optional*, defaults to 5.0): + Relative weight of the L1 bounding box loss in the object detection loss. + weight_loss_giou (`float`, *optional*, defaults to 2.0): + Relative weight of the generalized IoU loss in the object detection loss. + weight_loss_fgl (`float`, *optional*, defaults to 0.15): + Relative weight of the fine-grained localization loss in the object detection loss. + weight_loss_ddf (`float`, *optional*, defaults to 1.5): + Relative weight of the decoupled distillation focal loss in the object detection loss. + eos_coefficient (`float`, *optional*, defaults to 0.0001): + Relative classification weight of the 'no-object' class in the object detection loss. + eval_idx (`int`, *optional*, defaults to -1): + Index of the decoder layer to use for evaluation. If negative, counts from the end + (e.g., -1 means use the last layer). This allows for early prediction in the decoder + stack while still training later layers. + layer_scale (`float`, *optional*, defaults to `1.0`): + Scaling factor for the hidden dimension in later decoder layers. Used to adjust the + model capacity after the evaluation layer. + max_num_bins (`int`, *optional*, defaults to 32): + Maximum number of bins for the distribution-guided bounding box refinement. + Higher values allow for more fine-grained localization but increase computation. + reg_scale (`float`, *optional*, defaults to 4.0): + Scale factor for the regression distribution. Controls the range and granularity + of the bounding box refinement process. + depth_mult (`float`, *optional*, defaults to 1.0): + Multiplier for the number of blocks in RepNCSPELAN4 layers. Used to scale the model's + depth while maintaining its architecture. + top_prob_values (`int`, *optional*, defaults to 4): + Number of top probability values to consider from each corner's distribution. + lqe_hidden_dim (`int`, *optional*, defaults to 64): + Hidden dimension size for the Location Quality Estimator (LQE) network. + lqe_layers (`int`, *optional*, defaults to 2): + Number of layers in the Location Quality Estimator MLP. + decoder_offset_scale (`float`, *optional*, defaults to 0.5): + Offset scale used in deformable attention. + decoder_method (`str`, *optional*, defaults to `"default"`): + The method to use for the decoder: `"default"` or `"discrete"`. + up (`float`, *optional*, defaults to 0.5): + Controls the upper bounds of the Weighting Function. 
+ """ + + model_type = "d_fine" + layer_types = ["basic", "bottleneck"] + attribute_map = { + "hidden_size": "d_model", + "num_attention_heads": "encoder_attention_heads", + } + + def __init__( + self, + initializer_range=0.01, + initializer_bias_prior_prob=None, + layer_norm_eps=1e-5, + batch_norm_eps=1e-5, + # backbone + backbone_config=None, + backbone=None, + use_pretrained_backbone=False, + use_timm_backbone=False, + freeze_backbone_batch_norms=True, + backbone_kwargs=None, + # encoder HybridEncoder + encoder_hidden_dim=256, + encoder_in_channels=[512, 1024, 2048], + feat_strides=[8, 16, 32], + encoder_layers=1, + encoder_ffn_dim=1024, + encoder_attention_heads=8, + dropout=0.0, + activation_dropout=0.0, + encode_proj_layers=[2], + positional_encoding_temperature=10000, + encoder_activation_function="gelu", + activation_function="silu", + eval_size=None, + normalize_before=False, + hidden_expansion=1.0, + # decoder DFineTransformer + d_model=256, + num_queries=300, + decoder_in_channels=[256, 256, 256], + decoder_ffn_dim=1024, + num_feature_levels=3, + decoder_n_points=4, + decoder_layers=6, + decoder_attention_heads=8, + decoder_activation_function="relu", + attention_dropout=0.0, + num_denoising=100, + label_noise_ratio=0.5, + box_noise_scale=1.0, + learn_initial_query=False, + anchor_image_size=None, + with_box_refine=True, + is_encoder_decoder=True, + # Loss + matcher_alpha=0.25, + matcher_gamma=2.0, + matcher_class_cost=2.0, + matcher_bbox_cost=5.0, + matcher_giou_cost=2.0, + use_focal_loss=True, + auxiliary_loss=True, + focal_loss_alpha=0.75, + focal_loss_gamma=2.0, + weight_loss_vfl=1.0, + weight_loss_bbox=5.0, + weight_loss_giou=2.0, + weight_loss_fgl=0.15, + weight_loss_ddf=1.5, + eos_coefficient=1e-4, + eval_idx=-1, + layer_scale=1, + max_num_bins=32, + reg_scale=4.0, + depth_mult=1.0, + top_prob_values=4, + lqe_hidden_dim=64, + lqe_layers=2, + decoder_offset_scale=0.5, + decoder_method="default", + up=0.5, + **kwargs, + ): + self.initializer_range = initializer_range + self.initializer_bias_prior_prob = initializer_bias_prior_prob + self.layer_norm_eps = layer_norm_eps + self.batch_norm_eps = batch_norm_eps + # backbone + if backbone_config is None and backbone is None: + logger.info( + "`backbone_config` and `backbone` are `None`. Initializing the config with the default `HGNet-V2` backbone." 
+ ) + backbone_model_type = "hgnet_v2" + config_class = CONFIG_MAPPING[backbone_model_type] + # this will map it to RTDetrResNetConfig + # note: we can instead create HGNetV2Config + # and we would need to create HGNetV2Backbone + backbone_config = config_class( + num_channels=3, + embedding_size=64, + hidden_sizes=[256, 512, 1024, 2048], + depths=[3, 4, 6, 3], + layer_type="bottleneck", + hidden_act="relu", + downsample_in_first_stage=False, + downsample_in_bottleneck=False, + out_features=None, + out_indices=[2, 3, 4], + ) + elif isinstance(backbone_config, dict): + backbone_model_type = backbone_config.pop("model_type") + config_class = CONFIG_MAPPING[backbone_model_type] + backbone_config = config_class.from_dict(backbone_config) + + verify_backbone_config_arguments( + use_timm_backbone=use_timm_backbone, + use_pretrained_backbone=use_pretrained_backbone, + backbone=backbone, + backbone_config=backbone_config, + backbone_kwargs=backbone_kwargs, + ) + + self.backbone_config = backbone_config + self.backbone = backbone + self.use_pretrained_backbone = use_pretrained_backbone + self.use_timm_backbone = use_timm_backbone + self.freeze_backbone_batch_norms = freeze_backbone_batch_norms + self.backbone_kwargs = backbone_kwargs + # encoder + self.encoder_hidden_dim = encoder_hidden_dim + self.encoder_in_channels = encoder_in_channels + self.feat_strides = feat_strides + self.encoder_attention_heads = encoder_attention_heads + self.encoder_ffn_dim = encoder_ffn_dim + self.dropout = dropout + self.activation_dropout = activation_dropout + self.encode_proj_layers = encode_proj_layers + self.encoder_layers = encoder_layers + self.positional_encoding_temperature = positional_encoding_temperature + self.eval_size = eval_size + self.normalize_before = normalize_before + self.encoder_activation_function = encoder_activation_function + self.activation_function = activation_function + self.hidden_expansion = hidden_expansion + # decoder + self.d_model = d_model + self.num_queries = num_queries + self.decoder_ffn_dim = decoder_ffn_dim + self.decoder_in_channels = decoder_in_channels + self.num_feature_levels = num_feature_levels + self.decoder_n_points = decoder_n_points + self.decoder_layers = decoder_layers + self.decoder_attention_heads = decoder_attention_heads + self.decoder_activation_function = decoder_activation_function + self.attention_dropout = attention_dropout + self.num_denoising = num_denoising + self.label_noise_ratio = label_noise_ratio + self.box_noise_scale = box_noise_scale + self.learn_initial_query = learn_initial_query + self.anchor_image_size = anchor_image_size + self.auxiliary_loss = auxiliary_loss + self.with_box_refine = with_box_refine + # Loss + self.matcher_alpha = matcher_alpha + self.matcher_gamma = matcher_gamma + self.matcher_class_cost = matcher_class_cost + self.matcher_bbox_cost = matcher_bbox_cost + self.matcher_giou_cost = matcher_giou_cost + self.use_focal_loss = use_focal_loss + self.focal_loss_alpha = focal_loss_alpha + self.focal_loss_gamma = focal_loss_gamma + self.weight_loss_vfl = weight_loss_vfl + self.weight_loss_bbox = weight_loss_bbox + self.weight_loss_giou = weight_loss_giou + self.weight_loss_fgl = weight_loss_fgl + self.weight_loss_ddf = weight_loss_ddf + self.eos_coefficient = eos_coefficient + # add the new attributes with the given values or defaults + self.eval_idx = eval_idx + self.layer_scale = layer_scale + self.max_num_bins = max_num_bins + self.reg_scale = reg_scale + self.depth_mult = depth_mult + self.decoder_offset_scale = 
decoder_offset_scale + self.decoder_method = decoder_method + self.top_prob_values = top_prob_values + self.lqe_hidden_dim = lqe_hidden_dim + self.lqe_layers = lqe_layers + self.up = up + + if isinstance(self.decoder_n_points, list): + if len(self.decoder_n_points) != self.num_feature_levels: + raise ValueError( + f"Length of decoder_n_points list ({len(self.decoder_n_points)}) must match num_feature_levels ({self.num_feature_levels})." + ) + + head_dim = self.d_model // self.decoder_attention_heads + if head_dim * self.decoder_attention_heads != self.d_model: + raise ValueError( + f"Embedded dimension {self.d_model} must be divisible by decoder_attention_heads {self.decoder_attention_heads}" + ) + super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + + @property + def num_attention_heads(self) -> int: + return self.encoder_attention_heads + + @property + def hidden_size(self) -> int: + return self.d_model + + @property + def sub_configs(self): + return ( + {"backbone_config": type(self.backbone_config)} + if getattr(self, "backbone_config", None) is not None + else {} + ) + + @classmethod + def from_backbone_configs(cls, backbone_config: PretrainedConfig, **kwargs): + """Instantiate a [`DFineConfig`] (or a derived class) from a pre-trained backbone model configuration and DETR model + configuration. + + Args: + backbone_config ([`PretrainedConfig`]): + The backbone configuration. + + Returns: + [`DFineConfig`]: An instance of a configuration object + """ + return cls( + backbone_config=backbone_config, + **kwargs, + ) + + +class DFineMultiscaleDeformableAttention(nn.Module): + def __init__(self, config: DFineConfig): + """ + D-Fine version of multiscale deformable attention + """ + super().__init__() + self.d_model = config.d_model + self.n_heads = config.decoder_attention_heads + self.n_levels = config.num_feature_levels + self.offset_scale = config.decoder_offset_scale + self.decoder_method = config.decoder_method + self.n_points = config.decoder_n_points + + if isinstance(self.n_points, list): + num_points_list = self.n_points + else: + num_points_list = [self.n_points for _ in range(self.n_levels)] + + self.num_points_list = num_points_list + num_points_scale = [1 / n for n in self.num_points_list for _ in range(n)] + self.register_buffer("num_points_scale", torch.tensor(num_points_scale, dtype=torch.float32)) + + self.total_points = self.n_heads * sum(self.num_points_list) + + self.sampling_offsets = nn.Linear(self.d_model, self.total_points * 2) + self.attention_weights = nn.Linear(self.d_model, self.total_points) + + self.ms_deformable_attn_core = multi_scale_deformable_attention_v2 + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + reference_points=None, + encoder_hidden_states=None, + spatial_shapes=None, + spatial_shapes_list=None, + ) -> tuple[torch.Tensor, torch.Tensor]: + batch_size, num_queries, _ = hidden_states.shape + batch_size, sequence_length, _ = encoder_hidden_states.shape + + if not is_torchdynamo_compiling() and (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() != sequence_length: + raise ValueError( + "Make sure to align the spatial shapes with the sequence length of the encoder hidden states" + ) + + # Reshape for multi-head attention + value = encoder_hidden_states.reshape(batch_size, sequence_length, self.n_heads, self.d_model // self.n_heads) + if attention_mask is not None: + value = value.masked_fill(~attention_mask[..., None], float(0)) + + sampling_offsets: torch.Tensor = 
self.sampling_offsets(hidden_states) + sampling_offsets = sampling_offsets.reshape( + batch_size, num_queries, self.n_heads, sum(self.num_points_list), 2 + ) + + attention_weights = self.attention_weights(hidden_states).reshape( + batch_size, num_queries, self.n_heads, sum(self.num_points_list) + ) + attention_weights = F.softmax(attention_weights, dim=-1) + + if reference_points.shape[-1] == 2: + offset_normalizer = torch.tensor(spatial_shapes) + offset_normalizer = offset_normalizer.flip([1]).reshape(1, 1, 1, self.n_levels, 1, 2) + sampling_locations = ( + reference_points.reshape(batch_size, sequence_length, 1, self.n_levels, 1, 2) + + sampling_offsets / offset_normalizer + ) + elif reference_points.shape[-1] == 4: + # reference_points [8, 480, None, 1, 4] + # sampling_offsets [8, 480, 8, 12, 2] + num_points_scale = self.num_points_scale.to(dtype=hidden_states.dtype).unsqueeze(-1) + offset = sampling_offsets * num_points_scale * reference_points[:, :, None, :, 2:] * self.offset_scale + sampling_locations = reference_points[:, :, None, :, :2] + offset + else: + raise ValueError( + f"Last dim of reference_points must be 2 or 4, but get {reference_points.shape[-1]} instead." + ) + + output = self.ms_deformable_attn_core( + value, + spatial_shapes_list, + sampling_locations, + attention_weights, + self.num_points_list, + self.decoder_method, + ) + + return output, attention_weights + + +class DFineGate(nn.Module): + def __init__(self, d_model: int): + super().__init__() + self.gate = nn.Linear(2 * d_model, 2 * d_model) + self.norm = nn.LayerNorm(d_model) + + def forward(self, second_residual: torch.Tensor, hidden_states: torch.Tensor) -> torch.Tensor: + gate_input = torch.cat([second_residual, hidden_states], dim=-1) + gates = torch.sigmoid(self.gate(gate_input)) + gate1, gate2 = gates.chunk(2, dim=-1) + hidden_states = self.norm(gate1 * second_residual + gate2 * hidden_states) + return hidden_states + + +class DFineDecoderLayer(RTDetrDecoderLayer): + def __init__(self, config: DFineConfig): + super().__init__(config) + + # override the encoder attention module with d-fine version + self.encoder_attn = DFineMultiscaleDeformableAttention(config=config) + # gate + self.gateway = DFineGate(config.d_model) + + del self.encoder_attn_layer_norm + + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: Optional[torch.Tensor] = None, + reference_points=None, + spatial_shapes=None, + spatial_shapes_list=None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> tuple[torch.Tensor, Any, Any]: + # Self Attention + hidden_states_2, self_attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=encoder_attention_mask, + position_embeddings=position_embeddings, + output_attentions=output_attentions, + ) + + hidden_states_2 = nn.functional.dropout(hidden_states_2, p=self.dropout, training=self.training) + hidden_states = hidden_states + hidden_states_2 + hidden_states = self.self_attn_layer_norm(hidden_states) + residual = hidden_states + + # Cross-Attention + cross_attn_weights = None + hidden_states = hidden_states if position_embeddings is None else hidden_states + position_embeddings + hidden_states_2, cross_attn_weights = self.encoder_attn( + hidden_states=hidden_states, + encoder_hidden_states=encoder_hidden_states, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + spatial_shapes_list=spatial_shapes_list, + ) + + 
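+ # The cross-attention output below is not merged through the usual residual + LayerNorm path.
+ # DFineGate (defined above) fuses it with the pre-attention residual instead:
+ #   gate1, gate2 = sigmoid(Linear(cat([residual, cross_attn_output]))).chunk(2)
+ #   hidden_states = LayerNorm(gate1 * residual + gate2 * cross_attn_output)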
hidden_states_2 = nn.functional.dropout(hidden_states_2, p=self.dropout, training=self.training) + hidden_states = self.gateway(residual, hidden_states_2) + + # Fully Connected + hidden_states_2 = self.activation_fn(self.fc1(hidden_states)) + hidden_states_2 = nn.functional.dropout(hidden_states_2, p=self.activation_dropout, training=self.training) + hidden_states_2 = self.fc2(hidden_states_2) + hidden_states_2 = nn.functional.dropout(hidden_states_2, p=self.dropout, training=self.training) + hidden_states = hidden_states + hidden_states_2 + hidden_states = self.final_layer_norm(hidden_states.clamp(min=-65504, max=65504)) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights, cross_attn_weights) + + return outputs + + +class DFinePreTrainedModel(RTDetrPreTrainedModel): + def _init_weights(self, module): + # initialize linear layer bias value according to a given probability value. + if isinstance(module, (DFineForObjectDetection, DFineDecoder)): + if module.class_embed is not None: + for layer in module.class_embed: + prior_prob = self.config.initializer_bias_prior_prob or 1 / (self.config.num_labels + 1) + bias = float(-math.log((1 - prior_prob) / prior_prob)) + nn.init.xavier_uniform_(layer.weight) + nn.init.constant_(layer.bias, bias) + + if module.bbox_embed is not None: + for layer in module.bbox_embed: + nn.init.constant_(layer.layers[-1].weight, 0) + nn.init.constant_(layer.layers[-1].bias, 0) + + if hasattr(module, "reg_scale"): + module.reg_scale.fill_(self.config.reg_scale) + + if hasattr(module, "up"): + module.up.fill_(self.config.up) + + if isinstance(module, DFineMultiscaleDeformableAttention): + nn.init.constant_(module.sampling_offsets.weight.data, 0.0) + default_dtype = torch.get_default_dtype() + thetas = torch.arange(module.n_heads, dtype=torch.int64).to(default_dtype) * ( + 2.0 * math.pi / module.n_heads + ) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = grid_init / grid_init.abs().max(-1, keepdim=True).values + grid_init = grid_init.reshape(module.n_heads, 1, 2).tile([1, sum(module.num_points_list), 1]) + scaling = torch.concat([torch.arange(1, n + 1) for n in module.num_points_list]).reshape(1, -1, 1) + grid_init *= scaling + with torch.no_grad(): + module.sampling_offsets.bias.data[...] 
= grid_init.flatten() + + nn.init.constant_(module.attention_weights.weight.data, 0.0) + nn.init.constant_(module.attention_weights.bias.data, 0.0) + + if isinstance(module, DFineModel): + prior_prob = self.config.initializer_bias_prior_prob or 1 / (self.config.num_labels + 1) + bias = float(-math.log((1 - prior_prob) / prior_prob)) + nn.init.xavier_uniform_(module.enc_score_head.weight) + nn.init.constant_(module.enc_score_head.bias, bias) + + if isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + + if isinstance(module, DFineGate): + bias = float(-math.log((1 - 0.5) / 0.5)) + init.constant_(module.gate.bias, bias) + init.constant_(module.gate.weight, 0) + + if isinstance(module, DFineLQE): + init.constant_(module.reg_conf.layers[-1].bias, 0) + init.constant_(module.reg_conf.layers[-1].weight, 0) + + if isinstance(module, nn.LayerNorm): + module.weight.data.fill_(1.0) + module.bias.data.zero_() + + if hasattr(module, "weight_embedding") and self.config.learn_initial_query: + nn.init.xavier_uniform_(module.weight_embedding.weight) + if hasattr(module, "denoising_class_embed") and self.config.num_denoising > 0: + nn.init.xavier_uniform_(module.denoising_class_embed.weight) + + +class DFineIntegral(nn.Module): + """ + A static layer that calculates integral results from a distribution. + + This layer computes the target location using the formula: `sum{Pr(n) * W(n)}`, + where Pr(n) is the softmax probability vector representing the discrete + distribution, and W(n) is the non-uniform Weighting Function. + + Args: + max_num_bins (int): Max number of the discrete bins. Default is 32. + It can be adjusted based on the dataset or task requirements. + """ + + def __init__(self, config: DFineConfig): + super().__init__() + self.max_num_bins = config.max_num_bins + + def forward(self, pred_corners: torch.Tensor, project: torch.Tensor) -> torch.Tensor: + batch_size, num_queries, _ = pred_corners.shape + pred_corners = F.softmax(pred_corners.reshape(-1, self.max_num_bins + 1), dim=1) + pred_corners = F.linear(pred_corners, project.to(pred_corners.device)).reshape(-1, 4) + pred_corners = pred_corners.reshape(batch_size, num_queries, -1) + return pred_corners + + +class DFineDecoderOutput(RTDetrDecoderOutput): + pass + + +class DFineDecoder(RTDetrDecoder): + """ + D-FINE Decoder implementing Fine-grained Distribution Refinement (FDR). + + This decoder refines object detection predictions through iterative updates across multiple layers, + utilizing attention mechanisms, location quality estimators, and distribution refinement techniques + to improve bounding box accuracy and robustness. 
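+
+ At the first decoder layer, `pre_bbox_head` produces initial boxes by refining the reference points in
+ inverse-sigmoid space. Each layer then predicts, for every box edge, a distribution over `max_num_bins + 1`
+ discrete bins; `DFineIntegral` projects that distribution onto the non-uniform weights produced by
+ `weighting_function`, and `distance2bbox` decodes the resulting edge distances into refined boxes, which
+ become the reference points for the next layer.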
+ """ + + def __init__(self, config: DFineConfig): + self.eval_idx = config.eval_idx if config.eval_idx >= 0 else config.decoder_layers + config.eval_idx + super().__init__(config=config) + self.reg_scale = nn.Parameter(torch.tensor([config.reg_scale]), requires_grad=False) + self.max_num_bins = config.max_num_bins + self.d_model = config.d_model + self.layer_scale = config.layer_scale + self.pre_bbox_head = DFineMLP(config.hidden_size, config.hidden_size, 4, 3) + self.integral = DFineIntegral(config) + self.num_head = config.decoder_attention_heads + self.up = nn.Parameter(torch.tensor([config.up]), requires_grad=False) + self.lqe_layers = nn.ModuleList([DFineLQE(config) for _ in range(config.decoder_layers)]) + self.layers = nn.ModuleList( + [DFineDecoderLayer(config) for _ in range(config.decoder_layers)] + + [DFineDecoderLayer(config) for _ in range(config.decoder_layers - self.eval_idx - 1)] + ) + + def forward( + self, + encoder_hidden_states: torch.Tensor, + reference_points: torch.Tensor, + inputs_embeds: torch.Tensor, + spatial_shapes, + level_start_index=None, + spatial_shapes_list=None, + output_hidden_states=None, + encoder_attention_mask=None, + memory_mask=None, + output_attentions=None, + return_dict=None, + ) -> DFineDecoderOutput: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if inputs_embeds is not None: + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + intermediate = () + intermediate_reference_points = () + intermediate_logits = () + intermediate_predicted_corners = () + initial_reference_points = () + + output_detach = pred_corners_undetach = 0 + + project = weighting_function(self.max_num_bins, self.up, self.reg_scale) + ref_points_detach = F.sigmoid(reference_points) + + for i, decoder_layer in enumerate(self.layers): + ref_points_input = ref_points_detach.unsqueeze(2) + query_pos_embed = self.query_pos_head(ref_points_detach).clamp(min=-10, max=10) + + if output_hidden_states: + all_hidden_states += (hidden_states,) + + output = decoder_layer( + hidden_states=hidden_states, + position_embeddings=query_pos_embed, + reference_points=ref_points_input, + spatial_shapes=spatial_shapes, + spatial_shapes_list=spatial_shapes_list, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = output[0] + + if i == 0: + # Initial bounding box predictions with inverse sigmoid refinement + new_reference_points = F.sigmoid(self.pre_bbox_head(output[0]) + inverse_sigmoid(ref_points_detach)) + ref_points_initial = new_reference_points.detach() + + # Refine bounding box corners using FDR, integrating previous layer's corrections + if self.bbox_embed is not None: + pred_corners = self.bbox_embed[i](hidden_states + output_detach) + pred_corners_undetach + inter_ref_bbox = distance2bbox( + ref_points_initial, self.integral(pred_corners, project), self.reg_scale + ) + pred_corners_undetach = pred_corners + ref_points_detach = inter_ref_bbox.detach() + + output_detach = hidden_states.detach() + + 
intermediate += (hidden_states,) + + if self.class_embed is not None and (self.training or i == self.eval_idx): + scores = self.class_embed[i](hidden_states) + # Add initial logits and reference points with pre-bbox head + if i == 0: + intermediate_logits += (scores,) + intermediate_reference_points += (new_reference_points,) + # Lqe does not affect the performance here. + scores = self.lqe_layers[i](scores, pred_corners) + intermediate_logits += (scores,) + intermediate_reference_points += (inter_ref_bbox,) + initial_reference_points += (ref_points_initial,) + intermediate_predicted_corners += (pred_corners,) + + if output_attentions: + all_self_attns += (output[1],) + + if encoder_hidden_states is not None: + all_cross_attentions += (output[2],) + + # Keep batch_size as first dimension + intermediate = torch.stack(intermediate) + if self.class_embed is not None and self.bbox_embed is not None: + intermediate_logits = torch.stack(intermediate_logits, dim=1) + intermediate_predicted_corners = torch.stack(intermediate_predicted_corners, dim=1) + initial_reference_points = torch.stack(initial_reference_points, dim=1) + intermediate_reference_points = torch.stack(intermediate_reference_points, dim=1) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + intermediate, + intermediate_logits, + intermediate_reference_points, + intermediate_predicted_corners, + initial_reference_points, + all_hidden_states, + all_self_attns, + all_cross_attentions, + ] + if v is not None + ) + + return DFineDecoderOutput( + last_hidden_state=hidden_states, + intermediate_hidden_states=intermediate, + intermediate_logits=intermediate_logits, + intermediate_reference_points=intermediate_reference_points, + intermediate_predicted_corners=intermediate_predicted_corners, + initial_reference_points=initial_reference_points, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attentions, + ) + + +class DFineModel(RTDetrModel): + def __init__(self, config: DFineConfig): + super().__init__(config) + del self.decoder_input_proj + self.encoder = DFineHybridEncoder(config=config) + num_backbone_outs = len(config.decoder_in_channels) + decoder_input_proj = [] + in_channels = config.decoder_in_channels[-1] + for _ in range(num_backbone_outs): + if config.hidden_size == config.decoder_in_channels[-1]: + decoder_input_proj.append(nn.Identity()) + else: + conv = nn.Conv2d(in_channels, config.d_model, kernel_size=1, bias=False) + batchnorm = nn.BatchNorm2d(config.d_model, config.batch_norm_eps) + decoder_input_proj.append(nn.Sequential(conv, batchnorm)) + for _ in range(config.num_feature_levels - num_backbone_outs): + if config.hidden_size == config.decoder_in_channels[-1]: + decoder_input_proj.append(nn.Identity()) + else: + conv = nn.Conv2d(in_channels, config.d_model, kernel_size=3, stride=2, padding=1, bias=False) + batchnorm = nn.BatchNorm2d(config.d_model, config.batch_norm_eps) + decoder_input_proj.append(nn.Sequential(conv, batchnorm)) + self.decoder_input_proj = nn.ModuleList(decoder_input_proj) + self.decoder = DFineDecoder(config) + + +class DFineForObjectDetection(RTDetrForObjectDetection, DFinePreTrainedModel): + def __init__(self, config: DFineConfig): + DFinePreTrainedModel.__init__(self, config) + + # D-FINE encoder-decoder model + self.eval_idx = config.eval_idx if config.eval_idx >= 0 else config.decoder_layers + config.eval_idx + 
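+ # Prediction heads (built below): one classification head per decoder layer; box-regression heads up
+ # to `eval_idx` use `hidden_size`, the remaining ones use `scaled_dim = round(layer_scale * hidden_size)`.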
self.model = DFineModel(config) + scaled_dim = round(config.layer_scale * config.hidden_size) + num_pred = config.decoder_layers + self.class_embed = nn.ModuleList([nn.Linear(config.d_model, config.num_labels) for _ in range(num_pred)]) + self.bbox_embed = nn.ModuleList( + [ + DFineMLP(config.hidden_size, config.hidden_size, 4 * (config.max_num_bins + 1), 3) + for _ in range(self.eval_idx + 1) + ] + + [ + DFineMLP(scaled_dim, scaled_dim, 4 * (config.max_num_bins + 1), 3) + for _ in range(config.decoder_layers - self.eval_idx - 1) + ] + ) + + # here self.model.decoder.bbox_embed is null, but not self.bbox_embed + self.model.decoder.class_embed = self.class_embed + self.model.decoder.bbox_embed = self.bbox_embed + + # Initialize weights and apply final processing + self.post_init() + + def forward(**super_kwargs): + r""" + Example: + + ```python + >>> import torch + >>> from transformers.image_utils import load_image + >>> from transformers import AutoImageProcessor, DFineForObjectDetection + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = load_image(url) + + >>> image_processor = AutoImageProcessor.from_pretrained("ustc-community/dfine-xlarge-coco") + >>> model = DFineForObjectDetection.from_pretrained("ustc-community/dfine-xlarge-coco") + + >>> # prepare image for the model + >>> inputs = image_processor(images=image, return_tensors="pt") + + >>> # forward pass + >>> outputs = model(**inputs) + + >>> logits = outputs.logits + >>> list(logits.shape) + [1, 300, 80] + + >>> boxes = outputs.pred_boxes + >>> list(boxes.shape) + [1, 300, 4] + + >>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax) + >>> target_sizes = torch.tensor([image.size[::-1]]) + >>> results = image_processor.post_process_object_detection(outputs, threshold=0.9, target_sizes=target_sizes) + >>> result = results[0] # first image in batch + + >>> for score, label, box in zip(result["scores"], result["labels"], result["boxes"]): + ... box = [round(i, 2) for i in box.tolist()] + ... print( + ... f"Detected {model.config.id2label[label.item()]} with confidence " + ... f"{round(score.item(), 3)} at location {box}" + ... ) + Detected cat with confidence 0.958 at location [344.49, 23.4, 639.84, 374.27] + Detected cat with confidence 0.956 at location [11.71, 53.52, 316.64, 472.33] + Detected remote with confidence 0.947 at location [40.46, 73.7, 175.62, 117.57] + Detected sofa with confidence 0.918 at location [0.59, 1.88, 640.25, 474.74] + ``` + """ + super().forward(**super_kwargs) + + +def weighting_function(max_num_bins: int, up: torch.Tensor, reg_scale: int) -> torch.Tensor: + """ + Generates the non-uniform Weighting Function W(n) for bounding box regression. + + Args: + max_num_bins (int): Max number of the discrete bins. + up (Tensor): Controls upper bounds of the sequence, + where maximum offset is ±up * H / W. + reg_scale (float): Controls the curvature of the Weighting Function. + Larger values result in flatter weights near the central axis W(max_num_bins/2)=0 + and steeper weights at both ends. + Returns: + Tensor: Sequence of Weighting Function. 
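+
+ For an even `max_num_bins`, the sequence contains `max_num_bins + 1` values: it is antisymmetric around a
+ central zero, bounded by `±upper_bound2 = ±2 * |up| * |reg_scale|`, and its spacing grows geometrically
+ away from the center, so bins near zero encode fine offsets while the outer bins absorb large corrections.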
+ """ + upper_bound1 = abs(up[0]) * abs(reg_scale) + upper_bound2 = abs(up[0]) * abs(reg_scale) * 2 + step = (upper_bound1 + 1) ** (2 / (max_num_bins - 2)) + left_values = [-((step) ** i) + 1 for i in range(max_num_bins // 2 - 1, 0, -1)] + right_values = [(step) ** i - 1 for i in range(1, max_num_bins // 2)] + values = [-upper_bound2] + left_values + [torch.zeros_like(up[0][None])] + right_values + [upper_bound2] + values = torch.cat(values, 0) + return values + + +class DFineMLPPredictionHead(RTDetrMLPPredictionHead): + pass + + +def distance2bbox(points, distance: torch.Tensor, reg_scale: float) -> torch.Tensor: + """ + Decodes edge-distances into bounding box coordinates. + + Args: + points (`torch.Tensor`): + (batch_size, num_boxes, 4) or (num_boxes, 4) format, representing [x_center, y_center, width, height] + distance (`torch.Tensor`): + (batch_size, num_boxes, 4) or (num_boxes, 4), representing distances from the point to the left, top, right, and bottom boundaries. + reg_scale (`float`): + Controls the curvature of the Weighting Function. + Returns: + `torch.Tensor`: Bounding boxes in (batch_size, num_boxes, 4) or (num_boxes, 4) format, representing [x_center, y_center, width, height] + """ + reg_scale = abs(reg_scale) + top_left_x = points[..., 0] - (0.5 * reg_scale + distance[..., 0]) * (points[..., 2] / reg_scale) + top_left_y = points[..., 1] - (0.5 * reg_scale + distance[..., 1]) * (points[..., 3] / reg_scale) + bottom_right_x = points[..., 0] + (0.5 * reg_scale + distance[..., 2]) * (points[..., 2] / reg_scale) + bottom_right_y = points[..., 1] + (0.5 * reg_scale + distance[..., 3]) * (points[..., 3] / reg_scale) + + bboxes = torch.stack([top_left_x, top_left_y, bottom_right_x, bottom_right_y], -1) + + return corners_to_center_format(bboxes) + + +class DFineMLP(nn.Module): + def __init__(self, input_dim: int, hidden_dim: int, output_dim: int, num_layers: int, act: str = "relu"): + super().__init__() + self.num_layers = num_layers + hidden_dims = [hidden_dim] * (num_layers - 1) + input_dims = [input_dim] + hidden_dims + output_dims = hidden_dims + [output_dim] + self.layers = nn.ModuleList(nn.Linear(in_dim, out_dim) for in_dim, out_dim in zip(input_dims, output_dims)) + self.act = ACT2CLS[act]() + + def forward(self, stat_features: torch.Tensor) -> torch.Tensor: + for i, layer in enumerate(self.layers): + stat_features = self.act(layer(stat_features)) if i < self.num_layers - 1 else layer(stat_features) + return stat_features + + +class DFineLQE(nn.Module): + def __init__(self, config: DFineConfig): + super().__init__() + self.top_prob_values = config.top_prob_values + self.max_num_bins = config.max_num_bins + self.reg_conf = DFineMLP(4 * (self.top_prob_values + 1), config.lqe_hidden_dim, 1, config.lqe_layers) + + def forward(self, scores: torch.Tensor, pred_corners: torch.Tensor) -> torch.Tensor: + batch_size, length, _ = pred_corners.size() + prob = F.softmax(pred_corners.reshape(batch_size, length, 4, self.max_num_bins + 1), dim=-1) + prob_topk, _ = prob.topk(self.top_prob_values, dim=-1) + stat = torch.cat([prob_topk, prob_topk.mean(dim=-1, keepdim=True)], dim=-1) + quality_score = self.reg_conf(stat.reshape(batch_size, length, -1)) + scores = scores + quality_score + return scores + + +class DFineConvNormLayer(RTDetrConvNormLayer): + def __init__( + self, + config: DFineConfig, + in_channels: int, + out_channels: int, + kernel_size: int, + stride: int, + groups: int = 1, + padding: Optional[int] = None, + activation: Optional[str] = None, + ): + super().__init__(config, 
in_channels, out_channels, kernel_size, stride, padding=None, activation=activation) + self.conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size, + stride, + groups=groups, + padding=(kernel_size - 1) // 2 if padding is None else padding, + bias=False, + ) + + +class DFineRepVggBlock(RTDetrRepVggBlock): + def __init__(self, config: DFineConfig, in_channels: int, out_channels: int): + super().__init__(config) + hidden_channels = in_channels + self.conv1 = DFineConvNormLayer(config, hidden_channels, out_channels, 3, 1, padding=1) + self.conv2 = DFineConvNormLayer(config, hidden_channels, out_channels, 1, 1, padding=0) + + +class DFineCSPRepLayer(nn.Module): + """ + Cross Stage Partial (CSP) network layer with RepVGG blocks. + """ + + def __init__( + self, config: DFineConfig, in_channels: int, out_channels: int, num_blocks: int, expansion: float = 1.0 + ): + super().__init__() + activation = config.activation_function + + hidden_channels = int(out_channels * expansion) + self.conv1 = DFineConvNormLayer(config, in_channels, hidden_channels, 1, 1, activation=activation) + self.conv2 = DFineConvNormLayer(config, in_channels, hidden_channels, 1, 1, activation=activation) + self.bottlenecks = nn.ModuleList( + [DFineRepVggBlock(config, hidden_channels, hidden_channels) for _ in range(num_blocks)] + ) + if hidden_channels != out_channels: + self.conv3 = DFineConvNormLayer(config, hidden_channels, out_channels, 1, 1, activation=activation) + else: + self.conv3 = nn.Identity() + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + hidden_state_1 = self.conv1(hidden_state) + for bottleneck in self.bottlenecks: + hidden_state_1 = bottleneck(hidden_state_1) + hidden_state_2 = self.conv2(hidden_state) + hidden_state_3 = self.conv3(hidden_state_1 + hidden_state_2) + return hidden_state_3 + + +class DFineRepNCSPELAN4(nn.Module): + def __init__(self, config: DFineConfig, act: str = "silu", numb_blocks: int = 3): + super().__init__() + conv1_dim = config.encoder_hidden_dim * 2 + conv2_dim = config.encoder_hidden_dim + conv3_dim = config.encoder_hidden_dim * 2 + conv4_dim = round(config.hidden_expansion * config.encoder_hidden_dim // 2) + self.conv_dim = conv3_dim // 2 + self.conv1 = DFineConvNormLayer(config, conv1_dim, conv3_dim, 1, 1, activation=act) + self.csp_rep1 = DFineCSPRepLayer(config, conv3_dim // 2, conv4_dim, num_blocks=numb_blocks) + self.conv2 = DFineConvNormLayer(config, conv4_dim, conv4_dim, 3, 1, activation=act) + self.csp_rep2 = DFineCSPRepLayer(config, conv4_dim, conv4_dim, num_blocks=numb_blocks) + self.conv3 = DFineConvNormLayer(config, conv4_dim, conv4_dim, 3, 1, activation=act) + self.conv4 = DFineConvNormLayer(config, conv3_dim + (2 * conv4_dim), conv2_dim, 1, 1, activation=act) + + def forward(self, input_features: torch.Tensor) -> torch.Tensor: + # Split initial features into two branches after first convolution + split_features = list(self.conv1(input_features).split((self.conv_dim, self.conv_dim), 1)) + + # Process branches sequentially + branch1 = self.csp_rep1(split_features[-1]) + branch1 = self.conv2(branch1) + branch2 = self.csp_rep2(branch1) + branch2 = self.conv3(branch2) + + split_features.extend([branch1, branch2]) + merged_features = torch.cat(split_features, 1) + merged_features = self.conv4(merged_features) + return merged_features + + +class DFineSCDown(nn.Module): + def __init__(self, config: DFineConfig, kernel_size: int, stride: int): + super().__init__() + self.conv1 = DFineConvNormLayer(config, config.encoder_hidden_dim, 
config.encoder_hidden_dim, 1, 1) + self.conv2 = DFineConvNormLayer( + config, + config.encoder_hidden_dim, + config.encoder_hidden_dim, + kernel_size, + stride, + config.encoder_hidden_dim, + ) + + def forward(self, input_features: torch.Tensor) -> torch.Tensor: + input_features = self.conv1(input_features) + input_features = self.conv2(input_features) + return input_features + + +class DFineEncoder(RTDetrEncoder): + pass + + +class DFineHybridEncoder(RTDetrHybridEncoder): + def __init__(self, config: DFineConfig): + nn.Module.__init__(self) + self.config = config + self.in_channels = config.encoder_in_channels + self.num_fpn_stages = len(self.in_channels) - 1 + self.feat_strides = config.feat_strides + self.encoder_hidden_dim = config.encoder_hidden_dim + self.encode_proj_layers = config.encode_proj_layers + self.positional_encoding_temperature = config.positional_encoding_temperature + self.eval_size = config.eval_size + self.out_channels = [self.encoder_hidden_dim for _ in self.in_channels] + self.out_strides = self.feat_strides + + # encoder transformer + self.encoder = nn.ModuleList([DFineEncoder(config) for _ in range(len(self.encode_proj_layers))]) + # top-down fpn + self.lateral_convs = nn.ModuleList() + self.fpn_blocks = nn.ModuleList() + for _ in range(len(self.in_channels) - 1, 0, -1): + lateral_layer = DFineConvNormLayer(config, self.encoder_hidden_dim, self.encoder_hidden_dim, 1, 1) + self.lateral_convs.append(lateral_layer) + num_blocks = round(3 * config.depth_mult) + fpn_layer = DFineRepNCSPELAN4(config, numb_blocks=num_blocks) + self.fpn_blocks.append(fpn_layer) + + # bottom-up pan + self.downsample_convs = nn.ModuleList() + self.pan_blocks = nn.ModuleList() + for _ in range(len(self.in_channels) - 1): + self.downsample_convs.append(DFineSCDown(config, 3, 2)) + num_blocks = round(3 * config.depth_mult) + self.pan_blocks.append(DFineRepNCSPELAN4(config, numb_blocks=num_blocks)) + + +__all__ = [ + "DFineConfig", + "DFineModel", + "DFinePreTrainedModel", + "DFineForObjectDetection", +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dab_detr/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dab_detr/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..bfa364bd2152049fc92f575e102bf7b934ba4a6c --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dab_detr/__init__.py @@ -0,0 +1,28 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_dab_detr import * + from .modeling_dab_detr import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dab_detr/configuration_dab_detr.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dab_detr/configuration_dab_detr.py new file mode 100644 index 0000000000000000000000000000000000000000..e53d7783a6f431feafb8a3946b85118c5c436858 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dab_detr/configuration_dab_detr.py @@ -0,0 +1,268 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""DAB-DETR model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging +from ...utils.backbone_utils import verify_backbone_config_arguments +from ..auto import CONFIG_MAPPING + + +logger = logging.get_logger(__name__) + + +class DabDetrConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`DabDetrModel`]. It is used to instantiate + a DAB-DETR model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the DAB-DETR + [IDEA-Research/dab_detr-base](https://huggingface.co/IDEA-Research/dab_detr-base) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + use_timm_backbone (`bool`, *optional*, defaults to `True`): + Whether or not to use the `timm` library for the backbone. If set to `False`, will use the [`AutoBackbone`] + API. + backbone_config (`PretrainedConfig` or `dict`, *optional*): + The configuration of the backbone model. Only used in case `use_timm_backbone` is set to `False` in which + case it will default to `ResNetConfig()`. + backbone (`str`, *optional*, defaults to `"resnet50"`): + Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this + will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone` + is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. + use_pretrained_backbone (`bool`, *optional*, defaults to `True`): + Whether to use pretrained weights for the backbone. + backbone_kwargs (`dict`, *optional*): + Keyword arguments to be passed to AutoBackbone when loading from a checkpoint + e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set. 
+ num_queries (`int`, *optional*, defaults to 300): + Number of object queries, i.e. detection slots. This is the maximal number of objects + [`DabDetrModel`] can detect in a single image. For COCO, we recommend 100 queries. + encoder_layers (`int`, *optional*, defaults to 6): + Number of encoder layers. + encoder_ffn_dim (`int`, *optional*, defaults to 2048): + Dimension of the "intermediate" (often named feed-forward) layer in encoder. + encoder_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer encoder. + decoder_layers (`int`, *optional*, defaults to 6): + Number of decoder layers. + decoder_ffn_dim (`int`, *optional*, defaults to 2048): + Dimension of the "intermediate" (often named feed-forward) layer in decoder. + decoder_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer decoder. + is_encoder_decoder (`bool`, *optional*, defaults to `True`): + Indicates whether the transformer model architecture is an encoder-decoder or not. + activation_function (`str` or `function`, *optional*, defaults to `"prelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_size (`int`, *optional*, defaults to 256): + This parameter is a general dimension parameter, defining dimensions for components such as the encoder layer and projection parameters in the decoder layer, among others. + dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + activation_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for activations inside the fully connected layer. + init_std (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + init_xavier_std (`float`, *optional*, defaults to 1.0): + The scaling factor used for the Xavier initialization gain in the HM Attention map module. + auxiliary_loss (`bool`, *optional*, defaults to `False`): + Whether auxiliary decoding losses (loss at each decoder layer) are to be used. + dilation (`bool`, *optional*, defaults to `False`): + Whether to replace stride with dilation in the last convolutional block (DC5). Only supported when `use_timm_backbone` = `True`. + class_cost (`float`, *optional*, defaults to 2): + Relative weight of the classification error in the Hungarian matching cost. + bbox_cost (`float`, *optional*, defaults to 5): + Relative weight of the L1 error of the bounding box coordinates in the Hungarian matching cost. + giou_cost (`float`, *optional*, defaults to 2): + Relative weight of the generalized IoU loss of the bounding box in the Hungarian matching cost. + cls_loss_coefficient (`float`, *optional*, defaults to 2): + Relative weight of the classification loss in the object detection loss function. + bbox_loss_coefficient (`float`, *optional*, defaults to 5): + Relative weight of the L1 bounding box loss in the object detection loss. + giou_loss_coefficient (`float`, *optional*, defaults to 2): + Relative weight of the generalized IoU loss in the object detection loss. + focal_alpha (`float`, *optional*, defaults to 0.25): + Alpha parameter in the focal loss. 
+ temperature_height (`int`, *optional*, defaults to 20): + Temperature parameter to tune the flatness of positional attention (HEIGHT) + temperature_width (`int`, *optional*, defaults to 20): + Temperature parameter to tune the flatness of positional attention (WIDTH) + query_dim (`int`, *optional*, defaults to 4): + Query dimension parameter represents the size of the output vector. + random_refpoints_xy (`bool`, *optional*, defaults to `False`): + Whether to fix the x and y coordinates of the anchor boxes with random initialization. + keep_query_pos (`bool`, *optional*, defaults to `False`): + Whether to concatenate the projected positional embedding from the object query into the original query (key) in every decoder layer. + num_patterns (`int`, *optional*, defaults to 0): + Number of pattern embeddings. + normalize_before (`bool`, *optional*, defaults to `False`): + Whether we use a normalization layer in the Encoder or not. + sine_position_embedding_scale (`float`, *optional*, defaults to 'None'): + Scaling factor applied to the normalized positional encodings. + initializer_bias_prior_prob (`float`, *optional*): + The prior probability used by the bias initializer to initialize biases for `enc_score_head` and `class_embed`. + If `None`, `prior_prob` computed as `prior_prob = 1 / (num_labels + 1)` while initializing model weights. + + + Examples: + + ```python + >>> from transformers import DabDetrConfig, DabDetrModel + + >>> # Initializing a DAB-DETR IDEA-Research/dab_detr-base style configuration + >>> configuration = DabDetrConfig() + + >>> # Initializing a model (with random weights) from the IDEA-Research/dab_detr-base style configuration + >>> model = DabDetrModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "dab-detr" + keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = { + "num_attention_heads": "encoder_attention_heads", + } + + def __init__( + self, + use_timm_backbone=True, + backbone_config=None, + backbone="resnet50", + use_pretrained_backbone=True, + backbone_kwargs=None, + num_queries=300, + encoder_layers=6, + encoder_ffn_dim=2048, + encoder_attention_heads=8, + decoder_layers=6, + decoder_ffn_dim=2048, + decoder_attention_heads=8, + is_encoder_decoder=True, + activation_function="prelu", + hidden_size=256, + dropout=0.1, + attention_dropout=0.0, + activation_dropout=0.0, + init_std=0.02, + init_xavier_std=1.0, + auxiliary_loss=False, + dilation=False, + class_cost=2, + bbox_cost=5, + giou_cost=2, + cls_loss_coefficient=2, + bbox_loss_coefficient=5, + giou_loss_coefficient=2, + focal_alpha=0.25, + temperature_height=20, + temperature_width=20, + query_dim=4, + random_refpoints_xy=False, + keep_query_pos=False, + num_patterns=0, + normalize_before=False, + sine_position_embedding_scale=None, + initializer_bias_prior_prob=None, + **kwargs, + ): + if query_dim != 4: + raise ValueError("The query dimensions has to be 4.") + + # We default to values which were previously hard-coded in the model. This enables configurability of the config + # while keeping the default behavior the same. + if use_timm_backbone and backbone_kwargs is None: + backbone_kwargs = {} + if dilation: + backbone_kwargs["output_stride"] = 16 + backbone_kwargs["out_indices"] = [1, 2, 3, 4] + backbone_kwargs["in_chans"] = 3 # num_channels + # Backwards compatibility + elif not use_timm_backbone and backbone in (None, "resnet50"): + if backbone_config is None: + logger.info("`backbone_config` is `None`. 
Initializing the config with the default `ResNet` backbone.") + backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage4"]) + elif isinstance(backbone_config, dict): + backbone_model_type = backbone_config.get("model_type") + config_class = CONFIG_MAPPING[backbone_model_type] + backbone_config = config_class.from_dict(backbone_config) + backbone = None + # set timm attributes to None + dilation = None + + verify_backbone_config_arguments( + use_timm_backbone=use_timm_backbone, + use_pretrained_backbone=use_pretrained_backbone, + backbone=backbone, + backbone_config=backbone_config, + backbone_kwargs=backbone_kwargs, + ) + + self.use_timm_backbone = use_timm_backbone + self.backbone_config = backbone_config + self.num_queries = num_queries + self.hidden_size = hidden_size + self.encoder_ffn_dim = encoder_ffn_dim + self.encoder_layers = encoder_layers + self.encoder_attention_heads = encoder_attention_heads + self.decoder_ffn_dim = decoder_ffn_dim + self.decoder_layers = decoder_layers + self.decoder_attention_heads = decoder_attention_heads + self.dropout = dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.activation_function = activation_function + self.init_std = init_std + self.init_xavier_std = init_xavier_std + self.num_hidden_layers = encoder_layers + self.auxiliary_loss = auxiliary_loss + self.backbone = backbone + self.use_pretrained_backbone = use_pretrained_backbone + self.backbone_kwargs = backbone_kwargs + # Hungarian matcher + self.class_cost = class_cost + self.bbox_cost = bbox_cost + self.giou_cost = giou_cost + # Loss coefficients + self.cls_loss_coefficient = cls_loss_coefficient + self.bbox_loss_coefficient = bbox_loss_coefficient + self.giou_loss_coefficient = giou_loss_coefficient + self.focal_alpha = focal_alpha + self.query_dim = query_dim + self.random_refpoints_xy = random_refpoints_xy + self.keep_query_pos = keep_query_pos + self.num_patterns = num_patterns + self.normalize_before = normalize_before + self.temperature_width = temperature_width + self.temperature_height = temperature_height + self.sine_position_embedding_scale = sine_position_embedding_scale + self.initializer_bias_prior_prob = initializer_bias_prior_prob + super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + + @property + def sub_configs(self): + return ( + {"backbone_config": type(self.backbone_config)} + if getattr(self, "backbone_config", None) is not None + else {} + ) + + +__all__ = ["DabDetrConfig"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dab_detr/modeling_dab_detr.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dab_detr/modeling_dab_detr.py new file mode 100644 index 0000000000000000000000000000000000000000..4b7a27e7663b62fb1fd663595e9c6b4b16e9db9e --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dab_detr/modeling_dab_detr.py @@ -0,0 +1,1599 @@ +# coding=utf-8 +# Copyright 2024 IDEA Research and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch DAB-DETR model.""" + +import math +from dataclasses import dataclass +from typing import Optional, Union + +import torch +from torch import Tensor, nn + +from ...activations import ACT2FN +from ...modeling_attn_mask_utils import _prepare_4d_attention_mask +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithCrossAttentions, Seq2SeqModelOutput +from ...modeling_utils import PreTrainedModel +from ...utils import ( + ModelOutput, + auto_docstring, + logging, +) +from ...utils.backbone_utils import load_backbone +from .configuration_dab_detr import DabDetrConfig + + +logger = logging.get_logger(__name__) + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for outputs of the Conditional DETR decoder. This class adds one attribute to + BaseModelOutputWithCrossAttentions, namely an optional stack of intermediate decoder activations, i.e. the output + of each decoder layer, each of them gone through a layernorm. This is useful when training the model with auxiliary + decoding losses. + """ +) +# Copied from transformers.models.conditional_detr.modeling_conditional_detr.ConditionalDetrDecoderOutput with ConditionalDetr->DabDetr,Conditional DETR->DAB-DETR,2 (anchor points)->4 (anchor points) +class DabDetrDecoderOutput(BaseModelOutputWithCrossAttentions): + r""" + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax, + used to compute the weighted average in the cross-attention heads. + intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, num_queries, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`): + Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a + layernorm. + reference_points (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, num_queries, 2 (anchor points))`): + Reference points (reference points of each layer of the decoder). + """ + + intermediate_hidden_states: Optional[torch.FloatTensor] = None + reference_points: Optional[tuple[torch.FloatTensor]] = None + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for outputs of the Conditional DETR encoder-decoder model. This class adds one attribute to + Seq2SeqModelOutput, namely an optional stack of intermediate decoder activations, i.e. the output of each decoder + layer, each of them gone through a layernorm. This is useful when training the model with auxiliary decoding + losses. + """ +) +# Copied from transformers.models.conditional_detr.modeling_conditional_detr.ConditionalDetrModelOutput with ConditionalDetr->DabDetr,Conditional DETR->DAB-DETR,2 (anchor points)->4 (anchor points) +class DabDetrModelOutput(Seq2SeqModelOutput): + r""" + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the decoder of the model. 
+ intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, sequence_length, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`): + Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a + layernorm. + reference_points (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, num_queries, 2 (anchor points))`): + Reference points (reference points of each layer of the decoder). + """ + + intermediate_hidden_states: Optional[torch.FloatTensor] = None + reference_points: Optional[tuple[torch.FloatTensor]] = None + + +@dataclass +@auto_docstring( + custom_intro=""" + Output type of [`DabDetrForObjectDetection`]. + """ +) +# Copied from transformers.models.detr.modeling_detr.DetrObjectDetectionOutput with Detr->DabDetr +class DabDetrObjectDetectionOutput(ModelOutput): + r""" + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)): + Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a + bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized + scale-invariant IoU loss. + loss_dict (`Dict`, *optional*): + A dictionary containing the individual losses. Useful for logging. + logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`): + Classification logits (including no-object) for all queries. + pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): + Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These + values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding + possible padding). You can use [`~DabDetrImageProcessor.post_process_object_detection`] to retrieve the + unnormalized bounding boxes. + auxiliary_outputs (`list[Dict]`, *optional*): + Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`) + and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and + `pred_boxes`) for each decoder layer. + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the decoder of the model. + """ + + loss: Optional[torch.FloatTensor] = None + loss_dict: Optional[dict] = None + logits: Optional[torch.FloatTensor] = None + pred_boxes: Optional[torch.FloatTensor] = None + auxiliary_outputs: Optional[list[dict]] = None + last_hidden_state: Optional[torch.FloatTensor] = None + decoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[tuple[torch.FloatTensor]] = None + cross_attentions: Optional[tuple[torch.FloatTensor]] = None + encoder_last_hidden_state: Optional[torch.FloatTensor] = None + encoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None + encoder_attentions: Optional[tuple[torch.FloatTensor]] = None + + +# Copied from transformers.models.detr.modeling_detr.DetrFrozenBatchNorm2d with Detr->DabDetr +class DabDetrFrozenBatchNorm2d(nn.Module): + """ + BatchNorm2d where the batch statistics and the affine parameters are fixed. + + Copy-paste from torchvision.misc.ops with added eps before rqsrt, without which any other models than + torchvision.models.resnet[18,34,50,101] produce nans. 
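+
+ With frozen statistics the normalization collapses into a fixed affine transform,
+ `y = x * weight / sqrt(running_var + eps) + (bias - running_mean * weight / sqrt(running_var + eps))`,
+ which is what `forward` computes after reshaping the buffers to broadcast over (batch, channels, height, width).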
+ """ + + def __init__(self, n): + super().__init__() + self.register_buffer("weight", torch.ones(n)) + self.register_buffer("bias", torch.zeros(n)) + self.register_buffer("running_mean", torch.zeros(n)) + self.register_buffer("running_var", torch.ones(n)) + + def _load_from_state_dict( + self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ): + num_batches_tracked_key = prefix + "num_batches_tracked" + if num_batches_tracked_key in state_dict: + del state_dict[num_batches_tracked_key] + + super()._load_from_state_dict( + state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ) + + def forward(self, x): + # move reshapes to the beginning + # to make it user-friendly + weight = self.weight.reshape(1, -1, 1, 1) + bias = self.bias.reshape(1, -1, 1, 1) + running_var = self.running_var.reshape(1, -1, 1, 1) + running_mean = self.running_mean.reshape(1, -1, 1, 1) + epsilon = 1e-5 + scale = weight * (running_var + epsilon).rsqrt() + bias = bias - running_mean * scale + return x * scale + bias + + +# Copied from transformers.models.detr.modeling_detr.replace_batch_norm with Detr->DabDetr +def replace_batch_norm(model): + r""" + Recursively replace all `torch.nn.BatchNorm2d` with `DabDetrFrozenBatchNorm2d`. + + Args: + model (torch.nn.Module): + input model + """ + for name, module in model.named_children(): + if isinstance(module, nn.BatchNorm2d): + new_module = DabDetrFrozenBatchNorm2d(module.num_features) + + if module.weight.device != torch.device("meta"): + new_module.weight.data.copy_(module.weight) + new_module.bias.data.copy_(module.bias) + new_module.running_mean.data.copy_(module.running_mean) + new_module.running_var.data.copy_(module.running_var) + + model._modules[name] = new_module + + if len(list(module.children())) > 0: + replace_batch_norm(module) + + +# Modified from transformers.models.detr.modeling_detr.DetrConvEncoder with Detr->DabDetr +class DabDetrConvEncoder(nn.Module): + """ + Convolutional backbone, using either the AutoBackbone API or one from the timm library. + + nn.BatchNorm2d layers are replaced by DabDetrFrozenBatchNorm2d as defined above. + + """ + + def __init__(self, config: DabDetrConfig): + super().__init__() + + self.config = config + backbone = load_backbone(config) + + # replace batch norm by frozen batch norm + with torch.no_grad(): + replace_batch_norm(backbone) + self.model = backbone + self.intermediate_channel_sizes = self.model.channels + + def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor): + # send pixel_values through the model to get list of feature maps + features = self.model(pixel_values).feature_maps + + out = [] + for feature_map in features: + # downsample pixel_mask to match shape of corresponding feature_map + mask = nn.functional.interpolate(pixel_mask[None].float(), size=feature_map.shape[-2:]).to(torch.bool)[0] + out.append((feature_map, mask)) + return out + + +# Copied from transformers.models.detr.modeling_detr.DetrConvModel with Detr->DabDetr +class DabDetrConvModel(nn.Module): + """ + This module adds 2D position embeddings to all intermediate feature maps of the convolutional encoder. 
+ """ + + def __init__(self, conv_encoder, position_embedding): + super().__init__() + self.conv_encoder = conv_encoder + self.position_embedding = position_embedding + + def forward(self, pixel_values, pixel_mask): + # send pixel_values and pixel_mask through backbone to get list of (feature_map, pixel_mask) tuples + out = self.conv_encoder(pixel_values, pixel_mask) + pos = [] + for feature_map, mask in out: + # position encoding + pos.append(self.position_embedding(feature_map, mask).to(feature_map.dtype)) + + return out, pos + + +# Modified from transformers.models.conditional_detr.modeling_conditional_detr.ConditionalDetrSinePositionEmbedding with ConditionalDetr->DabDetr +class DabDetrSinePositionEmbedding(nn.Module): + """ + This is a more standard version of the position embedding, very similar to the one used by the Attention is all you + need paper, generalized to work on images. + """ + + def __init__(self, config: DabDetrConfig): + super().__init__() + self.config = config + self.embedding_dim = config.hidden_size / 2 + self.temperature_height = config.temperature_height + self.temperature_width = config.temperature_width + scale = config.sine_position_embedding_scale + if scale is None: + scale = 2 * math.pi + self.scale = scale + + def forward(self, pixel_values, pixel_mask): + if pixel_mask is None: + raise ValueError("No pixel mask provided") + y_embed = pixel_mask.cumsum(1, dtype=torch.float32) + x_embed = pixel_mask.cumsum(2, dtype=torch.float32) + y_embed = y_embed / (y_embed[:, -1:, :] + 1e-6) * self.scale + x_embed = x_embed / (x_embed[:, :, -1:] + 1e-6) * self.scale + + # We use float32 to ensure reproducibility of the original implementation + dim_tx = torch.arange(self.embedding_dim, dtype=torch.float32, device=pixel_values.device) + # Modifying dim_tx in place to avoid extra memory allocation -> dim_tx = self.temperature_width ** (2 * (dim_tx // 2) / self.embedding_dim) + dim_tx //= 2 + dim_tx.mul_(2 / self.embedding_dim) + dim_tx.copy_(self.temperature_width**dim_tx) + pos_x = x_embed[:, :, :, None] / dim_tx + + # We use float32 to ensure reproducibility of the original implementation + dim_ty = torch.arange(self.embedding_dim, dtype=torch.float32, device=pixel_values.device) + # Modifying dim_ty in place to avoid extra memory allocation -> dim_ty = self.temperature_height ** (2 * (dim_ty // 2) / self.embedding_dim) + dim_ty //= 2 + dim_ty.mul_(2 / self.embedding_dim) + dim_ty.copy_(self.temperature_height**dim_ty) + pos_y = y_embed[:, :, :, None] / dim_ty + + pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) + return pos + + +# function to generate sine positional embedding for 4d coordinates +def gen_sine_position_embeddings(pos_tensor, hidden_size=256): + """ + This function computes position embeddings using sine and cosine functions from the input positional tensor, + which has a shape of (batch_size, num_queries, 4). + The last dimension of `pos_tensor` represents the following coordinates: + - 0: x-coord + - 1: y-coord + - 2: width + - 3: height + + The output shape is (batch_size, num_queries, 512), where final dim (hidden_size*2 = 512) is the total embedding dimension + achieved by concatenating the sine and cosine values for each coordinate. 
+ """ + scale = 2 * math.pi + dim = hidden_size // 2 + dim_t = torch.arange(dim, dtype=torch.float32, device=pos_tensor.device) + dim_t = 10000 ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / dim) + x_embed = pos_tensor[:, :, 0] * scale + y_embed = pos_tensor[:, :, 1] * scale + pos_x = x_embed[:, :, None] / dim_t + pos_y = y_embed[:, :, None] / dim_t + pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3).flatten(2) + pos_y = torch.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), dim=3).flatten(2) + if pos_tensor.size(-1) == 4: + w_embed = pos_tensor[:, :, 2] * scale + pos_w = w_embed[:, :, None] / dim_t + pos_w = torch.stack((pos_w[:, :, 0::2].sin(), pos_w[:, :, 1::2].cos()), dim=3).flatten(2) + + h_embed = pos_tensor[:, :, 3] * scale + pos_h = h_embed[:, :, None] / dim_t + pos_h = torch.stack((pos_h[:, :, 0::2].sin(), pos_h[:, :, 1::2].cos()), dim=3).flatten(2) + + pos = torch.cat((pos_y, pos_x, pos_w, pos_h), dim=2) + else: + raise ValueError(f"Unknown pos_tensor shape(-1):{pos_tensor.size(-1)}") + return pos.to(pos_tensor.dtype) + + +def inverse_sigmoid(x, eps=1e-5): + x = x.clamp(min=0, max=1) + x1 = x.clamp(min=eps) + x2 = (1 - x).clamp(min=eps) + return torch.log(x1 / x2) + + +# Modified from transformers.models.detr.modeling_detr.DetrAttention +class DetrAttention(nn.Module): + """ + Multi-headed attention from 'Attention Is All You Need' paper. + + Here, we add position embeddings to the queries and keys (as explained in the DETR paper). + """ + + def __init__( + self, + config: DabDetrConfig, + bias: bool = True, + ): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.num_heads = config.encoder_attention_heads + self.attention_dropout = config.attention_dropout + self.head_dim = self.hidden_size // self.num_heads + if self.head_dim * self.num_heads != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size} and `num_heads`:" + f" {self.num_heads})." 
+ ) + self.scaling = self.head_dim**-0.5 + self.k_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=bias) + self.v_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=bias) + self.q_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=bias) + self.out_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=bias) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + object_queries: Optional[torch.Tensor] = None, + key_value_states: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + batch_size, q_len, embed_dim = hidden_states.size() + # add position embeddings to the hidden states before projecting to queries and keys + if object_queries is not None: + hidden_states_original = hidden_states + hidden_states = hidden_states + object_queries + + query_states = self.q_proj(hidden_states) * self.scaling + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states_original) + + query_states = query_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2) + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) + + if attention_mask is not None: + attn_weights = attn_weights + attention_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (batch_size, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(batch_size, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + + attn_output = attn_output.reshape(batch_size, q_len, embed_dim) + attn_output = self.out_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights + + +# Modified from transformers.models.conditional_detr.modeling_conditional_detr.ConditionalDetrAttention with ConditionalDetr->DABDETR,Conditional DETR->DabDetr +class DabDetrAttention(nn.Module): + """ + Cross-Attention used in DAB-DETR 'DAB-DETR for Fast Training Convergence' paper. + + The key q_proj, k_proj, v_proj are defined outside the attention. This attention allows the dim of q, k to be + different to v. + """ + + def __init__(self, config: DabDetrConfig, bias: bool = True, is_cross: bool = False): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size * 2 if is_cross else config.hidden_size + self.output_dim = config.hidden_size + self.attention_heads = config.decoder_attention_heads + self.attention_dropout = config.attention_dropout + self.attention_head_dim = self.embed_dim // self.attention_heads + if self.attention_head_dim * self.attention_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `attention_heads`:" + f" {self.attention_heads})." 
+ ) + # head dimension of values + self.values_head_dim = self.output_dim // self.attention_heads + if self.values_head_dim * self.attention_heads != self.output_dim: + raise ValueError( + f"output_dim must be divisible by attention_heads (got `output_dim`: {self.output_dim} and `attention_heads`: {self.attention_heads})." + ) + self.scaling = self.attention_head_dim**-0.5 + self.output_proj = nn.Linear(self.output_dim, self.output_dim, bias=bias) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + key_states: Optional[torch.Tensor] = None, + value_states: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + batch_size, q_len, _ = hidden_states.size() + + # scaling query and refactor key-, value states + query_states = hidden_states * self.scaling + query_states = query_states.view(batch_size, -1, self.attention_heads, self.attention_head_dim).transpose(1, 2) + key_states = key_states.view(batch_size, -1, self.attention_heads, self.attention_head_dim).transpose(1, 2) + value_states = value_states.view(batch_size, -1, self.attention_heads, self.values_head_dim).transpose(1, 2) + + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) + + if attention_mask is not None: + attn_weights = attn_weights + attention_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_probs = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + attn_output = torch.matmul(attn_probs, value_states) + + if attn_output.size() != (batch_size, self.attention_heads, q_len, self.values_head_dim): + raise ValueError( + f"`attn_output` should be of size {(batch_size, self.attention_heads, q_len, self.values_head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + + attn_output = attn_output.reshape(batch_size, q_len, self.output_dim) + attn_output = self.output_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights + + +class DabDetrDecoderLayerSelfAttention(nn.Module): + def __init__(self, config: DabDetrConfig): + super().__init__() + self.dropout = config.dropout + self.self_attn_query_content_proj = nn.Linear(config.hidden_size, config.hidden_size) + self.self_attn_query_pos_proj = nn.Linear(config.hidden_size, config.hidden_size) + self.self_attn_key_content_proj = nn.Linear(config.hidden_size, config.hidden_size) + self.self_attn_key_pos_proj = nn.Linear(config.hidden_size, config.hidden_size) + self.self_attn_value_proj = nn.Linear(config.hidden_size, config.hidden_size) + self.self_attn = DabDetrAttention(config) + self.self_attn_layer_norm = nn.LayerNorm(config.hidden_size) + + def forward( + self, + hidden_states: torch.Tensor, + query_position_embeddings: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + ): + residual = hidden_states + query_content = self.self_attn_query_content_proj(hidden_states) + query_pos = self.self_attn_query_pos_proj(query_position_embeddings) + key_content = self.self_attn_key_content_proj(hidden_states) + key_pos = self.self_attn_key_pos_proj(query_position_embeddings) + value = self.self_attn_value_proj(hidden_states) + + query = query_content + query_pos + key = key_content + key_pos + 
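+        # Note: positional information is only added to the queries and keys; the values are
+        # projected from the content embeddings alone, so the attention output stays in content space.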
+ hidden_states, attn_weights = self.self_attn( + hidden_states=query, + attention_mask=attention_mask, + key_states=key, + value_states=value, + output_attentions=True, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + return hidden_states, attn_weights + + +class DabDetrDecoderLayerCrossAttention(nn.Module): + def __init__(self, config: DabDetrConfig, is_first: bool = False): + super().__init__() + hidden_size = config.hidden_size + self.cross_attn_query_content_proj = nn.Linear(hidden_size, hidden_size) + self.cross_attn_query_pos_proj = nn.Linear(hidden_size, hidden_size) + self.cross_attn_key_content_proj = nn.Linear(hidden_size, hidden_size) + self.cross_attn_key_pos_proj = nn.Linear(hidden_size, hidden_size) + self.cross_attn_value_proj = nn.Linear(hidden_size, hidden_size) + self.cross_attn_query_pos_sine_proj = nn.Linear(hidden_size, hidden_size) + self.decoder_attention_heads = config.decoder_attention_heads + self.cross_attn_layer_norm = nn.LayerNorm(hidden_size) + self.cross_attn = DabDetrAttention(config, is_cross=True) + + self.keep_query_pos = config.keep_query_pos + + if not self.keep_query_pos and not is_first: + self.cross_attn_query_pos_proj = None + + self.is_first = is_first + self.dropout = config.dropout + + def forward( + self, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + query_position_embeddings: Optional[torch.Tensor] = None, + object_queries: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + query_sine_embed: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + ): + query_content = self.cross_attn_query_content_proj(hidden_states) + key_content = self.cross_attn_key_content_proj(encoder_hidden_states) + value = self.cross_attn_value_proj(encoder_hidden_states) + + batch_size, num_queries, n_model = query_content.shape + _, height_width, _ = key_content.shape + + key_pos = self.cross_attn_key_pos_proj(object_queries) + + # For the first decoder layer, we add the positional embedding predicted from + # the object query (the positional embedding) into the original query (key) in DETR. 
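+        # In the remaining layers (unless `keep_query_pos` is set), queries and keys stay content-only here;
+        # the box-conditioned `query_sine_embed` is instead concatenated per attention head further below.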
+ if self.is_first or self.keep_query_pos: + query_pos = self.cross_attn_query_pos_proj(query_position_embeddings) + query = query_content + query_pos + key = key_content + key_pos + else: + query = query_content + key = key_content + + query = query.view( + batch_size, num_queries, self.decoder_attention_heads, n_model // self.decoder_attention_heads + ) + query_sine_embed = self.cross_attn_query_pos_sine_proj(query_sine_embed) + query_sine_embed = query_sine_embed.view( + batch_size, num_queries, self.decoder_attention_heads, n_model // self.decoder_attention_heads + ) + query = torch.cat([query, query_sine_embed], dim=3).view(batch_size, num_queries, n_model * 2) + key = key.view(batch_size, height_width, self.decoder_attention_heads, n_model // self.decoder_attention_heads) + key_pos = key_pos.view( + batch_size, height_width, self.decoder_attention_heads, n_model // self.decoder_attention_heads + ) + key = torch.cat([key, key_pos], dim=3).view(batch_size, height_width, n_model * 2) + + # Cross-Attention Block + cross_attn_weights = None + if encoder_hidden_states is not None: + residual = hidden_states + + hidden_states, cross_attn_weights = self.cross_attn( + hidden_states=query, + attention_mask=encoder_attention_mask, + key_states=key, + value_states=value, + output_attentions=output_attentions, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.cross_attn_layer_norm(hidden_states) + + return hidden_states, cross_attn_weights + + +class DabDetrDecoderLayerFFN(nn.Module): + def __init__(self, config: DabDetrConfig): + super().__init__() + hidden_size = config.hidden_size + self.final_layer_norm = nn.LayerNorm(hidden_size) + self.fc1 = nn.Linear(hidden_size, config.decoder_ffn_dim) + self.fc2 = nn.Linear(config.decoder_ffn_dim, hidden_size) + self.activation_fn = ACT2FN[config.activation_function] + self.dropout = config.dropout + self.activation_dropout = config.activation_dropout + self.keep_query_pos = config.keep_query_pos + + def forward(self, hidden_states: torch.Tensor): + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + return hidden_states + + +# Modified from transformers.models.detr.modeling_detr.DetrEncoderLayer with DetrEncoderLayer->DabDetrEncoderLayer,DetrConfig->DabDetrConfig +class DabDetrEncoderLayer(GradientCheckpointingLayer): + def __init__(self, config: DabDetrConfig): + super().__init__() + self.hidden_size = config.hidden_size + self.self_attn = DetrAttention(config) + self.self_attn_layer_norm = nn.LayerNorm(self.hidden_size) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.fc1 = nn.Linear(self.hidden_size, config.encoder_ffn_dim) + self.fc2 = nn.Linear(config.encoder_ffn_dim, self.hidden_size) + self.final_layer_norm = nn.LayerNorm(self.hidden_size) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + object_queries: torch.Tensor, + output_attentions: Optional[bool] = None, + ): + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, 
embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, source_len)` where padding elements are indicated by very large negative + values. + object_queries (`torch.FloatTensor`, *optional*): + Object queries (also called content embeddings), to be added to the hidden states. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + object_queries=object_queries, + output_attentions=output_attentions, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +# Modified from transformers.models.conditional_detr.modeling_conditional_detr.ConditionalDetrDecoderLayer with ConditionalDetr->DabDetr +class DabDetrDecoderLayer(GradientCheckpointingLayer): + def __init__(self, config: DabDetrConfig, is_first: bool = False): + super().__init__() + self.self_attn = DabDetrDecoderLayerSelfAttention(config) + self.cross_attn = DabDetrDecoderLayerCrossAttention(config, is_first) + self.mlp = DabDetrDecoderLayerFFN(config) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + object_queries: Optional[torch.Tensor] = None, + query_position_embeddings: Optional[torch.Tensor] = None, + query_sine_embed: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + ): + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative + values. + object_queries (`torch.FloatTensor`, *optional*): + object_queries that are added to the queries and keys + in the cross-attention layer. + query_position_embeddings (`torch.FloatTensor`, *optional*): + object_queries that are added to the queries and keys + in the self-attention layer. + encoder_hidden_states (`torch.FloatTensor`): + cross attention input to the layer of shape `(seq_len, batch, embed_dim)` + encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size + `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative + values. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. 
+ + """ + hidden_states, self_attn_weights = self.self_attn( + hidden_states=hidden_states, + query_position_embeddings=query_position_embeddings, + attention_mask=attention_mask, + output_attentions=output_attentions, + ) + + hidden_states, cross_attn_weights = self.cross_attn( + hidden_states=hidden_states, + encoder_hidden_states=encoder_hidden_states, + query_position_embeddings=query_position_embeddings, + object_queries=object_queries, + encoder_attention_mask=encoder_attention_mask, + query_sine_embed=query_sine_embed, + output_attentions=output_attentions, + ) + + hidden_states = self.mlp(hidden_states=hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights, cross_attn_weights) + + return outputs + + +# Modified from transformers.models.detr.modeling_detr.DetrMLPPredictionHead with DetrMLPPredictionHead->DabDetrMLP +class DabDetrMLP(nn.Module): + """ + Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates, + height and width of a bounding box w.r.t. an image. + + Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py + + """ + + def __init__(self, input_dim, hidden_dim, output_dim, num_layers): + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + + def forward(self, input_tensor): + for i, layer in enumerate(self.layers): + input_tensor = nn.functional.relu(layer(input_tensor)) if i < self.num_layers - 1 else layer(input_tensor) + return input_tensor + + +# Modified from transformers.models.detr.modeling_detr.DetrPreTrainedModel with Detr->DabDetr +@auto_docstring +class DabDetrPreTrainedModel(PreTrainedModel): + config: DabDetrConfig + base_model_prefix = "model" + main_input_name = "pixel_values" + _no_split_modules = [r"DabDetrConvEncoder", r"DabDetrEncoderLayer", r"DabDetrDecoderLayer"] + + def _init_weights(self, module): + std = self.config.init_std + xavier_std = self.config.init_xavier_std + + if isinstance(module, DabDetrMHAttentionMap): + nn.init.zeros_(module.k_linear.bias) + nn.init.zeros_(module.q_linear.bias) + nn.init.xavier_uniform_(module.k_linear.weight, gain=xavier_std) + nn.init.xavier_uniform_(module.q_linear.weight, gain=xavier_std) + if isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.weight.data.fill_(1.0) + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, DabDetrForObjectDetection): + nn.init.constant_(module.bbox_predictor.layers[-1].weight.data, 0) + nn.init.constant_(module.bbox_predictor.layers[-1].bias.data, 0) + + # init prior_prob setting for focal loss + prior_prob = self.config.initializer_bias_prior_prob or 1 / (self.config.num_labels + 1) + bias_value = -math.log((1 - prior_prob) / prior_prob) + module.class_embed.bias.data.fill_(bias_value) + elif isinstance(module, nn.PReLU): + module.reset_parameters() + + +# Modified from transformers.models.detr.modeling_detr.DetrEncoder with 
Detr->DabDetr,DETR->ConditionalDETR +class DabDetrEncoder(DabDetrPreTrainedModel): + """ + Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a + [`DabDetrEncoderLayer`]. + + The encoder updates the flattened feature map through multiple self-attention layers. + + Small tweak for DAB-DETR: + + - object_queries are added to the forward pass. + + Args: + config: DabDetrConfig + """ + + def __init__(self, config: DabDetrConfig): + super().__init__(config) + + self.dropout = config.dropout + self.query_scale = DabDetrMLP(config.hidden_size, config.hidden_size, config.hidden_size, 2) + self.layers = nn.ModuleList([DabDetrEncoderLayer(config) for _ in range(config.encoder_layers)]) + self.norm = nn.LayerNorm(config.hidden_size) if config.normalize_before else None + self.gradient_checkpointing = False + + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + inputs_embeds, + attention_mask, + object_queries, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(sequence_length, batch_size, hidden_size)`): + Flattened feature map (output of the backbone + projection layer) that is passed to the encoder. + + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding pixel features. Mask values selected in `[0, 1]`: + + - 1 for pixel features that are real (i.e. **not masked**), + - 0 for pixel features that are padding (i.e. **masked**). + + [What are attention masks?](../glossary#attention-mask) + + object_queries (`torch.FloatTensor` of shape `(sequence_length, batch_size, hidden_size)`): + Object queries that are added to the queries in each self-attention layer. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + hidden_states = inputs_embeds + + # expand attention_mask + if attention_mask is not None: + # [batch_size, seq_len] -> [batch_size, 1, target_seq_len, source_seq_len] + attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype) + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + for encoder_layer in self.layers: + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + # pos scaler + pos_scales = self.query_scale(hidden_states) + # we add object_queries * pos_scaler as extra input to the encoder_layer + scaled_object_queries = object_queries * pos_scales + + layer_outputs = encoder_layer( + hidden_states, + attention_mask=attention_mask, + object_queries=scaled_object_queries, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if self.norm: + hidden_states = self.norm(hidden_states) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +# Modified from transformers.models.conditional_detr.modeling_conditional_detr.ConditionalDetrDecoder with ConditionalDetr->DabDetr,Conditional DETR->DAB-DETR +class DabDetrDecoder(DabDetrPreTrainedModel): + """ + Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`DabDetrDecoderLayer`]. + + The decoder updates the query embeddings through multiple self-attention and cross-attention layers. + + Some small tweaks for DAB-DETR: + + - object_queries and query_position_embeddings are added to the forward pass. + - if self.config.auxiliary_loss is set to True, also returns a stack of activations from all decoding layers. 
+ + Args: + config: DabDetrConfig + """ + + def __init__(self, config: DabDetrConfig): + super().__init__(config) + self.config = config + self.dropout = config.dropout + self.num_layers = config.decoder_layers + self.gradient_checkpointing = False + + self.layers = nn.ModuleList( + [DabDetrDecoderLayer(config, is_first=(layer_id == 0)) for layer_id in range(config.decoder_layers)] + ) + # in DAB-DETR, the decoder uses layernorm after the last decoder layer output + self.hidden_size = config.hidden_size + self.layernorm = nn.LayerNorm(self.hidden_size) + + # Default cond-elewise + self.query_scale = DabDetrMLP(self.hidden_size, self.hidden_size, self.hidden_size, 2) + + self.ref_point_head = DabDetrMLP( + config.query_dim // 2 * self.hidden_size, self.hidden_size, self.hidden_size, 2 + ) + + self.bbox_embed = None + + # Default decoder_modulate_hw_attn is True + self.ref_anchor_head = DabDetrMLP(self.hidden_size, self.hidden_size, 2, 2) + + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + inputs_embeds, + encoder_hidden_states, + memory_key_padding_mask, + object_queries, + query_position_embeddings, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(sequence_length, batch_size, hidden_size)`): + The query embeddings that are passed into the decoder. + encoder_hidden_states (`torch.FloatTensor` of shape `(encoder_sequence_length, batch_size, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + of the decoder. + memory_key_padding_mask (`torch.Tensor.bool` of shape `(batch_size, sequence_length)`): + The memory_key_padding_mask indicates which positions in the memory (encoder outputs) should be ignored during the attention computation, + ensuring padding tokens do not influence the attention mechanism. + object_queries (`torch.FloatTensor` of shape `(sequence_length, batch_size, hidden_size)`, *optional*): + Position embeddings that are added to the queries and keys in each cross-attention layer. + query_position_embeddings (`torch.FloatTensor` of shape `(num_queries, batch_size, number_of_anchor_points)`): + Position embeddings that are added to the queries and keys in each self-attention layer. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if inputs_embeds is not None: + hidden_states = inputs_embeds + input_shape = inputs_embeds.size()[:-1] + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + + intermediate = [] + reference_points = query_position_embeddings.sigmoid() + ref_points = [reference_points] + + # expand encoder attention mask + if encoder_hidden_states is not None and memory_key_padding_mask is not None: + # [batch_size, seq_len] -> [batch_size, 1, target_seq_len, source_seq_len] + memory_key_padding_mask = _prepare_4d_attention_mask( + memory_key_padding_mask, inputs_embeds.dtype, tgt_len=input_shape[-1] + ) + + for layer_id, decoder_layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + obj_center = reference_points[..., : self.config.query_dim] + query_sine_embed = gen_sine_position_embeddings(obj_center, self.hidden_size) + query_pos = self.ref_point_head(query_sine_embed) + + # For the first decoder layer, we do not apply transformation over p_s + pos_transformation = 1 if layer_id == 0 else self.query_scale(hidden_states) + + # apply transformation + query_sine_embed = query_sine_embed[..., : self.hidden_size] * pos_transformation + + # modulated Height Width attentions + reference_anchor_size = self.ref_anchor_head(hidden_states).sigmoid() # nq, bs, 2 + query_sine_embed[..., self.hidden_size // 2 :] *= ( + reference_anchor_size[..., 0] / obj_center[..., 2] + ).unsqueeze(-1) + query_sine_embed[..., : self.hidden_size // 2] *= ( + reference_anchor_size[..., 1] / obj_center[..., 3] + ).unsqueeze(-1) + + layer_outputs = decoder_layer( + hidden_states, + None, # attention_mask + object_queries, + query_pos, + query_sine_embed, + encoder_hidden_states, # as a positional argument for gradient checkpointing + encoder_attention_mask=memory_key_padding_mask, + output_attentions=output_attentions, + ) + + # iter update + hidden_states = layer_outputs[0] + + if self.bbox_embed is not None: + new_reference_points = self.bbox_embed(hidden_states) + + new_reference_points[..., : self.config.query_dim] += inverse_sigmoid(reference_points) + new_reference_points = new_reference_points[..., : self.config.query_dim].sigmoid() + if layer_id != self.num_layers - 1: + ref_points.append(new_reference_points) + reference_points = new_reference_points.detach() + + intermediate.append(self.layernorm(hidden_states)) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if encoder_hidden_states is not None: + all_cross_attentions += (layer_outputs[2],) + + # Layer normalization on hidden states + hidden_states = self.layernorm(hidden_states) + + if output_hidden_states: + all_hidden_states += (hidden_states,) + + output_intermediate_hidden_states = torch.stack(intermediate) + output_reference_points = torch.stack(ref_points) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + all_hidden_states, + all_self_attns, + all_cross_attentions, + output_intermediate_hidden_states, + output_reference_points, + ] + if v is not None + ) + return 
DabDetrDecoderOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attentions, + intermediate_hidden_states=output_intermediate_hidden_states, + reference_points=output_reference_points, + ) + + +@auto_docstring( + custom_intro=""" + The bare DAB-DETR Model (consisting of a backbone and encoder-decoder Transformer) outputting raw + hidden-states, intermediate hidden states, reference points, output coordinates without any specific head on top. + """ +) +class DabDetrModel(DabDetrPreTrainedModel): + def __init__(self, config: DabDetrConfig): + super().__init__(config) + + self.auxiliary_loss = config.auxiliary_loss + + # Create backbone + positional encoding + self.backbone = DabDetrConvEncoder(config) + object_queries = DabDetrSinePositionEmbedding(config) + + self.query_refpoint_embeddings = nn.Embedding(config.num_queries, config.query_dim) + self.random_refpoints_xy = config.random_refpoints_xy + if self.random_refpoints_xy: + self.query_refpoint_embeddings.weight.data[:, :2].uniform_(0, 1) + self.query_refpoint_embeddings.weight.data[:, :2] = inverse_sigmoid( + self.query_refpoint_embeddings.weight.data[:, :2] + ) + self.query_refpoint_embeddings.weight.data[:, :2].requires_grad = False + + # Create projection layer + self.input_projection = nn.Conv2d( + self.backbone.intermediate_channel_sizes[-1], config.hidden_size, kernel_size=1 + ) + self.backbone = DabDetrConvModel(self.backbone, object_queries) + + self.encoder = DabDetrEncoder(config) + self.decoder = DabDetrDecoder(config) + + # decoder related variables + self.hidden_size = config.hidden_size + self.num_queries = config.num_queries + + self.num_patterns = config.num_patterns + if not isinstance(self.num_patterns, int): + logger.warning(f"num_patterns should be int but {type(self.num_patterns)}") + self.num_patterns = 0 + if self.num_patterns > 0: + self.patterns = nn.Embedding(self.num_patterns, self.hidden_size) + + self.aux_loss = config.auxiliary_loss + + # Initialize weights and apply final processing + self.post_init() + + def get_encoder(self): + return self.encoder + + def freeze_backbone(self): + for name, param in self.backbone.conv_encoder.model.named_parameters(): + param.requires_grad_(False) + + def unfreeze_backbone(self): + for name, param in self.backbone.conv_encoder.model.named_parameters(): + param.requires_grad_(True) + + @auto_docstring + def forward( + self, + pixel_values: torch.FloatTensor, + pixel_mask: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + encoder_outputs: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.FloatTensor], DabDetrModelOutput]: + r""" + decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*): + Not used by default. Can be used to mask object queries. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you + can choose to directly pass a flattened representation of an image. 
+ decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*): + Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an + embedded representation. + + Examples: + + ```python + >>> from transformers import AutoImageProcessor, AutoModel + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("IDEA-Research/dab_detr-base") + >>> model = AutoModel.from_pretrained("IDEA-Research/dab_detr-base") + + >>> # prepare image for the model + >>> inputs = image_processor(images=image, return_tensors="pt") + + >>> # forward pass + >>> outputs = model(**inputs) + + >>> # the last hidden states are the final query embeddings of the Transformer decoder + >>> # these are of shape (batch_size, num_queries, hidden_size) + >>> last_hidden_states = outputs.last_hidden_state + >>> list(last_hidden_states.shape) + [1, 300, 256] + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + batch_size, _, height, width = pixel_values.shape + device = pixel_values.device + + if pixel_mask is None: + pixel_mask = torch.ones(((batch_size, height, width)), device=device) + + # First, sent pixel_values + pixel_mask through Backbone to obtain the features + # pixel_values should be of shape (batch_size, num_channels, height, width) + # pixel_mask should be of shape (batch_size, height, width) + features, object_queries_list = self.backbone(pixel_values, pixel_mask) + + # get final feature map and downsampled mask + feature_map, mask = features[-1] + + if mask is None: + raise ValueError("Backbone does not return downsampled pixel mask") + + flattened_mask = mask.flatten(1) + + # Second, apply 1x1 convolution to reduce the channel dimension to hidden_size (256 by default) + projected_feature_map = self.input_projection(feature_map) + + # Third, flatten the feature map + object_queries of shape NxCxHxW to HWxNxC, and permute it to NxHWxC + # In other words, turn their shape into ( sequence_length, batch_size, hidden_size) + flattened_features = projected_feature_map.flatten(2).permute(0, 2, 1) + object_queries = object_queries_list[-1].flatten(2).permute(0, 2, 1) + reference_position_embeddings = self.query_refpoint_embeddings.weight.unsqueeze(0).repeat(batch_size, 1, 1) + + # Fourth, sent flattened_features + flattened_mask + object_queries through encoder + # flattened_features is a Tensor of shape (height*width, batch_size, hidden_size) + # flattened_mask is a Tensor of shape (batch_size, height*width) + if encoder_outputs is None: + encoder_outputs = self.encoder( + inputs_embeds=flattened_features, + attention_mask=flattened_mask, + object_queries=object_queries, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if 
len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + # Fifth, sent query embeddings + object_queries through the decoder (which is conditioned on the encoder output) + num_queries = reference_position_embeddings.shape[1] + if self.num_patterns == 0: + queries = torch.zeros(batch_size, num_queries, self.hidden_size, device=device) + else: + queries = ( + self.patterns.weight[:, None, None, :] + .repeat(1, self.num_queries, batch_size, 1) + .flatten(0, 1) + .permute(1, 0, 2) + ) # bs, n_q*n_pat, hidden_size + reference_position_embeddings = reference_position_embeddings.repeat( + 1, self.num_patterns, 1 + ) # bs, n_q*n_pat, hidden_size + + # decoder outputs consists of (dec_features, dec_hidden, dec_attn) + decoder_outputs = self.decoder( + inputs_embeds=queries, + query_position_embeddings=reference_position_embeddings, + object_queries=object_queries, + encoder_hidden_states=encoder_outputs[0], + memory_key_padding_mask=flattened_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + # last_hidden_state + output = (decoder_outputs[0],) + reference_points = decoder_outputs[-1] + intermediate_hidden_states = decoder_outputs[-2] + + # it has to follow the order of DABDETRModelOutput that is based on ModelOutput + # If we only use one of the variables then the indexing will change. + # E.g: if we return everything then 'decoder_attentions' is decoder_outputs[2], if we only use output_attentions then its decoder_outputs[1] + if output_hidden_states and output_attentions: + output += ( + decoder_outputs[1], + decoder_outputs[2], + decoder_outputs[3], + encoder_outputs[0], + encoder_outputs[1], + encoder_outputs[2], + ) + elif output_hidden_states: + # decoder_hidden_states, encoder_last_hidden_state, encoder_hidden_states + output += ( + decoder_outputs[1], + encoder_outputs[0], + encoder_outputs[1], + ) + elif output_attentions: + # decoder_self_attention, decoder_cross_attention, encoder_attentions + output += ( + decoder_outputs[1], + decoder_outputs[2], + encoder_outputs[1], + ) + + output += (intermediate_hidden_states, reference_points) + + return output + + reference_points = decoder_outputs.reference_points + intermediate_hidden_states = decoder_outputs.intermediate_hidden_states + + return DabDetrModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + decoder_hidden_states=decoder_outputs.hidden_states if output_hidden_states else None, + decoder_attentions=decoder_outputs.attentions if output_attentions else None, + cross_attentions=decoder_outputs.cross_attentions if output_attentions else None, + encoder_last_hidden_state=encoder_outputs.last_hidden_state if output_hidden_states else None, + encoder_hidden_states=encoder_outputs.hidden_states if output_hidden_states else None, + encoder_attentions=encoder_outputs.attentions if output_attentions else None, + intermediate_hidden_states=intermediate_hidden_states, + reference_points=reference_points, + ) + + +# Copied from transformers.models.detr.modeling_detr.DetrMHAttentionMap with Detr->DabDetr +class DabDetrMHAttentionMap(nn.Module): + """This is a 2D attention module, which only returns the attention softmax (no multiplication by value)""" + + def __init__(self, query_dim, hidden_dim, num_heads, dropout=0.0, bias=True, std=None): + super().__init__() + self.num_heads = num_heads + self.hidden_dim = hidden_dim + self.dropout = nn.Dropout(dropout) + + self.q_linear = 
nn.Linear(query_dim, hidden_dim, bias=bias) + self.k_linear = nn.Linear(query_dim, hidden_dim, bias=bias) + + self.normalize_fact = float(hidden_dim / self.num_heads) ** -0.5 + + def forward(self, q, k, mask: Optional[Tensor] = None): + q = self.q_linear(q) + k = nn.functional.conv2d(k, self.k_linear.weight.unsqueeze(-1).unsqueeze(-1), self.k_linear.bias) + queries_per_head = q.view(q.shape[0], q.shape[1], self.num_heads, self.hidden_dim // self.num_heads) + keys_per_head = k.view(k.shape[0], self.num_heads, self.hidden_dim // self.num_heads, k.shape[-2], k.shape[-1]) + weights = torch.einsum("bqnc,bnchw->bqnhw", queries_per_head * self.normalize_fact, keys_per_head) + + if mask is not None: + weights = weights.masked_fill(mask.unsqueeze(1).unsqueeze(1), torch.finfo(weights.dtype).min) + weights = nn.functional.softmax(weights.flatten(2), dim=-1).view(weights.size()) + weights = self.dropout(weights) + return weights + + +@auto_docstring( + custom_intro=""" + DAB_DETR Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on + top, for tasks such as COCO detection. + """ +) +class DabDetrForObjectDetection(DabDetrPreTrainedModel): + # When using clones, all layers > 0 will be clones, but layer 0 *is* required + _tied_weights_keys = [ + r"bbox_predictor\.layers\.\d+\.(weight|bias)", + r"model\.decoder\.bbox_embed\.layers\.\d+\.(weight|bias)", + ] + + def __init__(self, config: DabDetrConfig): + super().__init__(config) + + self.config = config + self.auxiliary_loss = config.auxiliary_loss + self.query_dim = config.query_dim + # DAB-DETR encoder-decoder model + self.model = DabDetrModel(config) + + _bbox_embed = DabDetrMLP(config.hidden_size, config.hidden_size, 4, 3) + # Object detection heads + self.class_embed = nn.Linear(config.hidden_size, config.num_labels) + + # Default bbox_embed_diff_each_layer is False + self.bbox_predictor = _bbox_embed + + # Default iter_update is True + self.model.decoder.bbox_embed = self.bbox_predictor + + # Initialize weights and apply final processing + self.post_init() + + # taken from https://github.com/Atten4Vis/conditionalDETR/blob/master/models/dab_detr.py + @torch.jit.unused + def _set_aux_loss(self, outputs_class, outputs_coord): + # this is a workaround to make torchscript happy, as torchscript + # doesn't support dictionary with non-homogeneous values, such + # as a dict having both a Tensor and a list. + return [{"logits": a, "pred_boxes": b} for a, b in zip(outputs_class[:-1], outputs_coord[:-1])] + + @auto_docstring + def forward( + self, + pixel_values: torch.FloatTensor, + pixel_mask: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + encoder_outputs: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[list[dict]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.FloatTensor], DabDetrObjectDetectionOutput]: + r""" + decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*): + Not used by default. Can be used to mask object queries. 
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you + can choose to directly pass a flattened representation of an image. + decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*): + Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an + embedded representation. + labels (`list[Dict]` of len `(batch_size,)`, *optional*): + Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the + following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch + respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes + in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`. + + Examples: + + ```python + >>> from transformers import AutoImageProcessor, AutoModelForObjectDetection + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("IDEA-Research/dab-detr-resnet-50") + >>> model = AutoModelForObjectDetection.from_pretrained("IDEA-Research/dab-detr-resnet-50") + + >>> inputs = image_processor(images=image, return_tensors="pt") + + >>> with torch.no_grad(): + >>> outputs = model(**inputs) + + >>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax) + >>> target_sizes = torch.tensor([(image.height, image.width)]) + >>> results = image_processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)[0] + >>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]): + ... box = [round(i, 2) for i in box.tolist()] + ... print( + ... f"Detected {model.config.id2label[label.item()]} with confidence " + ... f"{round(score.item(), 3)} at location {box}" + ... 
) + Detected remote with confidence 0.833 at location [38.31, 72.1, 177.63, 118.45] + Detected cat with confidence 0.831 at location [9.2, 51.38, 321.13, 469.0] + Detected cat with confidence 0.804 at location [340.3, 16.85, 642.93, 370.95] + Detected remote with confidence 0.683 at location [334.48, 73.49, 366.37, 190.01] + Detected couch with confidence 0.535 at location [0.52, 1.19, 640.35, 475.1] + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # First, sent images through DAB_DETR base model to obtain encoder + decoder outputs + model_outputs = self.model( + pixel_values, + pixel_mask=pixel_mask, + decoder_attention_mask=decoder_attention_mask, + encoder_outputs=encoder_outputs, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + reference_points = model_outputs.reference_points if return_dict else model_outputs[-1] + intermediate_hidden_states = model_outputs.intermediate_hidden_states if return_dict else model_outputs[-2] + + # class logits + predicted bounding boxes + logits = self.class_embed(intermediate_hidden_states[-1]) + + reference_before_sigmoid = inverse_sigmoid(reference_points) + bbox_with_refinement = self.bbox_predictor(intermediate_hidden_states) + bbox_with_refinement[..., : self.query_dim] += reference_before_sigmoid + outputs_coord = bbox_with_refinement.sigmoid() + + pred_boxes = outputs_coord[-1] + + loss, loss_dict, auxiliary_outputs = None, None, None + if labels is not None: + outputs_class = None + if self.config.auxiliary_loss: + outputs_class = self.class_embed(intermediate_hidden_states) + loss, loss_dict, auxiliary_outputs = self.loss_function( + logits, labels, self.device, pred_boxes, self.config, outputs_class, outputs_coord + ) + + if not return_dict: + if auxiliary_outputs is not None: + output = (logits, pred_boxes) + auxiliary_outputs + model_outputs + else: + output = (logits, pred_boxes) + model_outputs + # Since DabDetrObjectDetectionOutput doesn't have reference points + intermedieate_hidden_states we cut down. 
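+            # i.e. when no loss is computed, slice off the trailing (intermediate_hidden_states, reference_points)
+            # entries that DabDetrModel appends to its tuple output.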
+ return ((loss, loss_dict) + output) if loss is not None else output[:-2] + + return DabDetrObjectDetectionOutput( + loss=loss, + loss_dict=loss_dict, + logits=logits, + pred_boxes=pred_boxes, + auxiliary_outputs=auxiliary_outputs, + last_hidden_state=model_outputs.last_hidden_state, + decoder_hidden_states=model_outputs.decoder_hidden_states if output_hidden_states else None, + decoder_attentions=model_outputs.decoder_attentions if output_attentions else None, + cross_attentions=model_outputs.cross_attentions if output_attentions else None, + encoder_last_hidden_state=model_outputs.encoder_last_hidden_state if output_hidden_states else None, + encoder_hidden_states=model_outputs.encoder_hidden_states if output_hidden_states else None, + encoder_attentions=model_outputs.encoder_attentions if output_attentions else None, + ) + + +__all__ = [ + "DabDetrForObjectDetection", + "DabDetrModel", + "DabDetrPreTrainedModel", +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dac/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dac/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..40f84ccb59be91ad440ba413366dd726e555ca26 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dac/__init__.py @@ -0,0 +1,28 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_dac import * + from .feature_extraction_dac import * + from .modeling_dac import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dac/configuration_dac.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dac/configuration_dac.py new file mode 100644 index 0000000000000000000000000000000000000000..bdf07cef45bde6a6064d4b6b02eb69f4f077e018 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dac/configuration_dac.py @@ -0,0 +1,114 @@ +# coding=utf-8 +# Copyright 2024 Descript and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Dac model configuration""" + +import math + +import numpy as np + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class DacConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of an [`DacModel`]. It is used to instantiate a + Dac model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the + [descript/dac_16khz](https://huggingface.co/descript/dac_16khz) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + encoder_hidden_size (`int`, *optional*, defaults to 64): + Intermediate representation dimension for the encoder. + downsampling_ratios (`list[int]`, *optional*, defaults to `[2, 4, 8, 8]`): + Ratios for downsampling in the encoder. These are used in reverse order for upsampling in the decoder. + decoder_hidden_size (`int`, *optional*, defaults to 1536): + Intermediate representation dimension for the decoder. + n_codebooks (`int`, *optional*, defaults to 9): + Number of codebooks in the VQVAE. + codebook_size (`int`, *optional*, defaults to 1024): + Number of discrete codes in each codebook. + codebook_dim (`int`, *optional*, defaults to 8): + Dimension of the codebook vectors. If not defined, uses `encoder_hidden_size`. + quantizer_dropout (`bool`, *optional*, defaults to 0): + Whether to apply dropout to the quantizer. + commitment_loss_weight (float, *optional*, defaults to 0.25): + Weight of the commitment loss term in the VQVAE loss function. + codebook_loss_weight (float, *optional*, defaults to 1.0): + Weight of the codebook loss term in the VQVAE loss function. + sampling_rate (`int`, *optional*, defaults to 16000): + The sampling rate at which the audio waveform should be digitalized expressed in hertz (Hz). 
+ Example: + + ```python + >>> from transformers import DacModel, DacConfig + + >>> # Initializing a "descript/dac_16khz" style configuration + >>> configuration = DacConfig() + + >>> # Initializing a model (with random weights) from the "descript/dac_16khz" style configuration + >>> model = DacModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "dac" + + def __init__( + self, + encoder_hidden_size=64, + downsampling_ratios=[2, 4, 8, 8], + decoder_hidden_size=1536, + n_codebooks=9, + codebook_size=1024, + codebook_dim=8, + quantizer_dropout=0, + commitment_loss_weight=0.25, + codebook_loss_weight=1.0, + sampling_rate=16000, + **kwargs, + ): + self.encoder_hidden_size = encoder_hidden_size + self.downsampling_ratios = downsampling_ratios + self.decoder_hidden_size = decoder_hidden_size + self.upsampling_ratios = downsampling_ratios[::-1] + self.n_codebooks = n_codebooks + self.codebook_size = codebook_size + self.codebook_dim = codebook_dim + self.quantizer_dropout = quantizer_dropout + self.sampling_rate = sampling_rate + + self.hidden_size = encoder_hidden_size * (2 ** len(downsampling_ratios)) + + self.hop_length = int(np.prod(downsampling_ratios)) + self.commitment_loss_weight = commitment_loss_weight + self.codebook_loss_weight = codebook_loss_weight + + super().__init__(**kwargs) + + @property + def frame_rate(self) -> int: + hop_length = np.prod(self.upsampling_ratios) + return math.ceil(self.sampling_rate / hop_length) + + +__all__ = ["DacConfig"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dac/feature_extraction_dac.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dac/feature_extraction_dac.py new file mode 100644 index 0000000000000000000000000000000000000000..e81a2466dd91faa91ad9a34590825aa55e9497db --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dac/feature_extraction_dac.py @@ -0,0 +1,174 @@ +# coding=utf-8 +# Copyright 2024 Descript and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Feature extractor class for DAC""" + +from typing import Optional, Union + +import numpy as np + +from ...feature_extraction_sequence_utils import SequenceFeatureExtractor +from ...feature_extraction_utils import BatchFeature +from ...utils import PaddingStrategy, TensorType, logging + + +logger = logging.get_logger(__name__) + + +class DacFeatureExtractor(SequenceFeatureExtractor): + r""" + Constructs an Dac feature extractor. + + This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains + most of the main methods. Users should refer to this superclass for more information regarding those methods. + + Args: + feature_size (`int`, *optional*, defaults to 1): + The feature dimension of the extracted features. Use 1 for mono, 2 for stereo. 
+ sampling_rate (`int`, *optional*, defaults to 16000): + The sampling rate at which the audio waveform should be digitalized, expressed in hertz (Hz). + padding_value (`float`, *optional*, defaults to 0.0): + The value that is used for padding. + hop_length (`int`, *optional*, defaults to 512): + Overlap length between successive windows. + """ + + model_input_names = ["input_values", "n_quantizers"] + + def __init__( + self, + feature_size: int = 1, + sampling_rate: int = 16000, + padding_value: float = 0.0, + hop_length: int = 512, + **kwargs, + ): + super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs) + self.hop_length = hop_length + + def __call__( + self, + raw_audio: Union[np.ndarray, list[float], list[np.ndarray], list[list[float]]], + padding: Optional[Union[bool, str, PaddingStrategy]] = None, + truncation: Optional[bool] = False, + max_length: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + sampling_rate: Optional[int] = None, + ) -> BatchFeature: + """ + Main method to featurize and prepare for the model one or several sequence(s). + + Args: + raw_audio (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`): + The sequence or batch of sequences to be processed. Each sequence can be a numpy array, a list of float + values, a list of numpy arrays or a list of list of float values. The numpy array must be of shape + `(num_samples,)` for mono audio (`feature_size = 1`), or `(2, num_samples)` for stereo audio + (`feature_size = 2`). + padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`): + Select a strategy to pad the returned sequences (according to the model's padding side and padding + index) among: + + - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single + sequence if provided). + - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum + acceptable input length for the model if that argument is not provided. + - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different + lengths). + truncation (`bool`, *optional*, defaults to `False`): + Activates truncation to cut input sequences longer than `max_length` to `max_length`. + max_length (`int`, *optional*): + Maximum length of the returned list and optionally padding length (see above). + return_tensors (`str` or [`~utils.TensorType`], *optional*, default to 'pt'): + If set, will return tensors instead of list of python integers. Acceptable values are: + + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return Numpy `np.ndarray` objects. + sampling_rate (`int`, *optional*): + The sampling rate at which the `audio` input was sampled. It is strongly recommended to pass + `sampling_rate` at the forward call to prevent silent errors. + """ + if sampling_rate is not None: + if sampling_rate != self.sampling_rate: + raise ValueError( + f"The model corresponding to this feature extractor: {self} was trained using a sampling rate of" + f" {self.sampling_rate}. Please make sure that the provided audio input was sampled with" + f" {self.sampling_rate} and not {sampling_rate}." + ) + else: + logger.warning( + f"It is strongly recommended to pass the `sampling_rate` argument to `{self.__class__.__name__}()`. " + "Failing to do so can result in silent errors that might be hard to debug." 
+ ) + + if padding and truncation: + raise ValueError("Both padding and truncation were set. Make sure you only set one.") + elif padding is None: + # by default let's pad the inputs + padding = True + + is_batched = bool( + isinstance(raw_audio, (list, tuple)) and (isinstance(raw_audio[0], (np.ndarray, tuple, list))) + ) + + if is_batched: + raw_audio = [np.asarray(audio, dtype=np.float32).T for audio in raw_audio] + elif not is_batched and not isinstance(raw_audio, np.ndarray): + raw_audio = np.asarray(raw_audio, dtype=np.float32) + elif isinstance(raw_audio, np.ndarray) and raw_audio.dtype is np.dtype(np.float64): + raw_audio = raw_audio.astype(np.float32) + + # always return batch + if not is_batched: + raw_audio = [np.asarray(raw_audio).T] + + # verify inputs are valid + for idx, example in enumerate(raw_audio): + if example.ndim > 2: + raise ValueError(f"Expected input shape (channels, length) but got shape {example.shape}") + if self.feature_size == 1 and example.ndim != 1: + raise ValueError(f"Expected mono audio but example has {example.shape[-1]} channels") + if self.feature_size == 2: + raise ValueError("Stereo audio isn't supported for now") + + input_values = BatchFeature({"input_values": raw_audio}) + + # normal padding on batch + padded_inputs = self.pad( + input_values, + max_length=max_length, + truncation=truncation, + padding=padding, + return_attention_mask=padding, + pad_to_multiple_of=self.hop_length, + ) + if padding: + padded_inputs["padding_mask"] = padded_inputs.pop("attention_mask") + if padding: + padded_inputs.input_values = padded_inputs.input_values[:, np.newaxis, :] + + input_values = [] + for example in padded_inputs.pop("input_values"): + if self.feature_size == 1: + example = example[..., None] + input_values.append(example.T) + + padded_inputs["input_values"] = input_values + if return_tensors is not None: + padded_inputs = padded_inputs.convert_to_tensors(return_tensors) + + return padded_inputs + + +__all__ = ["DacFeatureExtractor"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dac/modeling_dac.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dac/modeling_dac.py new file mode 100644 index 0000000000000000000000000000000000000000..e97c8183651e1fa45d0c0886e258aa564b701239 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dac/modeling_dac.py @@ -0,0 +1,685 @@ +# coding=utf-8 +# Copyright 2024 Descript and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Transformers DAC model.""" + +import math +from dataclasses import dataclass +from typing import Optional + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from ...modeling_utils import PreTrainedAudioTokenizerBase +from ...utils import ModelOutput, auto_docstring +from .configuration_dac import DacConfig + + +@dataclass +@auto_docstring +class DacOutput(ModelOutput): + r""" + loss (`torch.Tensor`): + Loss from the encoder model, comprising the weighted combination of the commitment and codebook losses. + audio_values (`torch.Tensor` of shape `(batch_size, input_length)`): + Reconstructed audio data. + quantized_representation (`torch.Tensor` of shape `(batch_size, dimension, time_steps)`): + Quantized continuous representation of input. + audio_codes (`torch.LongTensor` of shape `(batch_size, num_codebooks, time_steps)`): + Codebook indices for each codebook (quantized discrete representation of input). + projected_latents (`torch.Tensor` of shape `(batch_size, num_codebooks * dimension, time_steps)`): + Projected latents (continuous representation of input before quantization). + """ + + loss: Optional[torch.FloatTensor] = None + audio_values: Optional[torch.FloatTensor] = None + quantized_representation: Optional[torch.FloatTensor] = None + audio_codes: Optional[torch.LongTensor] = None + projected_latents: Optional[torch.FloatTensor] = None + + +@dataclass +@auto_docstring +class DacEncoderOutput(ModelOutput): + r""" + loss (`torch.Tensor`): + Loss from the encoder model, comprising the weighted combination of the commitment and codebook losses. + quantized_representation (`torch.Tensor` of shape `(batch_size, dimension, time_steps)`, *optional*): + Quantized continuous representation of input. + audio_codes (`torch.Tensor` of shape `(batch_size, num_codebooks, time_steps)`, *optional*): + Codebook indices for each codebook (quantized discrete representation of input). + projected_latents (`torch.Tensor` of shape `(batch_size, num_codebooks * dimension, time_steps)`, *optional*): + Projected latents (continuous representation of input before quantization). + """ + + loss: Optional[torch.FloatTensor] = None + quantized_representation: Optional[torch.FloatTensor] = None + audio_codes: Optional[torch.FloatTensor] = None + projected_latents: Optional[torch.FloatTensor] = None + + +@dataclass +@auto_docstring +# Copied from transformers.models.encodec.modeling_encodec.EncodecDecoderOutput with Encodec->Dac, segment_length->input_length +class DacDecoderOutput(ModelOutput): + r""" + audio_values (`torch.FloatTensor` of shape `(batch_size, input_length)`, *optional*): + Decoded audio values, obtained using the decoder part of Dac. + """ + + audio_values: Optional[torch.FloatTensor] = None + + +class Snake1d(nn.Module): + """ + A 1-dimensional Snake activation function module. 
+ """ + + def __init__(self, hidden_dim): + super().__init__() + self.alpha = nn.Parameter(torch.ones(1, hidden_dim, 1)) + + def forward(self, hidden_states): + shape = hidden_states.shape + hidden_states = hidden_states.reshape(shape[0], shape[1], -1) + hidden_states = hidden_states + (self.alpha + 1e-9).reciprocal() * torch.sin(self.alpha * hidden_states).pow(2) + hidden_states = hidden_states.reshape(shape) + return hidden_states + + +class DacVectorQuantize(nn.Module): + """ + Implementation of VQ similar to Karpathy's repo (https://github.com/karpathy/deep-vector-quantization) + + Additionally uses following tricks from improved VQGAN + (https://huggingface.co/papers/2110.04627): + 1. Factorized codes: Perform nearest neighbor lookup in low-dimensional space + for improved codebook usage + 2. l2-normalized codes: Converts euclidean distance to cosine similarity which + improves training stability + """ + + def __init__(self, config: DacConfig): + super().__init__() + + self.codebook_dim = config.codebook_dim + self.in_proj = nn.Conv1d(config.hidden_size, config.codebook_dim, kernel_size=1) + self.out_proj = nn.Conv1d(config.codebook_dim, config.hidden_size, kernel_size=1) + self.codebook = nn.Embedding(config.codebook_size, config.codebook_dim) + + def forward(self, hidden_state): + """ + Quantizes the input tensor using a fixed codebook and returns the corresponding codebook vectors. + + Args: + hidden_state (`torch.FloatTensor` of shape `(batch_size, dimension, time_steps)`): + Input tensor. + + Returns: + quantized_representation (`torch.Tensor`of shape `(batch_size, dimension, time_steps)`): + Quantized continuous representation of input. + commitment_loss (`torch.FloatTensor`of shape `(1)`): + Commitment loss to train encoder to predict vectors closer to codebook entries. + codebook_loss (`torch.FloatTensor`of shape `(1)`): + Codebook loss to update the codebook. + audio_codes (`torch.LongTensor` of shape `(batch_size, time_steps)`): + Codebook indices for each codebook, quantized discrete representation of input. + projected_latents (torch.FloatTensor of shape `(batch_size, num_codebooks * dimension, time_steps)`): + Projected latents (continuous representation of input before quantization). 
+ """ + + projected_latents = self.in_proj(hidden_state) + quantized_representation, audio_codes = self.decode_latents(projected_latents) + + commitment_loss = F.mse_loss(projected_latents, quantized_representation.detach(), reduction="mean") + codebook_loss = F.mse_loss(quantized_representation, projected_latents.detach(), reduction="mean") + # noop in forward pass, straight-through gradient estimator in backward pass + quantized_representation = projected_latents + (quantized_representation - projected_latents).detach() + quantized_representation = self.out_proj(quantized_representation) + + return quantized_representation, commitment_loss, codebook_loss, audio_codes, projected_latents + + def decode_latents(self, hidden_states): + batch_size, hidden_dim, sequence_length = hidden_states.shape + encodings = hidden_states.permute(0, 2, 1).reshape(batch_size * sequence_length, hidden_dim) + codebook = self.codebook.weight # codebook: (N x D) + + # L2 normalize encodings and codebook (ViT-VQGAN) + encodings = F.normalize(encodings) + codebook = F.normalize(codebook) + + # Compute euclidean distance with codebook + l2_norm = encodings.pow(2).sum(1, keepdim=True) + dist = -(l2_norm - 2 * encodings @ codebook.t()) + codebook.pow(2).sum(1, keepdim=True).t() + + indices = dist.max(1)[1] + indices = indices.reshape(hidden_states.size(0), -1) + quantized_representation = self.codebook(indices).transpose(1, 2) + return quantized_representation, indices + + +class DacResidualUnit(nn.Module): + """ + A residual unit composed of Snake1d and weight-normalized Conv1d layers with dilations. + """ + + def __init__(self, dimension: int = 16, dilation: int = 1): + super().__init__() + pad = ((7 - 1) * dilation) // 2 + + self.snake1 = Snake1d(dimension) + self.conv1 = nn.Conv1d(dimension, dimension, kernel_size=7, dilation=dilation, padding=pad) + self.snake2 = Snake1d(dimension) + self.conv2 = nn.Conv1d(dimension, dimension, kernel_size=1) + + def forward(self, hidden_state): + """ + Forward pass through the residual unit. + + Args: + hidden_state (`torch.Tensor` of shape `(batch_size, channels, time_steps)`): + Input tensor . + + Returns: + output_tensor (`torch.Tensor` of shape `(batch_size, channels, time_steps)`): + Input tensor after passing through the residual unit. 
+ """ + output_tensor = hidden_state + output_tensor = self.conv1(self.snake1(output_tensor)) + output_tensor = self.conv2(self.snake2(output_tensor)) + + padding = (hidden_state.shape[-1] - output_tensor.shape[-1]) // 2 + if padding > 0: + hidden_state = hidden_state[..., padding:-padding] + output_tensor = hidden_state + output_tensor + return output_tensor + + +class DacEncoderBlock(nn.Module): + """Encoder block used in DAC encoder.""" + + def __init__(self, config: DacConfig, stride: int = 1, stride_index: int = 1): + super().__init__() + + dimension = config.encoder_hidden_size * 2**stride_index + self.res_unit1 = DacResidualUnit(dimension // 2, dilation=1) + self.res_unit2 = DacResidualUnit(dimension // 2, dilation=3) + self.res_unit3 = DacResidualUnit(dimension // 2, dilation=9) + self.snake1 = Snake1d(dimension // 2) + self.conv1 = nn.Conv1d( + dimension // 2, dimension, kernel_size=2 * stride, stride=stride, padding=math.ceil(stride / 2) + ) + + def forward(self, hidden_state): + hidden_state = self.res_unit1(hidden_state) + hidden_state = self.res_unit2(hidden_state) + hidden_state = self.snake1(self.res_unit3(hidden_state)) + hidden_state = self.conv1(hidden_state) + + return hidden_state + + +class DacDecoderBlock(nn.Module): + """Decoder block used in DAC decoder.""" + + def __init__(self, config: DacConfig, stride: int = 1, stride_index: int = 1): + super().__init__() + + input_dim = config.decoder_hidden_size // 2**stride_index + output_dim = config.decoder_hidden_size // 2 ** (stride_index + 1) + self.snake1 = Snake1d(input_dim) + self.conv_t1 = nn.ConvTranspose1d( + input_dim, + output_dim, + kernel_size=2 * stride, + stride=stride, + padding=math.ceil(stride / 2), + ) + + self.res_unit1 = DacResidualUnit(output_dim, dilation=1) + self.res_unit2 = DacResidualUnit(output_dim, dilation=3) + self.res_unit3 = DacResidualUnit(output_dim, dilation=9) + + def forward(self, hidden_state): + hidden_state = self.snake1(hidden_state) + hidden_state = self.conv_t1(hidden_state) + hidden_state = self.res_unit1(hidden_state) + hidden_state = self.res_unit2(hidden_state) + hidden_state = self.res_unit3(hidden_state) + + return hidden_state + + +class DacResidualVectorQuantize(nn.Module): + """ + ResidualVectorQuantize block - Introduced in SoundStream: An end2end neural audio codec (https://huggingface.co/papers/2107.03312) + """ + + def __init__(self, config: DacConfig): + super().__init__() + + n_codebooks = config.n_codebooks + quantizer_dropout = config.quantizer_dropout + + self.n_codebooks = n_codebooks + + self.quantizers = nn.ModuleList([DacVectorQuantize(config) for i in range(config.n_codebooks)]) + self.quantizer_dropout = quantizer_dropout + + def forward(self, hidden_state, n_quantizers: Optional[int] = None): + """ + Quantizes the input tensor using a fixed set of codebooks and returns corresponding codebook vectors. + Args: + hidden_state (`torch.Tensor` of shape `(batch_size, dimension, time_steps)`): + Input tensor to be quantized. + n_quantizers (`int`, *optional*): + Number of quantizers to use. If specified and `self.quantizer_dropout` is True, + this argument is ignored during training, and a random number of quantizers is used. + + Returns: + quantized_representation (`torch.Tensor` of shape `(batch_size, dimension, time_steps)`): + Quantized continuous representation of input. + audio_codes (`torch.Tensor` of shape `(batch_size, num_codebooks, time_steps)`): + Codebook indices for each codebook (quantized discrete representation of input). 
+ projected_latents (`torch.Tensor` of shape `(batch_size, num_codebooks * dimension, time_steps)`): + Projected latents (continuous representation of input before quantization). + commitment_loss (`torch.Tensor` of shape `(1)`): + Commitment loss to train the encoder to predict vectors closer to codebook entries. + codebook_loss (`torch.Tensor` of shape `(1)`): + Codebook loss to update the codebook. + """ + + quantized_representation = 0 + residual = hidden_state + commitment_loss = 0 + codebook_loss = 0 + + audio_codes = [] + projected_latents = [] + + n_quantizers = n_quantizers if n_quantizers is not None else self.n_codebooks + if self.training: + n_quantizers = torch.ones((hidden_state.shape[0],)) * self.n_codebooks + 1 + dropout = torch.randint(1, self.n_codebooks + 1, (hidden_state.shape[0],)) + n_dropout = int(hidden_state.shape[0] * self.quantizer_dropout) + n_quantizers[:n_dropout] = dropout[:n_dropout] + n_quantizers = n_quantizers.to(hidden_state.device) + + for i, quantizer in enumerate(self.quantizers): + if self.training is False and i >= n_quantizers: + break + + quantized_representation_i, commitment_loss_i, codebook_loss_i, indices_i, projected_latents_i = quantizer( + residual + ) + + # Create mask to apply quantizer dropout + mask = torch.full((hidden_state.shape[0],), fill_value=i, device=hidden_state.device) < n_quantizers + quantized_representation = quantized_representation + quantized_representation_i * mask[:, None, None] + residual = residual - quantized_representation_i + + # Sum losses + commitment_loss += commitment_loss_i * mask + codebook_loss += codebook_loss_i * mask + + audio_codes.append(indices_i) + projected_latents.append(projected_latents_i) + + audio_codes = torch.stack(audio_codes, dim=1) + projected_latents = torch.cat(projected_latents, dim=1) + + return quantized_representation, audio_codes, projected_latents, commitment_loss, codebook_loss + + def from_codes(self, audio_codes: torch.Tensor): + """ + Reconstructs the continuous representation from quantized codes. + + Args: + audio_codes (`torch.Tensor` of shape `(batch_size, num_codebooks, time_steps)`): + Quantized discrete representation of input. + + Returns: + quantized_representation (`torch.Tensor`): + Quantized continuous representation of input. + projected_latents (`torch.Tensor`): + List of projected latents (continuous representations of input before quantization) + for each codebook. + audio_codes (`torch.Tensor`): + Codebook indices for each codebook. + """ + quantized_representation = 0.0 + projected_latents = [] + n_codebooks = audio_codes.shape[1] + for i in range(n_codebooks): + projected_latents_i = self.quantizers[i].codebook(audio_codes[:, i, :]).transpose(1, 2) + projected_latents.append(projected_latents_i) + quantized_representation += self.quantizers[i].out_proj(projected_latents_i) + return quantized_representation, torch.cat(projected_latents, dim=1), audio_codes + + def from_latents(self, latents: torch.Tensor): + """Reconstructs the quantized representation from unquantized latents. + + Args: + latents (`torch.Tensor` of shape `(batch_size, total_latent_dimension, time_steps)`): + Continuous representation of input after projection. + + Returns: + quantized_representation (`torch.Tensor` of shape `(batch_size, dimension, time_steps)`): + Quantized representation of the full-projected space. 
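The quantizer loop in `DacResidualVectorQuantize.forward` above is plain residual VQ: each stage quantizes whatever the previous stages failed to explain, and the per-stage approximations are summed. A small NumPy sketch of that idea, with grid rounding standing in for real codebooks:

```python
import numpy as np

def fake_quantize(residual: np.ndarray, step: float) -> np.ndarray:
    # Stand-in for one codebook lookup: snap the residual to a grid of size `step`.
    return np.round(residual / step) * step

signal = np.array([0.73, -1.28, 2.04])
steps = [1.0, 0.25, 0.05]          # stand-ins for three codebooks of increasing resolution

quantized, residual = np.zeros_like(signal), signal
for step in steps:
    q_i = fake_quantize(residual, step)
    quantized = quantized + q_i    # running sum of all stages, as in the forward above
    residual = residual - q_i      # the next stage only sees what is still unexplained

print(quantized)                   # close to the original signal
print(np.abs(residual).max())      # residual error shrinks with every stage
```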
+ quantized_latents (`torch.Tensor` of shape `(batch_size, dimension, time_steps)`): + Quantized representation of the latent space (continuous representation before quantization). + """ + quantized_representation = 0 + quantized_latents = [] + codes = [] + codebook_dims_tensor = torch.tensor([0] + [q.codebook_dim for q in self.quantizers]) + dims = torch.cumsum(codebook_dims_tensor, dim=0) + + n_codebooks = np.where(dims <= latents.shape[1])[0].max(axis=0, keepdims=True)[0] + for i in range(n_codebooks): + hidden_dim_j, hidden_dim_k = dims[i], dims[i + 1] + quantized_latents_i, codes_i = self.quantizers[i].decode_latents(latents[:, hidden_dim_j:hidden_dim_k, :]) + quantized_latents.append(quantized_latents_i) + codes.append(codes_i) + + quantized_representation_i = self.quantizers[i].out_proj(quantized_latents_i) + quantized_representation = quantized_representation + quantized_representation_i + + return quantized_representation, torch.cat(quantized_latents, dim=1) + + +class DacDecoder(nn.Module): + """DAC Decoder""" + + def __init__(self, config: DacConfig): + super().__init__() + + input_channel = config.hidden_size + channels = config.decoder_hidden_size + strides = config.upsampling_ratios + + # Add first conv layer + self.conv1 = nn.Conv1d(input_channel, channels, kernel_size=7, padding=3) + + # Add upsampling + MRF blocks + block = [] + for stride_index, stride in enumerate(strides): + block += [DacDecoderBlock(config, stride, stride_index)] + + self.block = nn.ModuleList(block) + output_dim = config.decoder_hidden_size // 2 ** (stride_index + 1) + self.snake1 = Snake1d(output_dim) + self.conv2 = nn.Conv1d(output_dim, 1, kernel_size=7, padding=3) + self.tanh = nn.Tanh() + + def forward(self, hidden_state): + hidden_state = self.conv1(hidden_state) + + for layer in self.block: + hidden_state = layer(hidden_state) + + hidden_state = self.snake1(hidden_state) + hidden_state = self.conv2(hidden_state) + hidden_state = self.tanh(hidden_state) + + return hidden_state + + +class DacEncoder(nn.Module): + """DAC Encoder""" + + def __init__(self, config: DacConfig): + super().__init__() + + strides = config.downsampling_ratios + # Create first convolution + self.conv1 = nn.Conv1d(1, config.encoder_hidden_size, kernel_size=7, padding=3) + + self.block = [] + # Create EncoderBlocks that double channels as they downsample by `stride` + for stride_index, stride in enumerate(strides): + stride_index = stride_index + 1 + self.block += [DacEncoderBlock(config, stride=stride, stride_index=stride_index)] + + self.block = nn.ModuleList(self.block) + d_model = config.encoder_hidden_size * 2**stride_index + self.snake1 = Snake1d(d_model) + self.conv2 = nn.Conv1d(d_model, config.hidden_size, kernel_size=3, padding=1) + + def forward(self, hidden_state): + hidden_state = self.conv1(hidden_state) + + for module in self.block: + hidden_state = module(hidden_state) + + hidden_state = self.snake1(hidden_state) + hidden_state = self.conv2(hidden_state) + + return hidden_state + + +@auto_docstring +class DacPreTrainedModel(PreTrainedAudioTokenizerBase): + config: DacConfig + base_model_prefix = "dac" + main_input_name = "input_values" + + def _init_weights(self, module): + if isinstance(module, nn.Conv1d): + nn.init.trunc_normal_(module.weight, std=0.02) + nn.init.constant_(module.bias, 0) + elif isinstance(module, Snake1d): + module.alpha.data.fill_(1.0) + elif isinstance(module, nn.ConvTranspose1d): + module.reset_parameters() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, 
std=0.02) + + def apply_weight_norm(self): + weight_norm = nn.utils.weight_norm + if hasattr(nn.utils.parametrizations, "weight_norm"): + weight_norm = nn.utils.parametrizations.weight_norm + + for layer in self.quantizer.quantizers: + weight_norm(layer.in_proj) + weight_norm(layer.out_proj) + + weight_norm(self.encoder.conv1) + weight_norm(self.encoder.conv2) + + for layer in self.encoder.block: + weight_norm(layer.conv1) + weight_norm(layer.res_unit1.conv1) + weight_norm(layer.res_unit1.conv2) + weight_norm(layer.res_unit2.conv1) + weight_norm(layer.res_unit2.conv2) + weight_norm(layer.res_unit3.conv1) + weight_norm(layer.res_unit3.conv2) + + weight_norm(self.decoder.conv1) + weight_norm(self.decoder.conv2) + + for layer in self.decoder.block: + weight_norm(layer.conv_t1) + weight_norm(layer.res_unit1.conv1) + weight_norm(layer.res_unit1.conv2) + weight_norm(layer.res_unit2.conv1) + weight_norm(layer.res_unit2.conv2) + weight_norm(layer.res_unit3.conv1) + weight_norm(layer.res_unit3.conv2) + + def remove_weight_norm(self): + for layer in self.quantizer.quantizers: + nn.utils.remove_weight_norm(layer.in_proj) + nn.utils.remove_weight_norm(layer.out_proj) + + nn.utils.remove_weight_norm(self.encoder.conv1) + nn.utils.remove_weight_norm(self.encoder.conv2) + + for layer in self.encoder.block: + nn.utils.remove_weight_norm(layer.conv1) + nn.utils.remove_weight_norm(layer.res_unit1.conv1) + nn.utils.remove_weight_norm(layer.res_unit1.conv2) + nn.utils.remove_weight_norm(layer.res_unit2.conv1) + nn.utils.remove_weight_norm(layer.res_unit2.conv2) + nn.utils.remove_weight_norm(layer.res_unit3.conv1) + nn.utils.remove_weight_norm(layer.res_unit3.conv2) + + nn.utils.remove_weight_norm(self.decoder.conv1) + nn.utils.remove_weight_norm(self.decoder.conv2) + + for layer in self.decoder.block: + nn.utils.remove_weight_norm(layer.conv_t1) + nn.utils.remove_weight_norm(layer.res_unit1.conv1) + nn.utils.remove_weight_norm(layer.res_unit1.conv2) + nn.utils.remove_weight_norm(layer.res_unit2.conv1) + nn.utils.remove_weight_norm(layer.res_unit2.conv2) + nn.utils.remove_weight_norm(layer.res_unit3.conv1) + nn.utils.remove_weight_norm(layer.res_unit3.conv2) + + +@auto_docstring( + custom_intro=""" + The DAC (Descript Audio Codec) model. + """ +) +class DacModel(DacPreTrainedModel): + def __init__(self, config: DacConfig): + super().__init__(config) + self.config = config + + self.encoder = DacEncoder(config) + self.decoder = DacDecoder(config) + + self.quantizer = DacResidualVectorQuantize(config) + + self.bits_per_codebook = int(math.log2(self.config.codebook_size)) + if 2**self.bits_per_codebook != self.config.codebook_size: + raise ValueError("The codebook_size must be a power of 2.") + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def encode( + self, + input_values: torch.Tensor, + n_quantizers: Optional[int] = None, + return_dict: Optional[bool] = None, + ): + r""" + input_values (`torch.Tensor of shape `(batch_size, 1, time_steps)`): + Input audio data to encode, + n_quantizers (int, *optional*): + Number of quantizers to use. If None, all quantizers are used. Default is None. 
+ """ + return_dict = return_dict if return_dict is not None else self.config.return_dict + + quantized_representation = self.encoder(input_values) + quantized_representation, audio_codes, projected_latents, commitment_loss, codebook_loss = self.quantizer( + quantized_representation, n_quantizers + ) + + loss = self.config.commitment_loss_weight * commitment_loss + self.config.codebook_loss_weight * codebook_loss + + if not return_dict: + return (loss, quantized_representation, audio_codes, projected_latents) + + return DacEncoderOutput(loss, quantized_representation, audio_codes, projected_latents) + + @auto_docstring + def decode( + self, + quantized_representation: Optional[torch.Tensor] = None, + audio_codes: Optional[torch.Tensor] = None, + return_dict: Optional[bool] = None, + ): + r""" + quantized_representation (torch.Tensor of shape `(batch_size, dimension, time_steps)`, *optional*): + Quantized continuous representation of input. + audio_codes (`torch.Tensor` of shape `(batch_size, num_codebooks, time_steps)`, *optional*): + The codebook indices for each codebook, representing the quantized discrete + representation of the input. This parameter should be provided if you want + to decode directly from the audio codes (it will overwrite quantized_representation). + return_dict (`bool`, *optional*, defaults to `True`): + Whether to return a [`DacDecoderOutput`] instead of a plain tuple. + """ + + if quantized_representation is None and audio_codes is None: + raise ValueError("Either `quantized_representation` or `audio_codes` must be provided.") + + return_dict = return_dict if return_dict is not None else self.config.return_dict + + if audio_codes is not None: + quantized_representation = self.quantizer.from_codes(audio_codes)[0] + + audio_values = self.decoder(quantized_representation).squeeze(1) + + if not return_dict: + return (audio_values,) + + return DacDecoderOutput(audio_values) + + @auto_docstring + def forward( + self, + input_values: torch.Tensor, + n_quantizers: Optional[int] = None, + return_dict: Optional[bool] = None, + ): + r""" + input_values (`torch.Tensor` of shape `(batch_size, 1, time_steps)`): + Audio data to encode. + n_quantizers (`int`, *optional*): + Number of quantizers to use. If `None`, all quantizers are used. Default is `None`. 
+ + Examples: + + ```python + >>> from datasets import load_dataset, Audio + >>> from transformers import DacModel, AutoProcessor + >>> librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + + >>> model = DacModel.from_pretrained("descript/dac_16khz") + >>> processor = AutoProcessor.from_pretrained("descript/dac_16khz") + >>> librispeech_dummy = librispeech_dummy.cast_column("audio", Audio(sampling_rate=processor.sampling_rate)) + >>> audio_sample = librispeech_dummy[-1]["audio"]["array"] + >>> inputs = processor(raw_audio=audio_sample, sampling_rate=processor.sampling_rate, return_tensors="pt") + + >>> encoder_outputs = model.encode(inputs["input_values"]) + >>> # Get the intermediate audio codes + >>> audio_codes = encoder_outputs.audio_codes + >>> # Reconstruct the audio from its quantized representation + >>> audio_values = model.decode(encoder_outputs.quantized_representation) + >>> # or the equivalent with a forward pass + >>> audio_values = model(inputs["input_values"]).audio_values + ```""" + + return_dict = return_dict if return_dict is not None else self.config.return_dict + length = input_values.shape[-1] + + loss, quantized_representation, audio_codes, projected_latents = self.encode( + input_values, n_quantizers, return_dict=False + ) + audio_values = self.decode(quantized_representation, return_dict=False)[0][..., :length] + + if not return_dict: + return (loss, audio_values, quantized_representation, audio_codes, projected_latents) + + return DacOutput(loss, audio_values, quantized_representation, audio_codes, projected_latents) + + +__all__ = ["DacModel", "DacPreTrainedModel"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dbrx/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dbrx/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b2c3e0585fd8b61278b6db809c2c5007358644b8 Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dbrx/__pycache__/__init__.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dbrx/__pycache__/configuration_dbrx.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dbrx/__pycache__/configuration_dbrx.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a3f596f3f0b0f9cfbcada56bc6b3b29f9263101d Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dbrx/__pycache__/configuration_dbrx.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dbrx/__pycache__/modeling_dbrx.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dbrx/__pycache__/modeling_dbrx.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6068376e338156d9cb305c74763d789a8fef97b0 Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dbrx/__pycache__/modeling_dbrx.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deberta/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deberta/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f70972237964208b461eee8141a4f8b1377154f6 --- /dev/null +++ 
b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deberta/__init__.py @@ -0,0 +1,30 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_deberta import * + from .modeling_deberta import * + from .modeling_tf_deberta import * + from .tokenization_deberta import * + from .tokenization_deberta_fast import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deberta/configuration_deberta.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deberta/configuration_deberta.py new file mode 100644 index 0000000000000000000000000000000000000000..3e23a73a8c38a1b2c08a6392f8cd2a8c228799ac --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deberta/configuration_deberta.py @@ -0,0 +1,200 @@ +# coding=utf-8 +# Copyright 2020, Microsoft and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""DeBERTa model configuration""" + +from collections import OrderedDict +from collections.abc import Mapping +from typing import TYPE_CHECKING, Any, Optional, Union + +from ...configuration_utils import PretrainedConfig +from ...onnx import OnnxConfig +from ...utils import logging + + +if TYPE_CHECKING: + from ... import FeatureExtractionMixin, PreTrainedTokenizerBase, TensorType + + +logger = logging.get_logger(__name__) + + +class DebertaConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`DebertaModel`] or a [`TFDebertaModel`]. It is + used to instantiate a DeBERTa model according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar configuration to that of the DeBERTa + [microsoft/deberta-base](https://huggingface.co/microsoft/deberta-base) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Arguments: + vocab_size (`int`, *optional*, defaults to 50265): + Vocabulary size of the DeBERTa model. 
Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`DebertaModel`] or [`TFDebertaModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"`, `"gelu"`, `"tanh"`, `"gelu_fast"`, `"mish"`, `"linear"`, `"sigmoid"` and `"gelu_new"` + are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 0): + The vocabulary size of the `token_type_ids` passed when calling [`DebertaModel`] or [`TFDebertaModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + relative_attention (`bool`, *optional*, defaults to `False`): + Whether use relative position encoding. + max_relative_positions (`int`, *optional*, defaults to 1): + The range of relative positions `[-max_position_embeddings, max_position_embeddings]`. Use the same value + as `max_position_embeddings`. + pad_token_id (`int`, *optional*, defaults to 0): + The value used to pad input_ids. + position_biased_input (`bool`, *optional*, defaults to `True`): + Whether add absolute position embedding to content embedding. + pos_att_type (`list[str]`, *optional*): + The type of relative position attention, it can be a combination of `["p2c", "c2p"]`, e.g. `["p2c"]`, + `["p2c", "c2p"]`. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + legacy (`bool`, *optional*, defaults to `True`): + Whether or not the model should use the legacy `LegacyDebertaOnlyMLMHead`, which does not work properly + for mask infilling tasks. 
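For backwards compatibility, `pos_att_type` may also be given as a `|`-separated string; the `__init__` below lower-cases and splits it into the list form. A short sketch showing the two equivalent spellings:

```python
from transformers import DebertaConfig

# The list form and the legacy "|"-separated string form normalize to the same value.
config_a = DebertaConfig(relative_attention=True, pos_att_type=["c2p", "p2c"])
config_b = DebertaConfig(relative_attention=True, pos_att_type="c2p|p2c")

print(config_a.pos_att_type)  # ['c2p', 'p2c']
print(config_b.pos_att_type)  # ['c2p', 'p2c']
```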
+ + Example: + + ```python + >>> from transformers import DebertaConfig, DebertaModel + + >>> # Initializing a DeBERTa microsoft/deberta-base style configuration + >>> configuration = DebertaConfig() + + >>> # Initializing a model (with random weights) from the microsoft/deberta-base style configuration + >>> model = DebertaModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "deberta" + + def __init__( + self, + vocab_size=50265, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=0, + initializer_range=0.02, + layer_norm_eps=1e-7, + relative_attention=False, + max_relative_positions=-1, + pad_token_id=0, + position_biased_input=True, + pos_att_type=None, + pooler_dropout=0, + pooler_hidden_act="gelu", + legacy=True, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.relative_attention = relative_attention + self.max_relative_positions = max_relative_positions + self.pad_token_id = pad_token_id + self.position_biased_input = position_biased_input + + # Backwards compatibility + if isinstance(pos_att_type, str): + pos_att_type = [x.strip() for x in pos_att_type.lower().split("|")] + + self.pos_att_type = pos_att_type + self.vocab_size = vocab_size + self.layer_norm_eps = layer_norm_eps + + self.pooler_hidden_size = kwargs.get("pooler_hidden_size", hidden_size) + self.pooler_dropout = pooler_dropout + self.pooler_hidden_act = pooler_hidden_act + self.legacy = legacy + + +# Copied from transformers.models.deberta_v2.configuration_deberta_v2.DebertaV2OnnxConfig +class DebertaOnnxConfig(OnnxConfig): + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + if self.task == "multiple-choice": + dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"} + else: + dynamic_axis = {0: "batch", 1: "sequence"} + if self._config.type_vocab_size > 0: + return OrderedDict( + [("input_ids", dynamic_axis), ("attention_mask", dynamic_axis), ("token_type_ids", dynamic_axis)] + ) + else: + return OrderedDict([("input_ids", dynamic_axis), ("attention_mask", dynamic_axis)]) + + @property + def default_onnx_opset(self) -> int: + return 12 + + def generate_dummy_inputs( + self, + preprocessor: Union["PreTrainedTokenizerBase", "FeatureExtractionMixin"], + batch_size: int = -1, + seq_length: int = -1, + num_choices: int = -1, + is_pair: bool = False, + framework: Optional["TensorType"] = None, + num_channels: int = 3, + image_width: int = 40, + image_height: int = 40, + tokenizer: "PreTrainedTokenizerBase" = None, + ) -> Mapping[str, Any]: + dummy_inputs = super().generate_dummy_inputs(preprocessor=preprocessor, framework=framework) + if self._config.type_vocab_size == 0 and "token_type_ids" in dummy_inputs: + del dummy_inputs["token_type_ids"] + return dummy_inputs + + +__all__ = ["DebertaConfig", "DebertaOnnxConfig"] diff --git 
a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deberta/modeling_deberta.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deberta/modeling_deberta.py new file mode 100644 index 0000000000000000000000000000000000000000..461572b47677fbc097cac1a7e66d8d758db90108 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deberta/modeling_deberta.py @@ -0,0 +1,1204 @@ +# coding=utf-8 +# Copyright 2020 Microsoft and the Hugging Face Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch DeBERTa model.""" + +from typing import Optional, Union + +import torch +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ...activations import ACT2FN +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import ( + BaseModelOutput, + MaskedLMOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from ...modeling_utils import PreTrainedModel +from ...utils import auto_docstring, logging +from .configuration_deberta import DebertaConfig + + +logger = logging.get_logger(__name__) + + +class DebertaLayerNorm(nn.Module): + """LayerNorm module in the TF style (epsilon inside the square root).""" + + def __init__(self, size, eps=1e-12): + super().__init__() + self.weight = nn.Parameter(torch.ones(size)) + self.bias = nn.Parameter(torch.zeros(size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_type = hidden_states.dtype + hidden_states = hidden_states.float() + mean = hidden_states.mean(-1, keepdim=True) + variance = (hidden_states - mean).pow(2).mean(-1, keepdim=True) + hidden_states = (hidden_states - mean) / torch.sqrt(variance + self.variance_epsilon) + hidden_states = hidden_states.to(input_type) + y = self.weight * hidden_states + self.bias + return y + + +class DebertaSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = DebertaLayerNorm(config.hidden_size, config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +@torch.jit.script +def build_relative_position(query_layer, key_layer): + """ + Build relative position according to the query and key + + We assume the absolute position of query \\(P_q\\) is range from (0, query_size) and the absolute position of key + \\(P_k\\) is range from (0, key_size), The relative positions from query to key is \\(R_{q \\rightarrow k} = P_q - + P_k\\) + + Args: + query_size (int): the length of query + key_size (int): the length of key + + Return: + `torch.LongTensor`: A tensor with shape [1, query_size, key_size] + + """ + + query_size = query_layer.size(-2) + key_size = key_layer.size(-2) + + q_ids = 
torch.arange(query_size, dtype=torch.long, device=query_layer.device) + k_ids = torch.arange(key_size, dtype=torch.long, device=key_layer.device) + rel_pos_ids = q_ids[:, None] - k_ids.view(1, -1).repeat(query_size, 1) + rel_pos_ids = rel_pos_ids[:query_size, :] + rel_pos_ids = rel_pos_ids.unsqueeze(0) + return rel_pos_ids + + +@torch.jit.script +def c2p_dynamic_expand(c2p_pos, query_layer, relative_pos): + return c2p_pos.expand([query_layer.size(0), query_layer.size(1), query_layer.size(2), relative_pos.size(-1)]) + + +@torch.jit.script +def p2c_dynamic_expand(c2p_pos, query_layer, key_layer): + return c2p_pos.expand([query_layer.size(0), query_layer.size(1), key_layer.size(-2), key_layer.size(-2)]) + + +@torch.jit.script +def pos_dynamic_expand(pos_index, p2c_att, key_layer): + return pos_index.expand(p2c_att.size()[:2] + (pos_index.size(-2), key_layer.size(-2))) + + +###### To support a general trace, we have to define these operation as they use python objects (sizes) ################## +# which are not supported by torch.jit.trace. +# Full credits to @Szustarol +@torch.jit.script +def scaled_size_sqrt(query_layer: torch.Tensor, scale_factor: int): + return torch.sqrt(torch.tensor(query_layer.size(-1), dtype=torch.float) * scale_factor) + + +@torch.jit.script +def build_rpos(query_layer: torch.Tensor, key_layer: torch.Tensor, relative_pos): + if query_layer.size(-2) != key_layer.size(-2): + return build_relative_position(query_layer, key_layer) + else: + return relative_pos + + +@torch.jit.script +def compute_attention_span(query_layer: torch.Tensor, key_layer: torch.Tensor, max_relative_positions: int): + return torch.tensor(min(max(query_layer.size(-2), key_layer.size(-2)), max_relative_positions)) + + +@torch.jit.script +def uneven_size_corrected(p2c_att, query_layer: torch.Tensor, key_layer: torch.Tensor, relative_pos): + if query_layer.size(-2) != key_layer.size(-2): + pos_index = relative_pos[:, :, :, 0].unsqueeze(-1) + return torch.gather(p2c_att, dim=2, index=pos_dynamic_expand(pos_index, p2c_att, key_layer)) + else: + return p2c_att + + +######################################################################################################################## + + +class DisentangledSelfAttention(nn.Module): + """ + Disentangled self-attention module + + Parameters: + config (`str`): + A model config class instance with the configuration to build a new model. 
The schema is similar to + *BertConfig*, for more details, please refer [`DebertaConfig`] + + """ + + def __init__(self, config): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.in_proj = nn.Linear(config.hidden_size, self.all_head_size * 3, bias=False) + self.q_bias = nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float)) + self.v_bias = nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float)) + self.pos_att_type = config.pos_att_type if config.pos_att_type is not None else [] + + self.relative_attention = getattr(config, "relative_attention", False) + self.talking_head = getattr(config, "talking_head", False) + + if self.talking_head: + self.head_logits_proj = nn.Linear(config.num_attention_heads, config.num_attention_heads, bias=False) + self.head_weights_proj = nn.Linear(config.num_attention_heads, config.num_attention_heads, bias=False) + else: + self.head_logits_proj = None + self.head_weights_proj = None + + if self.relative_attention: + self.max_relative_positions = getattr(config, "max_relative_positions", -1) + if self.max_relative_positions < 1: + self.max_relative_positions = config.max_position_embeddings + self.pos_dropout = nn.Dropout(config.hidden_dropout_prob) + + if "c2p" in self.pos_att_type: + self.pos_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=False) + if "p2c" in self.pos_att_type: + self.pos_q_proj = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, -1) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + output_attentions: bool = False, + query_states: Optional[torch.Tensor] = None, + relative_pos: Optional[torch.Tensor] = None, + rel_embeddings: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + """ + Call the module + + Args: + hidden_states (`torch.FloatTensor`): + Input states to the module usually the output from previous layer, it will be the Q,K and V in + *Attention(Q,K,V)* + + attention_mask (`torch.BoolTensor`): + An attention mask matrix of shape [*B*, *N*, *N*] where *B* is the batch size, *N* is the maximum + sequence length in which element [i,j] = *1* means the *i* th token in the input can attend to the *j* + th token. + + output_attentions (`bool`, *optional*): + Whether return the attention matrix. + + query_states (`torch.FloatTensor`, *optional*): + The *Q* state in *Attention(Q,K,V)*. + + relative_pos (`torch.LongTensor`): + The relative position encoding between the tokens in the sequence. It's of shape [*B*, *N*, *N*] with + values ranging in [*-max_relative_positions*, *max_relative_positions*]. + + rel_embeddings (`torch.FloatTensor`): + The embedding of relative distances. It's a tensor of shape [\\(2 \\times + \\text{max_relative_positions}\\), *hidden_size*]. 
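The relative-position machinery is easiest to see on a tiny case: `build_relative_position` above produces the matrix of signed offsets `q - k`, and `disentangled_att_bias` below shifts and clamps those offsets into `[0, 2 * att_span - 1]` so they can index the relative embedding table. A sketch of that arithmetic for 4-token query and key layers:

```python
import torch

query_size, key_size, att_span = 4, 4, 4

q_ids = torch.arange(query_size)
k_ids = torch.arange(key_size)
rel_pos = (q_ids[:, None] - k_ids[None, :]).unsqueeze(0)  # shape (1, query_size, key_size)
print(rel_pos[0])
# tensor([[ 0, -1, -2, -3],
#         [ 1,  0, -1, -2],
#         [ 2,  1,  0, -1],
#         [ 3,  2,  1,  0]])

# content->position indices: shift by att_span and clamp into the embedding table range
c2p_pos = torch.clamp(rel_pos + att_span, 0, att_span * 2 - 1)
print(c2p_pos[0])
# tensor([[4, 3, 2, 1],
#         [5, 4, 3, 2],
#         [6, 5, 4, 3],
#         [7, 6, 5, 4]])
```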
+ + + """ + if query_states is None: + qp = self.in_proj(hidden_states) # .split(self.all_head_size, dim=-1) + query_layer, key_layer, value_layer = self.transpose_for_scores(qp).chunk(3, dim=-1) + else: + ws = self.in_proj.weight.chunk(self.num_attention_heads * 3, dim=0) + qkvw = [torch.cat([ws[i * 3 + k] for i in range(self.num_attention_heads)], dim=0) for k in range(3)] + q = torch.matmul(qkvw[0], query_states.t().to(dtype=qkvw[0].dtype)) + k = torch.matmul(qkvw[1], hidden_states.t().to(dtype=qkvw[1].dtype)) + v = torch.matmul(qkvw[2], hidden_states.t().to(dtype=qkvw[2].dtype)) + query_layer, key_layer, value_layer = [self.transpose_for_scores(x) for x in [q, k, v]] + + query_layer = query_layer + self.transpose_for_scores(self.q_bias[None, None, :]) + value_layer = value_layer + self.transpose_for_scores(self.v_bias[None, None, :]) + + rel_att: int = 0 + # Take the dot product between "query" and "key" to get the raw attention scores. + scale_factor = 1 + len(self.pos_att_type) + scale = scaled_size_sqrt(query_layer, scale_factor) + query_layer = query_layer / scale.to(dtype=query_layer.dtype) + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.relative_attention and rel_embeddings is not None and relative_pos is not None: + rel_embeddings = self.pos_dropout(rel_embeddings) + rel_att = self.disentangled_att_bias(query_layer, key_layer, relative_pos, rel_embeddings, scale_factor) + + if rel_att is not None: + attention_scores = attention_scores + rel_att + + # bxhxlxd + if self.head_logits_proj is not None: + attention_scores = self.head_logits_proj(attention_scores.permute(0, 2, 3, 1)).permute(0, 3, 1, 2) + + attention_mask = attention_mask.bool() + attention_scores = attention_scores.masked_fill(~(attention_mask), torch.finfo(query_layer.dtype).min) + # bsz x height x length x dimension + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + attention_probs = self.dropout(attention_probs) + if self.head_weights_proj is not None: + attention_probs = self.head_weights_proj(attention_probs.permute(0, 2, 3, 1)).permute(0, 3, 1, 2) + + context_layer = torch.matmul(attention_probs, value_layer) + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (-1,) + context_layer = context_layer.view(new_context_layer_shape) + if not output_attentions: + return (context_layer, None) + return (context_layer, attention_probs) + + def disentangled_att_bias( + self, + query_layer: torch.Tensor, + key_layer: torch.Tensor, + relative_pos: torch.Tensor, + rel_embeddings: torch.Tensor, + scale_factor: int, + ): + if relative_pos is None: + relative_pos = build_relative_position(query_layer, key_layer, query_layer.device) + if relative_pos.dim() == 2: + relative_pos = relative_pos.unsqueeze(0).unsqueeze(0) + elif relative_pos.dim() == 3: + relative_pos = relative_pos.unsqueeze(1) + # bxhxqxk + elif relative_pos.dim() != 4: + raise ValueError(f"Relative position ids must be of dim 2 or 3 or 4. 
{relative_pos.dim()}") + + att_span = compute_attention_span(query_layer, key_layer, self.max_relative_positions) + relative_pos = relative_pos.long() + rel_embeddings = rel_embeddings[ + self.max_relative_positions - att_span : self.max_relative_positions + att_span, : + ].unsqueeze(0) + + score = 0 + + # content->position + if "c2p" in self.pos_att_type: + pos_key_layer = self.pos_proj(rel_embeddings) + pos_key_layer = self.transpose_for_scores(pos_key_layer) + c2p_att = torch.matmul(query_layer, pos_key_layer.transpose(-1, -2)) + c2p_pos = torch.clamp(relative_pos + att_span, 0, att_span * 2 - 1) + c2p_att = torch.gather(c2p_att, dim=-1, index=c2p_dynamic_expand(c2p_pos, query_layer, relative_pos)) + score += c2p_att + + # position->content + if "p2c" in self.pos_att_type: + pos_query_layer = self.pos_q_proj(rel_embeddings) + pos_query_layer = self.transpose_for_scores(pos_query_layer) + pos_query_layer /= scaled_size_sqrt(pos_query_layer, scale_factor) + r_pos = build_rpos( + query_layer, + key_layer, + relative_pos, + ) + p2c_pos = torch.clamp(-r_pos + att_span, 0, att_span * 2 - 1) + p2c_att = torch.matmul(key_layer, pos_query_layer.transpose(-1, -2).to(dtype=key_layer.dtype)) + p2c_att = torch.gather( + p2c_att, dim=-1, index=p2c_dynamic_expand(p2c_pos, query_layer, key_layer) + ).transpose(-1, -2) + + p2c_att = uneven_size_corrected(p2c_att, query_layer, key_layer, relative_pos) + score += p2c_att + + return score + + +class DebertaEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + pad_token_id = getattr(config, "pad_token_id", 0) + self.embedding_size = getattr(config, "embedding_size", config.hidden_size) + self.word_embeddings = nn.Embedding(config.vocab_size, self.embedding_size, padding_idx=pad_token_id) + + self.position_biased_input = getattr(config, "position_biased_input", True) + if not self.position_biased_input: + self.position_embeddings = None + else: + self.position_embeddings = nn.Embedding(config.max_position_embeddings, self.embedding_size) + + if config.type_vocab_size > 0: + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, self.embedding_size) + else: + self.token_type_embeddings = None + + if self.embedding_size != config.hidden_size: + self.embed_proj = nn.Linear(self.embedding_size, config.hidden_size, bias=False) + else: + self.embed_proj = None + + self.LayerNorm = DebertaLayerNorm(config.hidden_size, config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.config = config + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) + + def forward(self, input_ids=None, token_type_ids=None, position_ids=None, mask=None, inputs_embeds=None): + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, :seq_length] + + if token_type_ids is None: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + + if self.position_embeddings is not None: + position_embeddings = self.position_embeddings(position_ids.long()) + else: + position_embeddings = torch.zeros_like(inputs_embeds) + + embeddings 
= inputs_embeds + if self.position_biased_input: + embeddings = embeddings + position_embeddings + if self.token_type_embeddings is not None: + token_type_embeddings = self.token_type_embeddings(token_type_ids) + embeddings = embeddings + token_type_embeddings + + if self.embed_proj is not None: + embeddings = self.embed_proj(embeddings) + + embeddings = self.LayerNorm(embeddings) + + if mask is not None: + if mask.dim() != embeddings.dim(): + if mask.dim() == 4: + mask = mask.squeeze(1).squeeze(1) + mask = mask.unsqueeze(2) + mask = mask.to(embeddings.dtype) + + embeddings = embeddings * mask + + embeddings = self.dropout(embeddings) + return embeddings + + +class DebertaAttention(nn.Module): + def __init__(self, config): + super().__init__() + self.self = DisentangledSelfAttention(config) + self.output = DebertaSelfOutput(config) + self.config = config + + def forward( + self, + hidden_states, + attention_mask, + output_attentions: bool = False, + query_states=None, + relative_pos=None, + rel_embeddings=None, + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + self_output, att_matrix = self.self( + hidden_states, + attention_mask, + output_attentions, + query_states=query_states, + relative_pos=relative_pos, + rel_embeddings=rel_embeddings, + ) + if query_states is None: + query_states = hidden_states + attention_output = self.output(self_output, query_states) + + if output_attentions: + return (attention_output, att_matrix) + else: + return (attention_output, None) + + +# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->Deberta +class DebertaIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class DebertaOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = DebertaLayerNorm(config.hidden_size, config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.config = config + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class DebertaLayer(GradientCheckpointingLayer): + def __init__(self, config): + super().__init__() + self.attention = DebertaAttention(config) + self.intermediate = DebertaIntermediate(config) + self.output = DebertaOutput(config) + + def forward( + self, + hidden_states, + attention_mask, + query_states=None, + relative_pos=None, + rel_embeddings=None, + output_attentions: bool = False, + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + attention_output, att_matrix = self.attention( + hidden_states, + attention_mask, + output_attentions=output_attentions, + query_states=query_states, + relative_pos=relative_pos, + rel_embeddings=rel_embeddings, + ) + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + + if output_attentions: + return (layer_output, att_matrix) + else: + return (layer_output, None) + + +class 
DebertaEncoder(nn.Module): + """Modified BertEncoder with relative position bias support""" + + def __init__(self, config): + super().__init__() + self.layer = nn.ModuleList([DebertaLayer(config) for _ in range(config.num_hidden_layers)]) + self.relative_attention = getattr(config, "relative_attention", False) + if self.relative_attention: + self.max_relative_positions = getattr(config, "max_relative_positions", -1) + if self.max_relative_positions < 1: + self.max_relative_positions = config.max_position_embeddings + self.rel_embeddings = nn.Embedding(self.max_relative_positions * 2, config.hidden_size) + self.gradient_checkpointing = False + + def get_rel_embedding(self): + rel_embeddings = self.rel_embeddings.weight if self.relative_attention else None + return rel_embeddings + + def get_attention_mask(self, attention_mask): + if attention_mask.dim() <= 2: + extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) + attention_mask = extended_attention_mask * extended_attention_mask.squeeze(-2).unsqueeze(-1) + elif attention_mask.dim() == 3: + attention_mask = attention_mask.unsqueeze(1) + + return attention_mask + + def get_rel_pos(self, hidden_states, query_states=None, relative_pos=None): + if self.relative_attention and relative_pos is None: + if query_states is not None: + relative_pos = build_relative_position(query_states, hidden_states) + else: + relative_pos = build_relative_position(hidden_states, hidden_states) + return relative_pos + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + output_hidden_states: bool = True, + output_attentions: bool = False, + query_states=None, + relative_pos=None, + return_dict: bool = True, + ): + attention_mask = self.get_attention_mask(attention_mask) + relative_pos = self.get_rel_pos(hidden_states, query_states, relative_pos) + + all_hidden_states: Optional[tuple[torch.Tensor]] = (hidden_states,) if output_hidden_states else None + all_attentions = () if output_attentions else None + + next_kv = hidden_states + + rel_embeddings = self.get_rel_embedding() + for i, layer_module in enumerate(self.layer): + hidden_states, att_m = layer_module( + next_kv, + attention_mask, + query_states=query_states, + relative_pos=relative_pos, + rel_embeddings=rel_embeddings, + output_attentions=output_attentions, + ) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if query_states is not None: + query_states = hidden_states + else: + next_kv = hidden_states + + if output_attentions: + all_attentions = all_attentions + (att_m,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions + ) + + +@auto_docstring +class DebertaPreTrainedModel(PreTrainedModel): + config: DebertaConfig + base_model_prefix = "deberta" + _keys_to_ignore_on_load_unexpected = ["position_embeddings"] + supports_gradient_checkpointing = True + + def _init_weights(self, module): + """Initialize the weights.""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) 
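+            # The padding row (if any) is zeroed out below so the PAD token keeps an
+            # all-zero embedding despite the normal_() initialization above.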
+ if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, (nn.LayerNorm, DebertaLayerNorm)): + module.weight.data.fill_(1.0) + module.bias.data.zero_() + elif isinstance(module, DisentangledSelfAttention): + module.q_bias.data.zero_() + module.v_bias.data.zero_() + elif isinstance(module, (LegacyDebertaLMPredictionHead, DebertaLMPredictionHead)): + module.bias.data.zero_() + + +@auto_docstring +class DebertaModel(DebertaPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.embeddings = DebertaEmbeddings(config) + self.encoder = DebertaEncoder(config) + self.z_steps = 0 + self.config = config + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, new_embeddings): + self.embeddings.word_embeddings = new_embeddings + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + raise NotImplementedError("The prune function is not implemented in DeBERTa model.") + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, BaseModelOutput]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + device = input_ids.device if input_ids is not None else inputs_embeds.device + + if attention_mask is None: + attention_mask = torch.ones(input_shape, device=device) + if token_type_ids is None: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + embedding_output = self.embeddings( + input_ids=input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + mask=attention_mask, + inputs_embeds=inputs_embeds, + ) + + encoder_outputs = self.encoder( + embedding_output, + attention_mask, + output_hidden_states=True, + output_attentions=output_attentions, + return_dict=return_dict, + ) + encoded_layers = encoder_outputs[1] + + if self.z_steps > 1: + hidden_states = encoded_layers[-2] + layers = [self.encoder.layer[-1] for _ in range(self.z_steps)] + query_states = encoded_layers[-1] + rel_embeddings = self.encoder.get_rel_embedding() + attention_mask = self.encoder.get_attention_mask(attention_mask) + rel_pos = self.encoder.get_rel_pos(embedding_output) + for layer in layers[1:]: + query_states = layer( + hidden_states, + attention_mask, + output_attentions=False, + 
query_states=query_states, + relative_pos=rel_pos, + rel_embeddings=rel_embeddings, + ) + encoded_layers.append(query_states) + + sequence_output = encoded_layers[-1] + + if not return_dict: + return (sequence_output,) + encoder_outputs[(1 if output_hidden_states else 2) :] + + return BaseModelOutput( + last_hidden_state=sequence_output, + hidden_states=encoder_outputs.hidden_states if output_hidden_states else None, + attentions=encoder_outputs.attentions, + ) + + +class LegacyDebertaPredictionHeadTransform(nn.Module): + def __init__(self, config): + super().__init__() + self.embedding_size = getattr(config, "embedding_size", config.hidden_size) + + self.dense = nn.Linear(config.hidden_size, self.embedding_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm(self.embedding_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class LegacyDebertaLMPredictionHead(nn.Module): + def __init__(self, config): + super().__init__() + self.transform = LegacyDebertaPredictionHeadTransform(config) + + self.embedding_size = getattr(config, "embedding_size", config.hidden_size) + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = nn.Linear(self.embedding_size, config.vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def _tie_weights(self): + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertOnlyMLMHead with Bert->LegacyDeberta +class LegacyDebertaOnlyMLMHead(nn.Module): + def __init__(self, config): + super().__init__() + self.predictions = LegacyDebertaLMPredictionHead(config) + + def forward(self, sequence_output: torch.Tensor) -> torch.Tensor: + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class DebertaLMPredictionHead(nn.Module): + """https://github.com/microsoft/DeBERTa/blob/master/DeBERTa/deberta/bert.py#L270""" + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps, elementwise_affine=True) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # note that the input embeddings must be passed as an argument + def forward(self, hidden_states, word_embeddings): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm( + hidden_states + ) # original used MaskedLayerNorm, but passed no mask. This is equivalent. 
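+        # Project the transformed hidden states onto the transposed word-embedding matrix,
+        # so the output logits share weights with the input embeddings; `self.bias` adds a
+        # vocab-sized output bias on top.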
+ hidden_states = torch.matmul(hidden_states, word_embeddings.weight.t()) + self.bias + return hidden_states + + +class DebertaOnlyMLMHead(nn.Module): + def __init__(self, config): + super().__init__() + self.lm_head = DebertaLMPredictionHead(config) + + # note that the input embeddings must be passed as an argument + def forward(self, sequence_output, word_embeddings): + prediction_scores = self.lm_head(sequence_output, word_embeddings) + return prediction_scores + + +@auto_docstring +class DebertaForMaskedLM(DebertaPreTrainedModel): + _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"] + + def __init__(self, config): + super().__init__(config) + self.legacy = config.legacy + self.deberta = DebertaModel(config) + if self.legacy: + self.cls = LegacyDebertaOnlyMLMHead(config) + else: + self._tied_weights_keys = ["lm_predictions.lm_head.weight", "deberta.embeddings.word_embeddings.weight"] + self.lm_predictions = DebertaOnlyMLMHead(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + if self.legacy: + return self.cls.predictions.decoder + else: + return self.lm_predictions.lm_head.dense + + def set_output_embeddings(self, new_embeddings): + if self.legacy: + self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = new_embeddings.bias + else: + self.lm_predictions.lm_head.dense = new_embeddings + self.lm_predictions.lm_head.bias = new_embeddings.bias + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, MaskedLMOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. 
Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.deberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + if self.legacy: + prediction_scores = self.cls(sequence_output) + else: + prediction_scores = self.lm_predictions(sequence_output, self.deberta.embeddings.word_embeddings) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (prediction_scores,) + outputs[1:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class ContextPooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.pooler_hidden_size, config.pooler_hidden_size) + self.dropout = nn.Dropout(config.pooler_dropout) + self.config = config + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + + context_token = hidden_states[:, 0] + context_token = self.dropout(context_token) + pooled_output = self.dense(context_token) + pooled_output = ACT2FN[self.config.pooler_hidden_act](pooled_output) + return pooled_output + + @property + def output_dim(self): + return self.config.hidden_size + + +@auto_docstring( + custom_intro=""" + DeBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the + pooled output) e.g. for GLUE tasks. 
+ """ +) +class DebertaForSequenceClassification(DebertaPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + num_labels = getattr(config, "num_labels", 2) + self.num_labels = num_labels + + self.deberta = DebertaModel(config) + self.pooler = ContextPooler(config) + output_dim = self.pooler.output_dim + + self.classifier = nn.Linear(output_dim, num_labels) + drop_out = getattr(config, "cls_dropout", None) + drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out + self.dropout = nn.Dropout(drop_out) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.deberta.get_input_embeddings() + + def set_input_embeddings(self, new_embeddings): + self.deberta.set_input_embeddings(new_embeddings) + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, SequenceClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.deberta( + input_ids, + token_type_ids=token_type_ids, + attention_mask=attention_mask, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + encoder_layer = outputs[0] + pooled_output = self.pooler(encoder_layer) + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + # regression task + loss_fn = nn.MSELoss() + logits = logits.view(-1).to(labels.dtype) + loss = loss_fn(logits, labels.view(-1)) + elif labels.dim() == 1 or labels.size(-1) == 1: + label_index = (labels >= 0).nonzero() + labels = labels.long() + if label_index.size(0) > 0: + labeled_logits = torch.gather( + logits, 0, label_index.expand(label_index.size(0), logits.size(1)) + ) + labels = torch.gather(labels, 0, label_index.view(-1)) + loss_fct = CrossEntropyLoss() + loss = loss_fct(labeled_logits.view(-1, self.num_labels).float(), labels.view(-1)) + else: + loss = torch.tensor(0).to(logits) + else: + log_softmax = nn.LogSoftmax(-1) + loss = -((log_softmax(logits) * labels).sum(-1)).mean() + elif self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + if not return_dict: + output = (logits,) + 
outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions + ) + + +@auto_docstring +class DebertaForTokenClassification(DebertaPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.deberta = DebertaModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, TokenClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.deberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions + ) + + +@auto_docstring +class DebertaForQuestionAnswering(DebertaPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.deberta = DebertaModel(config) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + start_positions: Optional[torch.Tensor] = None, + end_positions: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, QuestionAnsweringModelOutput]: + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.deberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = 
self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[1:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +__all__ = [ + "DebertaForMaskedLM", + "DebertaForQuestionAnswering", + "DebertaForSequenceClassification", + "DebertaForTokenClassification", + "DebertaModel", + "DebertaPreTrainedModel", +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deberta/modeling_tf_deberta.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deberta/modeling_tf_deberta.py new file mode 100644 index 0000000000000000000000000000000000000000..40d23fc28b9475623dc8a94d5539c5a530b3f376 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deberta/modeling_tf_deberta.py @@ -0,0 +1,1652 @@ +# coding=utf-8 +# Copyright 2021 Microsoft and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
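+# Illustrative sketch (toy values, not part of the upstream sources): the disentangled
+# attention below relies on the `build_relative_position` helper defined in this file,
+# which produces rel[i][j] = i - j; those indices are then shifted by the attention span
+# and clamped into the 2 * att_span rows of the sliced relative-embedding table, e.g.:
+#
+#     att_span = 2  # toy value; the model uses min(max(query_len, key_len), max_relative_positions)
+#     rel = [[i - j for j in range(3)] for i in range(3)]
+#     # rel == [[0, -1, -2], [1, 0, -1], [2, 1, 0]]
+#     c2p = [[min(max(r + att_span, 0), 2 * att_span - 1) for r in row] for row in rel]
+#     # c2p == [[2, 1, 0], [3, 2, 1], [3, 3, 2]]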
+"""TF 2.0 DeBERTa model.""" + +from __future__ import annotations + +import math +from collections.abc import Sequence + +import numpy as np +import tensorflow as tf + +from ...activations_tf import get_tf_activation +from ...modeling_tf_outputs import ( + TFBaseModelOutput, + TFMaskedLMOutput, + TFQuestionAnsweringModelOutput, + TFSequenceClassifierOutput, + TFTokenClassifierOutput, +) +from ...modeling_tf_utils import ( + TFMaskedLanguageModelingLoss, + TFModelInputType, + TFPreTrainedModel, + TFQuestionAnsweringLoss, + TFSequenceClassificationLoss, + TFTokenClassificationLoss, + get_initializer, + keras, + unpack_inputs, +) +from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax +from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging +from .configuration_deberta import DebertaConfig + + +logger = logging.get_logger(__name__) + + +_CONFIG_FOR_DOC = "DebertaConfig" +_CHECKPOINT_FOR_DOC = "kamalkraj/deberta-base" + + +class TFDebertaContextPooler(keras.layers.Layer): + def __init__(self, config: DebertaConfig, **kwargs): + super().__init__(**kwargs) + self.dense = keras.layers.Dense(config.pooler_hidden_size, name="dense") + self.dropout = TFDebertaStableDropout(config.pooler_dropout, name="dropout") + self.config = config + + def call(self, hidden_states, training: bool = False): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + context_token = hidden_states[:, 0] + context_token = self.dropout(context_token, training=training) + pooled_output = self.dense(context_token) + pooled_output = get_tf_activation(self.config.pooler_hidden_act)(pooled_output) + return pooled_output + + @property + def output_dim(self) -> int: + return self.config.hidden_size + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.pooler_hidden_size]) + if getattr(self, "dropout", None) is not None: + with tf.name_scope(self.dropout.name): + self.dropout.build(None) + + +class TFDebertaXSoftmax(keras.layers.Layer): + """ + Masked Softmax which is optimized for saving memory + + Args: + input (`tf.Tensor`): The input tensor that will apply softmax. + mask (`tf.Tensor`): The mask matrix where 0 indicate that element will be ignored in the softmax calculation. + dim (int): The dimension that will apply softmax + """ + + def __init__(self, axis=-1, **kwargs): + super().__init__(**kwargs) + self.axis = axis + + def call(self, inputs: tf.Tensor, mask: tf.Tensor): + rmask = tf.logical_not(tf.cast(mask, tf.bool)) + output = tf.where(rmask, tf.cast(float("-inf"), dtype=self.compute_dtype), inputs) + output = stable_softmax(tf.cast(output, dtype=tf.float32), self.axis) + output = tf.where(rmask, 0.0, output) + return output + + +class TFDebertaStableDropout(keras.layers.Layer): + """ + Optimized dropout module for stabilizing the training + + Args: + drop_prob (float): the dropout probabilities + """ + + def __init__(self, drop_prob, **kwargs): + super().__init__(**kwargs) + self.drop_prob = drop_prob + + @tf.custom_gradient + def xdropout(self, inputs): + """ + Applies dropout to the inputs, as vanilla dropout, but also scales the remaining elements up by 1/drop_prob. 
+ """ + mask = tf.cast( + 1 + - tf.compat.v1.distributions.Bernoulli(probs=1.0 - self.drop_prob).sample(sample_shape=shape_list(inputs)), + tf.bool, + ) + scale = tf.convert_to_tensor(1.0 / (1 - self.drop_prob), dtype=self.compute_dtype) + if self.drop_prob > 0: + inputs = tf.where(mask, tf.cast(0.0, dtype=self.compute_dtype), inputs) * scale + + def grad(upstream): + if self.drop_prob > 0: + return tf.where(mask, tf.cast(0.0, dtype=self.compute_dtype), upstream) * scale + else: + return upstream + + return inputs, grad + + def call(self, inputs: tf.Tensor, training: tf.Tensor = False): + if training: + return self.xdropout(inputs) + return inputs + + +class TFDebertaLayerNorm(keras.layers.Layer): + """LayerNorm module in the TF style (epsilon inside the square root).""" + + def __init__(self, size, eps=1e-12, **kwargs): + super().__init__(**kwargs) + self.size = size + self.eps = eps + + def build(self, input_shape): + self.gamma = self.add_weight(shape=[self.size], initializer=tf.ones_initializer(), name="weight") + self.beta = self.add_weight(shape=[self.size], initializer=tf.zeros_initializer(), name="bias") + return super().build(input_shape) + + def call(self, x: tf.Tensor) -> tf.Tensor: + mean = tf.reduce_mean(x, axis=[-1], keepdims=True) + variance = tf.reduce_mean(tf.square(x - mean), axis=[-1], keepdims=True) + std = tf.math.sqrt(variance + self.eps) + return self.gamma * (x - mean) / std + self.beta + + +class TFDebertaSelfOutput(keras.layers.Layer): + def __init__(self, config: DebertaConfig, **kwargs): + super().__init__(**kwargs) + self.dense = keras.layers.Dense(config.hidden_size, name="dense") + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = TFDebertaStableDropout(config.hidden_dropout_prob, name="dropout") + self.config = config + + def call(self, hidden_states, input_tensor, training: bool = False): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states, training=training) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) + if getattr(self, "dropout", None) is not None: + with tf.name_scope(self.dropout.name): + self.dropout.build(None) + + +class TFDebertaAttention(keras.layers.Layer): + def __init__(self, config: DebertaConfig, **kwargs): + super().__init__(**kwargs) + self.self = TFDebertaDisentangledSelfAttention(config, name="self") + self.dense_output = TFDebertaSelfOutput(config, name="output") + self.config = config + + def call( + self, + input_tensor: tf.Tensor, + attention_mask: tf.Tensor, + query_states: tf.Tensor | None = None, + relative_pos: tf.Tensor | None = None, + rel_embeddings: tf.Tensor | None = None, + output_attentions: bool = False, + training: bool = False, + ) -> tuple[tf.Tensor]: + self_outputs = self.self( + hidden_states=input_tensor, + attention_mask=attention_mask, + query_states=query_states, + relative_pos=relative_pos, + rel_embeddings=rel_embeddings, + output_attentions=output_attentions, + training=training, + ) + if query_states is None: + query_states = input_tensor + attention_output = self.dense_output( + 
hidden_states=self_outputs[0], input_tensor=query_states, training=training + ) + + output = (attention_output,) + self_outputs[1:] + + return output + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self", None) is not None: + with tf.name_scope(self.self.name): + self.self.build(None) + if getattr(self, "dense_output", None) is not None: + with tf.name_scope(self.dense_output.name): + self.dense_output.build(None) + + +class TFDebertaIntermediate(keras.layers.Layer): + def __init__(self, config: DebertaConfig, **kwargs): + super().__init__(**kwargs) + + self.dense = keras.layers.Dense( + units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = get_tf_activation(config.hidden_act) + else: + self.intermediate_act_fn = config.hidden_act + self.config = config + + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + + +class TFDebertaOutput(keras.layers.Layer): + def __init__(self, config: DebertaConfig, **kwargs): + super().__init__(**kwargs) + + self.dense = keras.layers.Dense( + units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = TFDebertaStableDropout(config.hidden_dropout_prob, name="dropout") + self.config = config + + def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.dropout(hidden_states, training=training) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.intermediate_size]) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) + if getattr(self, "dropout", None) is not None: + with tf.name_scope(self.dropout.name): + self.dropout.build(None) + + +class TFDebertaLayer(keras.layers.Layer): + def __init__(self, config: DebertaConfig, **kwargs): + super().__init__(**kwargs) + + self.attention = TFDebertaAttention(config, name="attention") + self.intermediate = TFDebertaIntermediate(config, name="intermediate") + self.bert_output = TFDebertaOutput(config, name="output") + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + query_states: tf.Tensor | None = None, + relative_pos: tf.Tensor | None = None, + rel_embeddings: tf.Tensor | None = None, + output_attentions: bool = False, + training: bool = False, + ) -> tuple[tf.Tensor]: + attention_outputs = self.attention( + input_tensor=hidden_states, + attention_mask=attention_mask, + query_states=query_states, + relative_pos=relative_pos, + rel_embeddings=rel_embeddings, + output_attentions=output_attentions, + training=training, + ) + 
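+        # attention_outputs is (attention_output,) plus the attention probabilities when
+        # output_attentions=True; only the hidden states are fed to the feed-forward block.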
attention_output = attention_outputs[0] + intermediate_output = self.intermediate(hidden_states=attention_output) + layer_output = self.bert_output( + hidden_states=intermediate_output, input_tensor=attention_output, training=training + ) + outputs = (layer_output,) + attention_outputs[1:] # add attentions if we output them + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "attention", None) is not None: + with tf.name_scope(self.attention.name): + self.attention.build(None) + if getattr(self, "intermediate", None) is not None: + with tf.name_scope(self.intermediate.name): + self.intermediate.build(None) + if getattr(self, "bert_output", None) is not None: + with tf.name_scope(self.bert_output.name): + self.bert_output.build(None) + + +class TFDebertaEncoder(keras.layers.Layer): + def __init__(self, config: DebertaConfig, **kwargs): + super().__init__(**kwargs) + + self.layer = [TFDebertaLayer(config, name=f"layer_._{i}") for i in range(config.num_hidden_layers)] + self.relative_attention = getattr(config, "relative_attention", False) + self.config = config + if self.relative_attention: + self.max_relative_positions = getattr(config, "max_relative_positions", -1) + if self.max_relative_positions < 1: + self.max_relative_positions = config.max_position_embeddings + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if self.relative_attention: + self.rel_embeddings = self.add_weight( + name="rel_embeddings.weight", + shape=[self.max_relative_positions * 2, self.config.hidden_size], + initializer=get_initializer(self.config.initializer_range), + ) + if getattr(self, "layer", None) is not None: + for layer in self.layer: + with tf.name_scope(layer.name): + layer.build(None) + + def get_rel_embedding(self): + rel_embeddings = self.rel_embeddings if self.relative_attention else None + return rel_embeddings + + def get_attention_mask(self, attention_mask): + if len(shape_list(attention_mask)) <= 2: + extended_attention_mask = tf.expand_dims(tf.expand_dims(attention_mask, 1), 2) + attention_mask = extended_attention_mask * tf.expand_dims(tf.squeeze(extended_attention_mask, -2), -1) + attention_mask = tf.cast(attention_mask, tf.uint8) + elif len(shape_list(attention_mask)) == 3: + attention_mask = tf.expand_dims(attention_mask, 1) + + return attention_mask + + def get_rel_pos(self, hidden_states, query_states=None, relative_pos=None): + if self.relative_attention and relative_pos is None: + q = shape_list(query_states)[-2] if query_states is not None else shape_list(hidden_states)[-2] + relative_pos = build_relative_position(q, shape_list(hidden_states)[-2]) + return relative_pos + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + query_states: tf.Tensor | None = None, + relative_pos: tf.Tensor | None = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + training: bool = False, + ) -> TFBaseModelOutput | tuple[tf.Tensor]: + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + attention_mask = self.get_attention_mask(attention_mask) + relative_pos = self.get_rel_pos(hidden_states, query_states, relative_pos) + + if isinstance(hidden_states, Sequence): + next_kv = hidden_states[0] + else: + next_kv = hidden_states + + rel_embeddings = self.get_rel_embedding() + + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + 
all_hidden_states = all_hidden_states + (hidden_states,) + + layer_outputs = layer_module( + hidden_states=next_kv, + attention_mask=attention_mask, + query_states=query_states, + relative_pos=relative_pos, + rel_embeddings=rel_embeddings, + output_attentions=output_attentions, + training=training, + ) + hidden_states = layer_outputs[0] + + if query_states is not None: + query_states = hidden_states + if isinstance(hidden_states, Sequence): + next_kv = hidden_states[i + 1] if i + 1 < len(self.layer) else None + else: + next_kv = hidden_states + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + # Add last layer + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None) + + return TFBaseModelOutput( + last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions + ) + + +def build_relative_position(query_size, key_size): + """ + Build relative position according to the query and key + + We assume the absolute position of query \\(P_q\\) is range from (0, query_size) and the absolute position of key + \\(P_k\\) is range from (0, key_size), The relative positions from query to key is \\(R_{q \\rightarrow k} = P_q - + P_k\\) + + Args: + query_size (int): the length of query + key_size (int): the length of key + + Return: + `tf.Tensor`: A tensor with shape [1, query_size, key_size] + + """ + q_ids = tf.range(query_size, dtype=tf.int32) + k_ids = tf.range(key_size, dtype=tf.int32) + rel_pos_ids = q_ids[:, None] - tf.tile(tf.reshape(k_ids, [1, -1]), [query_size, 1]) + rel_pos_ids = rel_pos_ids[:query_size, :] + rel_pos_ids = tf.expand_dims(rel_pos_ids, axis=0) + return tf.cast(rel_pos_ids, tf.int64) + + +def c2p_dynamic_expand(c2p_pos, query_layer, relative_pos): + shapes = [ + shape_list(query_layer)[0], + shape_list(query_layer)[1], + shape_list(query_layer)[2], + shape_list(relative_pos)[-1], + ] + return tf.broadcast_to(c2p_pos, shapes) + + +def p2c_dynamic_expand(c2p_pos, query_layer, key_layer): + shapes = [ + shape_list(query_layer)[0], + shape_list(query_layer)[1], + shape_list(key_layer)[-2], + shape_list(key_layer)[-2], + ] + return tf.broadcast_to(c2p_pos, shapes) + + +def pos_dynamic_expand(pos_index, p2c_att, key_layer): + shapes = shape_list(p2c_att)[:2] + [shape_list(pos_index)[-2], shape_list(key_layer)[-2]] + return tf.broadcast_to(pos_index, shapes) + + +def torch_gather(x, indices, gather_axis): + if gather_axis < 0: + gather_axis = tf.rank(x) + gather_axis + + if gather_axis != tf.rank(x) - 1: + pre_roll = tf.rank(x) - 1 - gather_axis + permutation = tf.roll(tf.range(tf.rank(x)), pre_roll, axis=0) + x = tf.transpose(x, perm=permutation) + indices = tf.transpose(indices, perm=permutation) + else: + pre_roll = 0 + + flat_x = tf.reshape(x, (-1, tf.shape(x)[-1])) + flat_indices = tf.reshape(indices, (-1, tf.shape(indices)[-1])) + gathered = tf.gather(flat_x, flat_indices, batch_dims=1) + gathered = tf.reshape(gathered, tf.shape(indices)) + + if pre_roll != 0: + permutation = tf.roll(tf.range(tf.rank(x)), -pre_roll, axis=0) + gathered = tf.transpose(gathered, perm=permutation) + + return gathered + + +class TFDebertaDisentangledSelfAttention(keras.layers.Layer): + """ + Disentangled self-attention module + + Parameters: + config (`str`): + A model config class instance with the configuration to build a new model. 
The schema is similar to + *BertConfig*, for more details, please refer [`DebertaConfig`] + + """ + + def __init__(self, config: DebertaConfig, **kwargs): + super().__init__(**kwargs) + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.in_proj = keras.layers.Dense( + self.all_head_size * 3, + kernel_initializer=get_initializer(config.initializer_range), + name="in_proj", + use_bias=False, + ) + self.pos_att_type = config.pos_att_type if config.pos_att_type is not None else [] + + self.relative_attention = getattr(config, "relative_attention", False) + self.talking_head = getattr(config, "talking_head", False) + + if self.talking_head: + self.head_logits_proj = keras.layers.Dense( + self.num_attention_heads, + kernel_initializer=get_initializer(config.initializer_range), + name="head_logits_proj", + use_bias=False, + ) + self.head_weights_proj = keras.layers.Dense( + self.num_attention_heads, + kernel_initializer=get_initializer(config.initializer_range), + name="head_weights_proj", + use_bias=False, + ) + + self.softmax = TFDebertaXSoftmax(axis=-1) + + if self.relative_attention: + self.max_relative_positions = getattr(config, "max_relative_positions", -1) + if self.max_relative_positions < 1: + self.max_relative_positions = config.max_position_embeddings + self.pos_dropout = TFDebertaStableDropout(config.hidden_dropout_prob, name="pos_dropout") + if "c2p" in self.pos_att_type: + self.pos_proj = keras.layers.Dense( + self.all_head_size, + kernel_initializer=get_initializer(config.initializer_range), + name="pos_proj", + use_bias=False, + ) + if "p2c" in self.pos_att_type: + self.pos_q_proj = keras.layers.Dense( + self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="pos_q_proj" + ) + + self.dropout = TFDebertaStableDropout(config.attention_probs_dropout_prob, name="dropout") + self.config = config + + def build(self, input_shape=None): + if self.built: + return + self.built = True + self.q_bias = self.add_weight( + name="q_bias", shape=(self.all_head_size), initializer=keras.initializers.Zeros() + ) + self.v_bias = self.add_weight( + name="v_bias", shape=(self.all_head_size), initializer=keras.initializers.Zeros() + ) + if getattr(self, "in_proj", None) is not None: + with tf.name_scope(self.in_proj.name): + self.in_proj.build([None, None, self.config.hidden_size]) + if getattr(self, "dropout", None) is not None: + with tf.name_scope(self.dropout.name): + self.dropout.build(None) + if getattr(self, "head_logits_proj", None) is not None: + with tf.name_scope(self.head_logits_proj.name): + self.head_logits_proj.build(None) + if getattr(self, "head_weights_proj", None) is not None: + with tf.name_scope(self.head_weights_proj.name): + self.head_weights_proj.build(None) + if getattr(self, "pos_dropout", None) is not None: + with tf.name_scope(self.pos_dropout.name): + self.pos_dropout.build(None) + if getattr(self, "pos_proj", None) is not None: + with tf.name_scope(self.pos_proj.name): + self.pos_proj.build([self.config.hidden_size]) + if getattr(self, "pos_q_proj", None) is not None: + with tf.name_scope(self.pos_q_proj.name): + self.pos_q_proj.build([self.config.hidden_size]) + + 
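+    # `in_proj` packs Q, K and V into a single bias-free [hidden, 3 * all_head_size]
+    # projection; `transpose_for_scores` below moves the head axis forward so that,
+    # combined with tf.split, the packed tensor becomes query/key/value of shape
+    # [batch, num_heads, seq, head_size] (e.g. hidden_size=768, 12 heads:
+    # [batch, seq, 2304] -> 3 x [batch, 12, seq, 64]). The separate q_bias/v_bias
+    # weights are added afterwards in call().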
def transpose_for_scores(self, tensor: tf.Tensor) -> tf.Tensor: + shape = shape_list(tensor)[:-1] + [self.num_attention_heads, -1] + # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] + tensor = tf.reshape(tensor=tensor, shape=shape) + + # Transpose the tensor from [batch_size, seq_length, num_attention_heads, attention_head_size] to [batch_size, num_attention_heads, seq_length, attention_head_size] + return tf.transpose(tensor, perm=[0, 2, 1, 3]) + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + query_states: tf.Tensor | None = None, + relative_pos: tf.Tensor | None = None, + rel_embeddings: tf.Tensor | None = None, + output_attentions: bool = False, + training: bool = False, + ) -> tuple[tf.Tensor]: + """ + Call the module + + Args: + hidden_states (`tf.Tensor`): + Input states to the module usually the output from previous layer, it will be the Q,K and V in + *Attention(Q,K,V)* + + attention_mask (`tf.Tensor`): + An attention mask matrix of shape [*B*, *N*, *N*] where *B* is the batch size, *N* is the maximum + sequence length in which element [i,j] = *1* means the *i* th token in the input can attend to the *j* + th token. + + return_att (`bool`, *optional*): + Whether return the attention matrix. + + query_states (`tf.Tensor`, *optional*): + The *Q* state in *Attention(Q,K,V)*. + + relative_pos (`tf.Tensor`): + The relative position encoding between the tokens in the sequence. It's of shape [*B*, *N*, *N*] with + values ranging in [*-max_relative_positions*, *max_relative_positions*]. + + rel_embeddings (`tf.Tensor`): + The embedding of relative distances. It's a tensor of shape [\\(2 \\times + \\text{max_relative_positions}\\), *hidden_size*]. + + + """ + if query_states is None: + qp = self.in_proj(hidden_states) # .split(self.all_head_size, dim=-1) + query_layer, key_layer, value_layer = tf.split( + self.transpose_for_scores(qp), num_or_size_splits=3, axis=-1 + ) + else: + + def linear(w, b, x): + out = tf.matmul(x, w, transpose_b=True) + if b is not None: + out += tf.transpose(b) + return out + + ws = tf.split( + tf.transpose(self.in_proj.weight[0]), num_or_size_splits=self.num_attention_heads * 3, axis=0 + ) + qkvw = tf.TensorArray(dtype=self.dtype, size=3) + for k in tf.range(3): + qkvw_inside = tf.TensorArray(dtype=self.dtype, size=self.num_attention_heads) + for i in tf.range(self.num_attention_heads): + qkvw_inside = qkvw_inside.write(i, ws[i * 3 + k]) + qkvw = qkvw.write(k, qkvw_inside.concat()) + qkvb = [None] * 3 + + q = linear(qkvw[0], qkvb[0], query_states) + k = linear(qkvw[1], qkvb[1], hidden_states) + v = linear(qkvw[2], qkvb[2], hidden_states) + query_layer = self.transpose_for_scores(q) + key_layer = self.transpose_for_scores(k) + value_layer = self.transpose_for_scores(v) + + query_layer = query_layer + self.transpose_for_scores(self.q_bias[None, None, :]) + value_layer = value_layer + self.transpose_for_scores(self.v_bias[None, None, :]) + + rel_att = None + # Take the dot product between "query" and "key" to get the raw attention scores. 
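+        # query_layer is divided by sqrt(head_size * scale_factor) before the matmul;
+        # scale_factor counts the content-to-content term plus one per enabled disentangled
+        # term ("c2p", "p2c"), e.g. with both enabled and head_size 64 the divisor is
+        # sqrt(64 * 3) ≈ 13.86.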
+ scale_factor = 1 + len(self.pos_att_type) + scale = math.sqrt(shape_list(query_layer)[-1] * scale_factor) + query_layer = query_layer / scale + + attention_scores = tf.matmul(query_layer, tf.transpose(key_layer, [0, 1, 3, 2])) + if self.relative_attention: + rel_embeddings = self.pos_dropout(rel_embeddings, training=training) + rel_att = self.disentangled_att_bias(query_layer, key_layer, relative_pos, rel_embeddings, scale_factor) + + if rel_att is not None: + attention_scores = attention_scores + rel_att + + if self.talking_head: + attention_scores = tf.transpose( + self.head_logits_proj(tf.transpose(attention_scores, [0, 2, 3, 1])), [0, 3, 1, 2] + ) + + attention_probs = self.softmax(attention_scores, attention_mask) + attention_probs = self.dropout(attention_probs, training=training) + if self.talking_head: + attention_probs = tf.transpose( + self.head_weights_proj(tf.transpose(attention_probs, [0, 2, 3, 1])), [0, 3, 1, 2] + ) + + context_layer = tf.matmul(attention_probs, value_layer) + context_layer = tf.transpose(context_layer, [0, 2, 1, 3]) + context_layer_shape = shape_list(context_layer) + # Set the final dimension here explicitly. + # Calling tf.reshape(context_layer, (*context_layer_shape[:-2], -1)) raises an error when executing + # the model in graph mode as context_layer is reshaped to (None, 7, None) and Dense layer in TFDebertaV2SelfOutput + # requires final input dimension to be defined + new_context_layer_shape = context_layer_shape[:-2] + [context_layer_shape[-2] * context_layer_shape[-1]] + context_layer = tf.reshape(context_layer, new_context_layer_shape) + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + return outputs + + def disentangled_att_bias(self, query_layer, key_layer, relative_pos, rel_embeddings, scale_factor): + if relative_pos is None: + q = shape_list(query_layer)[-2] + relative_pos = build_relative_position(q, shape_list(key_layer)[-2]) + shape_list_pos = shape_list(relative_pos) + if len(shape_list_pos) == 2: + relative_pos = tf.expand_dims(tf.expand_dims(relative_pos, 0), 0) + elif len(shape_list_pos) == 3: + relative_pos = tf.expand_dims(relative_pos, 1) + # bxhxqxk + elif len(shape_list_pos) != 4: + raise ValueError(f"Relative position ids must be of dim 2 or 3 or 4. 
{len(shape_list_pos)}") + + att_span = tf.cast( + tf.minimum( + tf.maximum(shape_list(query_layer)[-2], shape_list(key_layer)[-2]), self.max_relative_positions + ), + tf.int64, + ) + rel_embeddings = tf.expand_dims( + rel_embeddings[self.max_relative_positions - att_span : self.max_relative_positions + att_span, :], 0 + ) + + score = 0 + + # content->position + if "c2p" in self.pos_att_type: + pos_key_layer = self.pos_proj(rel_embeddings) + pos_key_layer = self.transpose_for_scores(pos_key_layer) + c2p_att = tf.matmul(query_layer, tf.transpose(pos_key_layer, [0, 1, 3, 2])) + c2p_pos = tf.clip_by_value(relative_pos + att_span, 0, att_span * 2 - 1) + c2p_att = torch_gather(c2p_att, c2p_dynamic_expand(c2p_pos, query_layer, relative_pos), -1) + score += c2p_att + + # position->content + if "p2c" in self.pos_att_type: + pos_query_layer = self.pos_q_proj(rel_embeddings) + pos_query_layer = self.transpose_for_scores(pos_query_layer) + pos_query_layer /= tf.math.sqrt( + tf.cast(shape_list(pos_query_layer)[-1] * scale_factor, dtype=self.compute_dtype) + ) + if shape_list(query_layer)[-2] != shape_list(key_layer)[-2]: + r_pos = build_relative_position(shape_list(key_layer)[-2], shape_list(key_layer)[-2]) + else: + r_pos = relative_pos + p2c_pos = tf.clip_by_value(-r_pos + att_span, 0, att_span * 2 - 1) + p2c_att = tf.matmul(key_layer, tf.transpose(pos_query_layer, [0, 1, 3, 2])) + p2c_att = tf.transpose( + torch_gather(p2c_att, p2c_dynamic_expand(p2c_pos, query_layer, key_layer), -1), [0, 1, 3, 2] + ) + if shape_list(query_layer)[-2] != shape_list(key_layer)[-2]: + pos_index = tf.expand_dims(relative_pos[:, :, :, 0], -1) + p2c_att = torch_gather(p2c_att, pos_dynamic_expand(pos_index, p2c_att, key_layer), -2) + score += p2c_att + + return score + + +class TFDebertaEmbeddings(keras.layers.Layer): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + + self.config = config + self.embedding_size = getattr(config, "embedding_size", config.hidden_size) + self.hidden_size = config.hidden_size + self.max_position_embeddings = config.max_position_embeddings + self.position_biased_input = getattr(config, "position_biased_input", True) + self.initializer_range = config.initializer_range + if self.embedding_size != config.hidden_size: + self.embed_proj = keras.layers.Dense( + config.hidden_size, + kernel_initializer=get_initializer(config.initializer_range), + name="embed_proj", + use_bias=False, + ) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = TFDebertaStableDropout(config.hidden_dropout_prob, name="dropout") + + def build(self, input_shape=None): + with tf.name_scope("word_embeddings"): + self.weight = self.add_weight( + name="weight", + shape=[self.config.vocab_size, self.embedding_size], + initializer=get_initializer(self.initializer_range), + ) + + with tf.name_scope("token_type_embeddings"): + if self.config.type_vocab_size > 0: + self.token_type_embeddings = self.add_weight( + name="embeddings", + shape=[self.config.type_vocab_size, self.embedding_size], + initializer=get_initializer(self.initializer_range), + ) + else: + self.token_type_embeddings = None + + with tf.name_scope("position_embeddings"): + if self.position_biased_input: + self.position_embeddings = self.add_weight( + name="embeddings", + shape=[self.max_position_embeddings, self.hidden_size], + initializer=get_initializer(self.initializer_range), + ) + else: + 
self.position_embeddings = None + + if self.built: + return + self.built = True + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) + if getattr(self, "dropout", None) is not None: + with tf.name_scope(self.dropout.name): + self.dropout.build(None) + if getattr(self, "embed_proj", None) is not None: + with tf.name_scope(self.embed_proj.name): + self.embed_proj.build([None, None, self.embedding_size]) + + def call( + self, + input_ids: tf.Tensor | None = None, + position_ids: tf.Tensor | None = None, + token_type_ids: tf.Tensor | None = None, + inputs_embeds: tf.Tensor | None = None, + mask: tf.Tensor | None = None, + training: bool = False, + ) -> tf.Tensor: + """ + Applies embedding based on inputs tensor. + + Returns: + final_embeddings (`tf.Tensor`): output embedding tensor. + """ + if input_ids is None and inputs_embeds is None: + raise ValueError("Need to provide either `input_ids` or `input_embeds`.") + + if input_ids is not None: + check_embeddings_within_bounds(input_ids, self.config.vocab_size) + inputs_embeds = tf.gather(params=self.weight, indices=input_ids) + + input_shape = shape_list(inputs_embeds)[:-1] + + if token_type_ids is None: + token_type_ids = tf.fill(dims=input_shape, value=0) + + if position_ids is None: + position_ids = tf.expand_dims(tf.range(start=0, limit=input_shape[-1]), axis=0) + + final_embeddings = inputs_embeds + if self.position_biased_input: + position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids) + final_embeddings += position_embeds + if self.config.type_vocab_size > 0: + token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids) + final_embeddings += token_type_embeds + + if self.embedding_size != self.hidden_size: + final_embeddings = self.embed_proj(final_embeddings) + + final_embeddings = self.LayerNorm(final_embeddings) + + if mask is not None: + if len(shape_list(mask)) != len(shape_list(final_embeddings)): + if len(shape_list(mask)) == 4: + mask = tf.squeeze(tf.squeeze(mask, axis=1), axis=1) + mask = tf.cast(tf.expand_dims(mask, axis=2), dtype=self.compute_dtype) + + final_embeddings = final_embeddings * mask + + final_embeddings = self.dropout(final_embeddings, training=training) + + return final_embeddings + + +class TFDebertaPredictionHeadTransform(keras.layers.Layer): + def __init__(self, config: DebertaConfig, **kwargs): + super().__init__(**kwargs) + + self.embedding_size = getattr(config, "embedding_size", config.hidden_size) + + self.dense = keras.layers.Dense( + units=self.embedding_size, + kernel_initializer=get_initializer(config.initializer_range), + name="dense", + ) + + if isinstance(config.hidden_act, str): + self.transform_act_fn = get_tf_activation(config.hidden_act) + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.config = config + + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + if getattr(self, "LayerNorm", None) is not None: + with 
tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.embedding_size]) + + +class TFDebertaLMPredictionHead(keras.layers.Layer): + def __init__(self, config: DebertaConfig, input_embeddings: keras.layers.Layer, **kwargs): + super().__init__(**kwargs) + + self.config = config + self.embedding_size = getattr(config, "embedding_size", config.hidden_size) + + self.transform = TFDebertaPredictionHeadTransform(config, name="transform") + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.input_embeddings = input_embeddings + + def build(self, input_shape=None): + self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias") + + if self.built: + return + self.built = True + if getattr(self, "transform", None) is not None: + with tf.name_scope(self.transform.name): + self.transform.build(None) + + def get_output_embeddings(self) -> keras.layers.Layer: + return self.input_embeddings + + def set_output_embeddings(self, value: tf.Variable): + self.input_embeddings.weight = value + self.input_embeddings.vocab_size = shape_list(value)[0] + + def get_bias(self) -> dict[str, tf.Variable]: + return {"bias": self.bias} + + def set_bias(self, value: tf.Variable): + self.bias = value["bias"] + self.config.vocab_size = shape_list(value["bias"])[0] + + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + hidden_states = self.transform(hidden_states=hidden_states) + seq_length = shape_list(hidden_states)[1] + hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.embedding_size]) + hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True) + hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size]) + hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias) + + return hidden_states + + +class TFDebertaOnlyMLMHead(keras.layers.Layer): + def __init__(self, config: DebertaConfig, input_embeddings: keras.layers.Layer, **kwargs): + super().__init__(**kwargs) + self.predictions = TFDebertaLMPredictionHead(config, input_embeddings, name="predictions") + + def call(self, sequence_output: tf.Tensor) -> tf.Tensor: + prediction_scores = self.predictions(hidden_states=sequence_output) + + return prediction_scores + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "predictions", None) is not None: + with tf.name_scope(self.predictions.name): + self.predictions.build(None) + + +# @keras_serializable +class TFDebertaMainLayer(keras.layers.Layer): + config_class = DebertaConfig + + def __init__(self, config: DebertaConfig, **kwargs): + super().__init__(**kwargs) + + self.config = config + + self.embeddings = TFDebertaEmbeddings(config, name="embeddings") + self.encoder = TFDebertaEncoder(config, name="encoder") + + def get_input_embeddings(self) -> keras.layers.Layer: + return self.embeddings + + def set_input_embeddings(self, value: tf.Variable): + self.embeddings.weight = value + self.embeddings.vocab_size = shape_list(value)[0] + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + raise NotImplementedError + + @unpack_inputs + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool = False, + ) -> TFBaseModelOutput | tuple[tf.Tensor]: + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = shape_list(input_ids) + elif inputs_embeds is not None: + input_shape = shape_list(inputs_embeds)[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if attention_mask is None: + attention_mask = tf.fill(dims=input_shape, value=1) + + if token_type_ids is None: + token_type_ids = tf.fill(dims=input_shape, value=0) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + mask=attention_mask, + training=training, + ) + + encoder_outputs = self.encoder( + hidden_states=embedding_output, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + sequence_output = encoder_outputs[0] + + if not return_dict: + return (sequence_output,) + encoder_outputs[1:] + + return TFBaseModelOutput( + last_hidden_state=sequence_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embeddings", None) is not None: + with tf.name_scope(self.embeddings.name): + self.embeddings.build(None) + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + + +class TFDebertaPreTrainedModel(TFPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = DebertaConfig + base_model_prefix = "deberta" + + +DEBERTA_START_DOCSTRING = r""" + The DeBERTa model was proposed in [DeBERTa: Decoding-enhanced BERT with Disentangled + Attention](https://huggingface.co/papers/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's build + on top of BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two + improvements, it out perform BERT/RoBERTa on a majority of tasks with 80GB pretraining data. + + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and + behavior. + + + + TensorFlow models and layers in `transformers` accept two formats as input: + + - having all inputs as keyword arguments (like PyTorch models), or + - having all inputs as a list, tuple or dict in the first positional argument. 
+ + The reason the second format is supported is that Keras methods prefer this format when passing inputs to models + and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just + pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second + format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with + the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first + positional argument: + + - a single Tensor with `input_ids` only and nothing else: `model(input_ids)` + - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: + `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])` + - a dictionary with one or several input Tensors associated to the input names given in the docstring: + `model({"input_ids": input_ids, "token_type_ids": token_type_ids})` + + Note that when creating models and layers with + [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry + about any of this, as you can just pass inputs like you would to any other Python function! + + + + Parameters: + config ([`DebertaConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +DEBERTA_INPUTS_DOCSTRING = r""" + Args: + input_ids (`np.ndarray`, `tf.Tensor`, `list[tf.Tensor]` ``dict[str, tf.Tensor]` or `dict[str, np.ndarray]` and each example must have the shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + token_type_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + inputs_embeds (`np.ndarray` or `tf.Tensor` of shape `({0}, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert *input_ids* indices into associated vectors than the + model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. 
+ output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput``] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare DeBERTa Model transformer outputting raw hidden-states without any specific head on top.", + DEBERTA_START_DOCSTRING, +) +class TFDebertaModel(TFDebertaPreTrainedModel): + def __init__(self, config: DebertaConfig, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.deberta = TFDebertaMainLayer(config, name="deberta") + + @unpack_inputs + @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFBaseModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool | None = False, + ) -> TFBaseModelOutput | tuple[tf.Tensor]: + outputs = self.deberta( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "deberta", None) is not None: + with tf.name_scope(self.deberta.name): + self.deberta.build(None) + + +@add_start_docstrings("""DeBERTa Model with a `language modeling` head on top.""", DEBERTA_START_DOCSTRING) +class TFDebertaForMaskedLM(TFDebertaPreTrainedModel, TFMaskedLanguageModelingLoss): + def __init__(self, config: DebertaConfig, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + if config.is_decoder: + logger.warning( + "If you want to use `TFDebertaForMaskedLM` make sure `config.is_decoder=False` for " + "bi-directional self-attention." 
+ ) + + self.deberta = TFDebertaMainLayer(config, name="deberta") + self.mlm = TFDebertaOnlyMLMHead(config, input_embeddings=self.deberta.embeddings, name="cls") + + def get_lm_head(self) -> keras.layers.Layer: + return self.mlm.predictions + + @unpack_inputs + @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFMaskedLMOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + labels: np.ndarray | tf.Tensor | None = None, + training: bool | None = False, + ) -> TFMaskedLMOutput | tuple[tf.Tensor]: + r""" + labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + """ + outputs = self.deberta( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + sequence_output = outputs[0] + prediction_scores = self.mlm(sequence_output=sequence_output, training=training) + loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=prediction_scores) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TFMaskedLMOutput( + loss=loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "deberta", None) is not None: + with tf.name_scope(self.deberta.name): + self.deberta.build(None) + if getattr(self, "mlm", None) is not None: + with tf.name_scope(self.mlm.name): + self.mlm.build(None) + + +@add_start_docstrings( + """ + DeBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the + pooled output) e.g. for GLUE tasks. 
+ """, + DEBERTA_START_DOCSTRING, +) +class TFDebertaForSequenceClassification(TFDebertaPreTrainedModel, TFSequenceClassificationLoss): + def __init__(self, config: DebertaConfig, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.num_labels = config.num_labels + + self.deberta = TFDebertaMainLayer(config, name="deberta") + self.pooler = TFDebertaContextPooler(config, name="pooler") + + drop_out = getattr(config, "cls_dropout", None) + drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out + self.dropout = TFDebertaStableDropout(drop_out, name="cls_dropout") + self.classifier = keras.layers.Dense( + units=config.num_labels, + kernel_initializer=get_initializer(config.initializer_range), + name="classifier", + ) + self.output_dim = self.pooler.output_dim + + @unpack_inputs + @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFSequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + labels: np.ndarray | tf.Tensor | None = None, + training: bool | None = False, + ) -> TFSequenceClassifierOutput | tuple[tf.Tensor]: + r""" + labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + outputs = self.deberta( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + sequence_output = outputs[0] + pooled_output = self.pooler(sequence_output, training=training) + pooled_output = self.dropout(pooled_output, training=training) + logits = self.classifier(pooled_output) + loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits) + + if not return_dict: + output = (logits,) + outputs[1:] + + return ((loss,) + output) if loss is not None else output + + return TFSequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "deberta", None) is not None: + with tf.name_scope(self.deberta.name): + self.deberta.build(None) + if getattr(self, "pooler", None) is not None: + with tf.name_scope(self.pooler.name): + self.pooler.build(None) + if getattr(self, "dropout", None) is not None: + with tf.name_scope(self.dropout.name): + self.dropout.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build([None, None, self.output_dim]) + + +@add_start_docstrings( + """ + DeBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for + Named-Entity-Recognition (NER) tasks. + """, + DEBERTA_START_DOCSTRING, +) +class TFDebertaForTokenClassification(TFDebertaPreTrainedModel, TFTokenClassificationLoss): + def __init__(self, config: DebertaConfig, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.num_labels = config.num_labels + + self.deberta = TFDebertaMainLayer(config, name="deberta") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.classifier = keras.layers.Dense( + units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) + self.config = config + + @unpack_inputs + @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFTokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + labels: np.ndarray | tf.Tensor | None = None, + training: bool | None = False, + ) -> TFTokenClassifierOutput | tuple[tf.Tensor]: + r""" + labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. 
+ """ + outputs = self.deberta( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + sequence_output = outputs[0] + sequence_output = self.dropout(sequence_output, training=training) + logits = self.classifier(inputs=sequence_output) + loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return TFTokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "deberta", None) is not None: + with tf.name_scope(self.deberta.name): + self.deberta.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build([None, None, self.config.hidden_size]) + + +@add_start_docstrings( + """ + DeBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layers on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + DEBERTA_START_DOCSTRING, +) +class TFDebertaForQuestionAnswering(TFDebertaPreTrainedModel, TFQuestionAnsweringLoss): + def __init__(self, config: DebertaConfig, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.num_labels = config.num_labels + + self.deberta = TFDebertaMainLayer(config, name="deberta") + self.qa_outputs = keras.layers.Dense( + units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" + ) + self.config = config + + @unpack_inputs + @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFQuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + start_positions: np.ndarray | tf.Tensor | None = None, + end_positions: np.ndarray | tf.Tensor | None = None, + training: bool | None = False, + ) -> TFQuestionAnsweringModelOutput | tuple[tf.Tensor]: + r""" + start_positions (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. 
+ """ + outputs = self.deberta( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + sequence_output = outputs[0] + logits = self.qa_outputs(inputs=sequence_output) + start_logits, end_logits = tf.split(value=logits, num_or_size_splits=2, axis=-1) + start_logits = tf.squeeze(input=start_logits, axis=-1) + end_logits = tf.squeeze(input=end_logits, axis=-1) + loss = None + + if start_positions is not None and end_positions is not None: + labels = {"start_position": start_positions} + labels["end_position"] = end_positions + loss = self.hf_compute_loss(labels=labels, logits=(start_logits, end_logits)) + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TFQuestionAnsweringModelOutput( + loss=loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "deberta", None) is not None: + with tf.name_scope(self.deberta.name): + self.deberta.build(None) + if getattr(self, "qa_outputs", None) is not None: + with tf.name_scope(self.qa_outputs.name): + self.qa_outputs.build([None, None, self.config.hidden_size]) + + +__all__ = [ + "TFDebertaForMaskedLM", + "TFDebertaForQuestionAnswering", + "TFDebertaForSequenceClassification", + "TFDebertaForTokenClassification", + "TFDebertaModel", + "TFDebertaPreTrainedModel", +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deberta/tokenization_deberta.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deberta/tokenization_deberta.py new file mode 100644 index 0000000000000000000000000000000000000000..74e958c8030b4f535bc1448f4abb8f119ef5f926 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deberta/tokenization_deberta.py @@ -0,0 +1,366 @@ +# coding=utf-8 +# Copyright 2020 Microsoft and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization class for model DeBERTa.""" + +import json +import os +from typing import Optional + +import regex as re + +from ...tokenization_utils import AddedToken, PreTrainedTokenizer +from ...utils import logging + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt"} + + +# Copied from transformers.models.gpt2.tokenization_gpt2.bytes_to_unicode +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control + characters the bpe code barfs on. + + The reversible bpe codes work on unicode strings. 
This means you need a large # of unicode characters in your vocab + if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for + decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup + tables between utf-8 bytes and unicode strings. + """ + bs = ( + list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) + ) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + +# Copied from transformers.models.gpt2.tokenization_gpt2.get_pairs +def get_pairs(word): + """ + Return set of symbol pairs in a word. + + Word is represented as tuple of symbols (symbols being variable-length strings). + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +class DebertaTokenizer(PreTrainedTokenizer): + """ + Construct a DeBERTa tokenizer. Based on byte-level Byte-Pair-Encoding. + + This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will + be encoded differently whether it is at the beginning of the sentence (without space) or not: + + ```python + >>> from transformers import DebertaTokenizer + + >>> tokenizer = DebertaTokenizer.from_pretrained("microsoft/deberta-base") + >>> tokenizer("Hello world")["input_ids"] + [1, 31414, 232, 2] + + >>> tokenizer(" Hello world")["input_ids"] + [1, 20920, 232, 2] + ``` + + You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you + call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance. + + + + When used with `is_split_into_words=True`, this tokenizer will add a space before each word (even the first one). + + + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + Path to the vocabulary file. + merges_file (`str`): + Path to the merges file. + errors (`str`, *optional*, defaults to `"replace"`): + Paradigm to follow when decoding bytes to UTF-8. See + [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information. + bos_token (`str`, *optional*, defaults to `"[CLS]"`): + The beginning of sequence token. + eos_token (`str`, *optional*, defaults to `"[SEP]"`): + The end of sequence token. + sep_token (`str`, *optional*, defaults to `"[SEP]"`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + cls_token (`str`, *optional*, defaults to `"[CLS]"`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + unk_token (`str`, *optional*, defaults to `"[UNK]"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. 
+ pad_token (`str`, *optional*, defaults to `"[PAD]"`): + The token used for padding, for example when batching sequences of different lengths. + mask_token (`str`, *optional*, defaults to `"[MASK]"`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + add_prefix_space (`bool`, *optional*, defaults to `False`): + Whether or not to add an initial space to the input. This allows to treat the leading word just as any + other word. (Deberta tokenizer detect beginning of words by the preceding space). + add_bos_token (`bool`, *optional*, defaults to `False`): + Whether or not to add an initial <|endoftext|> to the input. This allows to treat the leading word just as + any other word. + """ + + vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "attention_mask", "token_type_ids"] + + def __init__( + self, + vocab_file, + merges_file, + errors="replace", + bos_token="[CLS]", + eos_token="[SEP]", + sep_token="[SEP]", + cls_token="[CLS]", + unk_token="[UNK]", + pad_token="[PAD]", + mask_token="[MASK]", + add_prefix_space=False, + add_bos_token=False, + **kwargs, + ): + bos_token = AddedToken(bos_token, special=True) if isinstance(bos_token, str) else bos_token + eos_token = AddedToken(eos_token, special=True) if isinstance(eos_token, str) else eos_token + sep_token = AddedToken(sep_token, special=True) if isinstance(sep_token, str) else sep_token + cls_token = AddedToken(cls_token, special=True) if isinstance(cls_token, str) else cls_token + unk_token = AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token + pad_token = AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token + + # Mask token behave like a normal word, i.e. 
include the space before it + mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + self.add_bos_token = add_bos_token + + with open(vocab_file, encoding="utf-8") as vocab_handle: + self.encoder = json.load(vocab_handle) + self.decoder = {v: k for k, v in self.encoder.items()} + self.errors = errors # how to handle errors in decoding + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} + with open(merges_file, encoding="utf-8") as merges_handle: + bpe_merges = merges_handle.read().split("\n")[1:-1] + bpe_merges = [tuple(merge.split()) for merge in bpe_merges] + self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) + self.cache = {} + self.add_prefix_space = add_prefix_space + + # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions + self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") + + super().__init__( + errors=errors, + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + mask_token=mask_token, + add_prefix_space=add_prefix_space, + add_bos_token=add_bos_token, + **kwargs, + ) + + @property + # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.vocab_size + def vocab_size(self): + return len(self.encoder) + + # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.get_vocab + def get_vocab(self): + return dict(self.encoder, **self.added_tokens_encoder) + + # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.bpe + def bpe(self, token): + if token in self.cache: + return self.cache[token] + word = tuple(token) + pairs = get_pairs(word) + + if not pairs: + return token + + while True: + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + except ValueError: + new_word.extend(word[i:]) + break + else: + new_word.extend(word[i:j]) + i = j + + if word[i] == first and i < len(word) - 1 and word[i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = " ".join(word) + self.cache[token] = word + return word + + def build_inputs_with_special_tokens( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A DeBERTa sequence has the following format: + + - single sequence: [CLS] X [SEP] + - pair of sequences: [CLS] A [SEP] B [SEP] + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. 
+ """ + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + token_ids_1 + sep + + def get_special_tokens_mask( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False + ) -> list[int]: + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + + # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._tokenize + def _tokenize(self, text): + """Tokenize a string.""" + bpe_tokens = [] + for token in re.findall(self.pat, text): + token = "".join( + self.byte_encoder[b] for b in token.encode("utf-8") + ) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case) + bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" ")) + return bpe_tokens + + # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._convert_token_to_id + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.encoder.get(token, self.encoder.get(self.unk_token)) + + # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._convert_id_to_token + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.decoder.get(index) + + # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.convert_tokens_to_string + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + text = "".join(tokens) + text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors) + return text + + # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.save_vocabulary + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + merge_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"] + ) + + with open(vocab_file, "w", encoding="utf-8") as f: + f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n") + + index = 0 + with open(merge_file, "w", encoding="utf-8") as writer: + writer.write("#version: 0.2\n") + for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: 
kv[1]): + if index != token_index: + logger.warning( + f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive." + " Please check that the tokenizer is not corrupted!" + ) + index = token_index + writer.write(" ".join(bpe_tokens) + "\n") + index += 1 + + return vocab_file, merge_file + + def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs): + add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space) + if (is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()): + text = " " + text + return (text, kwargs) + + +__all__ = ["DebertaTokenizer"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deberta/tokenization_deberta_fast.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deberta/tokenization_deberta_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..c2f2e6552d9dfc8c5d3d48f3c396f816f82158bc --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deberta/tokenization_deberta_fast.py @@ -0,0 +1,209 @@ +# coding=utf-8 +# Copyright 2020 Microsoft and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Fast Tokenization class for model DeBERTa.""" + +from typing import Optional + +from ...tokenization_utils_base import AddedToken, BatchEncoding +from ...tokenization_utils_fast import PreTrainedTokenizerFast +from ...utils import logging +from .tokenization_deberta import DebertaTokenizer + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"} + + +class DebertaTokenizerFast(PreTrainedTokenizerFast): + """ + Construct a "fast" DeBERTa tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level + Byte-Pair-Encoding. + + This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will + be encoded differently whether it is at the beginning of the sentence (without space) or not: + + ```python + >>> from transformers import DebertaTokenizerFast + + >>> tokenizer = DebertaTokenizerFast.from_pretrained("microsoft/deberta-base") + >>> tokenizer("Hello world")["input_ids"] + [1, 31414, 232, 2] + + >>> tokenizer(" Hello world")["input_ids"] + [1, 20920, 232, 2] + ``` + + You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer, but since + the model was not pretrained this way, it might yield a decrease in performance. + + + + When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`. + + + + This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should + refer to this superclass for more information regarding those methods. + + Args: + vocab_file (`str`, *optional*): + Path to the vocabulary file. + merges_file (`str`, *optional*): + Path to the merges file. 
+ tokenizer_file (`str`, *optional*): + The path to a tokenizer file to use instead of the vocab file. + errors (`str`, *optional*, defaults to `"replace"`): + Paradigm to follow when decoding bytes to UTF-8. See + [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information. + bos_token (`str`, *optional*, defaults to `"[CLS]"`): + The beginning of sequence token. + eos_token (`str`, *optional*, defaults to `"[SEP]"`): + The end of sequence token. + sep_token (`str`, *optional*, defaults to `"[SEP]"`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + cls_token (`str`, *optional*, defaults to `"[CLS]"`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + unk_token (`str`, *optional*, defaults to `"[UNK]"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (`str`, *optional*, defaults to `"[PAD]"`): + The token used for padding, for example when batching sequences of different lengths. + mask_token (`str`, *optional*, defaults to `"[MASK]"`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + add_prefix_space (`bool`, *optional*, defaults to `False`): + Whether or not to add an initial space to the input. This allows to treat the leading word just as any + other word. (Deberta tokenizer detect beginning of words by the preceding space). + """ + + vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "attention_mask", "token_type_ids"] + slow_tokenizer_class = DebertaTokenizer + + def __init__( + self, + vocab_file=None, + merges_file=None, + tokenizer_file=None, + errors="replace", + bos_token="[CLS]", + eos_token="[SEP]", + sep_token="[SEP]", + cls_token="[CLS]", + unk_token="[UNK]", + pad_token="[PAD]", + mask_token="[MASK]", + add_prefix_space=False, + **kwargs, + ): + super().__init__( + vocab_file, + merges_file, + tokenizer_file=tokenizer_file, + errors=errors, + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + mask_token=mask_token, + add_prefix_space=add_prefix_space, + **kwargs, + ) + self.add_bos_token = kwargs.pop("add_bos_token", False) + + @property + def mask_token(self) -> str: + """ + `str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while not + having been set. + + Deberta tokenizer has a special mask token to be used in the fill-mask pipeline. The mask token will greedily + comprise the space before the *[MASK]*. + """ + if self._mask_token is None: + if self.verbose: + logger.error("Using mask_token, but it is not set yet.") + return None + return str(self._mask_token) + + @mask_token.setter + def mask_token(self, value): + """ + Overriding the default behavior of the mask token to have it eat the space before it. + """ + # Mask token behave like a normal word, i.e. 
include the space before it + # So we set lstrip to True + value = AddedToken(value, lstrip=True, rstrip=False) if isinstance(value, str) else value + self._mask_token = value + + def build_inputs_with_special_tokens( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A DeBERTa sequence has the following format: + + - single sequence: [CLS] X [SEP] + - pair of sequences: [CLS] A [SEP] B [SEP] + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + token_ids_1 + sep + + # Copied from transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast._batch_encode_plus + def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding: + is_split_into_words = kwargs.get("is_split_into_words", False) + assert self.add_prefix_space or not is_split_into_words, ( + f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True " + "to use it with pretokenized inputs." + ) + + return super()._batch_encode_plus(*args, **kwargs) + + # Copied from transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast._encode_plus + def _encode_plus(self, *args, **kwargs) -> BatchEncoding: + is_split_into_words = kwargs.get("is_split_into_words", False) + + assert self.add_prefix_space or not is_split_into_words, ( + f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True " + "to use it with pretokenized inputs." + ) + + return super()._encode_plus(*args, **kwargs) + + # Copied from transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast.save_vocabulary + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + files = self._tokenizer.model.save(save_directory, name=filename_prefix) + return tuple(files) + + +__all__ = ["DebertaTokenizerFast"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/decision_transformer/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/decision_transformer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..455f7ffec5dee0567039cfd844588cb991c15622 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/decision_transformer/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
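# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the vendored files above): how the DeBERTa
# tokenizers added in tokenization_deberta.py / tokenization_deberta_fast.py lay
# out special tokens via build_inputs_with_special_tokens and
# get_special_tokens_mask. Assumes the "microsoft/deberta-base" checkpoint named
# in their docstrings is available; exact token ids depend on that vocabulary.
from transformers import DebertaTokenizer

tok = DebertaTokenizer.from_pretrained("microsoft/deberta-base")

# Encode two segments without special tokens, then assemble them explicitly.
ids_a = tok.encode("Hello world", add_special_tokens=False)
ids_b = tok.encode("How are you?", add_special_tokens=False)

single = tok.build_inputs_with_special_tokens(ids_a)        # [CLS] A [SEP]
pair = tok.build_inputs_with_special_tokens(ids_a, ids_b)   # [CLS] A [SEP] B [SEP]
assert single[0] == tok.cls_token_id and single[-1] == tok.sep_token_id

# get_special_tokens_mask flags the [CLS]/[SEP] positions with 1.
mask = tok.get_special_tokens_mask(ids_a, ids_b)
assert mask.count(1) == 3  # one [CLS] plus two [SEP] for a sequence pair
# ---------------------------------------------------------------------------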
+from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_decision_transformer import * + from .modeling_decision_transformer import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/decision_transformer/configuration_decision_transformer.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/decision_transformer/configuration_decision_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..436834c7e5e038b79d9cf6ef21e34ba5eaf6ae64 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/decision_transformer/configuration_decision_transformer.py @@ -0,0 +1,157 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Team and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Decision Transformer model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class DecisionTransformerConfig(PretrainedConfig): + """ + This is the configuration class to store the configuration of a [`DecisionTransformerModel`]. It is used to + instantiate a Decision Transformer model according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar configuration to that of the standard + DecisionTransformer architecture. Many of the config options are used to instantiate the GPT2 model that is used as + part of the architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + state_dim (`int`, *optional*, defaults to 17): + The state size for the RL environment + act_dim (`int`, *optional*, defaults to 4): + The size of the output action space + hidden_size (`int`, *optional*, defaults to 128): + The size of the hidden layers + max_ep_len (`int`, *optional*, defaults to 4096): + The maximum length of an episode in the environment + action_tanh (`bool`, *optional*, defaults to True): + Whether to use a tanh activation on action prediction + vocab_size (`int`, *optional*, defaults to 50257): + Vocabulary size of the GPT-2 model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`DecisionTransformerModel`]. + n_positions (`int`, *optional*, defaults to 1024): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + n_layer (`int`, *optional*, defaults to 3): + Number of hidden layers in the Transformer encoder. 
+ n_head (`int`, *optional*, defaults to 1): + Number of attention heads for each attention layer in the Transformer encoder. + n_inner (`int`, *optional*): + Dimensionality of the inner feed-forward layers. If unset, will default to 4 times `n_embd`. + activation_function (`str`, *optional*, defaults to `"gelu"`): + Activation function, to be selected in the list `["relu", "silu", "gelu", "tanh", "gelu_new"]`. + resid_pdrop (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + embd_pdrop (`int`, *optional*, defaults to 0.1): + The dropout ratio for the embeddings. + attn_pdrop (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention. + layer_norm_epsilon (`float`, *optional*, defaults to 1e-5): + The epsilon to use in the layer normalization layers. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + scale_attn_weights (`bool`, *optional*, defaults to `True`): + Scale attention weights by dividing by sqrt(hidden_size).. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). + scale_attn_by_inverse_layer_idx (`bool`, *optional*, defaults to `False`): + Whether to additionally scale attention weights by `1 / layer_idx + 1`. + reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`): + Whether to scale keys (K) prior to computing attention (dot-product) and upcast attention + dot-product/softmax to float() when training with mixed precision. + + Example: + + ```python + >>> from transformers import DecisionTransformerConfig, DecisionTransformerModel + + >>> # Initializing a DecisionTransformer configuration + >>> configuration = DecisionTransformerConfig() + + >>> # Initializing a model (with random weights) from the configuration + >>> model = DecisionTransformerModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "decision_transformer" + keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = { + "max_position_embeddings": "n_positions", + "num_attention_heads": "n_head", + "num_hidden_layers": "n_layer", + } + + def __init__( + self, + state_dim=17, + act_dim=4, + hidden_size=128, + max_ep_len=4096, + action_tanh=True, + vocab_size=1, + n_positions=1024, + n_layer=3, + n_head=1, + n_inner=None, + activation_function="relu", + resid_pdrop=0.1, + embd_pdrop=0.1, + attn_pdrop=0.1, + layer_norm_epsilon=1e-5, + initializer_range=0.02, + scale_attn_weights=True, + use_cache=True, + bos_token_id=50256, + eos_token_id=50256, + scale_attn_by_inverse_layer_idx=False, + reorder_and_upcast_attn=False, + **kwargs, + ): + self.state_dim = state_dim + self.act_dim = act_dim + self.hidden_size = hidden_size + self.max_ep_len = max_ep_len + self.action_tanh = action_tanh + self.vocab_size = vocab_size + self.n_positions = n_positions + self.n_layer = n_layer + self.n_head = n_head + self.n_inner = n_inner + self.activation_function = activation_function + self.resid_pdrop = resid_pdrop + self.embd_pdrop = embd_pdrop + self.attn_pdrop = attn_pdrop + self.layer_norm_epsilon = layer_norm_epsilon + self.initializer_range = initializer_range + self.scale_attn_weights = scale_attn_weights + self.use_cache = use_cache + self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx + 
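        # A hedged usage sketch (the environment sizes are assumptions, not part
        # of the library code): the GPT-2 style attributes set above are also
        # reachable through the aliases declared in `attribute_map`.
        #
        #   cfg = DecisionTransformerConfig(state_dim=11, act_dim=3)
        #   assert cfg.num_hidden_layers == cfg.n_layer == 3
        #   assert cfg.num_attention_heads == cfg.n_head == 1
        #   assert cfg.max_position_embeddings == cfg.n_positions == 1024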
self.reorder_and_upcast_attn = reorder_and_upcast_attn + + self.bos_token_id = bos_token_id + self.eos_token_id = eos_token_id + + super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + + +__all__ = ["DecisionTransformerConfig"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/decision_transformer/modeling_decision_transformer.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/decision_transformer/modeling_decision_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..f9c68fcbdeaeca9fb89c160de8f440333befd5fd --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/decision_transformer/modeling_decision_transformer.py @@ -0,0 +1,934 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Team The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch DecisionTransformer model.""" + +import math +import os +from dataclasses import dataclass +from typing import Callable, Optional, Union + +import torch +from torch import nn + +from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import BaseModelOutputWithPastAndCrossAttentions +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...pytorch_utils import Conv1D, find_pruneable_heads_and_indices, prune_conv1d_layer +from ...utils import ( + ModelOutput, + auto_docstring, + logging, +) +from ...utils.deprecation import deprecate_kwarg +from .configuration_decision_transformer import DecisionTransformerConfig + + +logger = logging.get_logger(__name__) + + +# Copied from transformers.models.gpt2.modeling_gpt2.load_tf_weights_in_gpt2 +def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path): + """Load tf checkpoints in a pytorch model""" + try: + import re + + import tensorflow as tf + except ImportError: + logger.error( + "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions." 
+ ) + raise + tf_path = os.path.abspath(gpt2_checkpoint_path) + logger.info(f"Converting TensorFlow checkpoint from {tf_path}") + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + names = [] + arrays = [] + for name, shape in init_vars: + logger.info(f"Loading TF weight {name} with shape {shape}") + array = tf.train.load_variable(tf_path, name) + names.append(name) + arrays.append(array.squeeze()) + + for name, array in zip(names, arrays): + name = name[6:] # skip "model/" + name = name.split("/") + pointer = model + for m_name in name: + if re.fullmatch(r"[A-Za-z]+\d+", m_name): + scope_names = re.split(r"(\d+)", m_name) + else: + scope_names = [m_name] + if scope_names[0] == "w" or scope_names[0] == "g": + pointer = getattr(pointer, "weight") + elif scope_names[0] == "b": + pointer = getattr(pointer, "bias") + elif scope_names[0] == "wpe" or scope_names[0] == "wte": + pointer = getattr(pointer, scope_names[0]) + pointer = getattr(pointer, "weight") + else: + pointer = getattr(pointer, scope_names[0]) + if len(scope_names) >= 2: + num = int(scope_names[1]) + pointer = pointer[num] + try: + if pointer.shape != array.shape: + raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched") + except ValueError as e: + e.args += (pointer.shape, array.shape) + raise + logger.info(f"Initialize PyTorch weight {name}") + pointer.data = torch.from_numpy(array) + return model + + +# Copied from transformers.models.gpt2.modeling_gpt2.eager_attention_forward +def eager_attention_forward(module, query, key, value, attention_mask, head_mask=None, **kwargs): + attn_weights = torch.matmul(query, key.transpose(-1, -2)) + + if module.scale_attn_weights: + attn_weights = attn_weights / torch.full( + [], value.size(-1) ** 0.5, dtype=attn_weights.dtype, device=attn_weights.device + ) + + # Layer-wise attention scaling + if module.scale_attn_by_inverse_layer_idx: + attn_weights = attn_weights / float(module.layer_idx + 1) + + if not module.is_cross_attention: + # if only "normal" attention layer implements causal mask + query_length, key_length = query.size(-2), key.size(-2) + causal_mask = module.bias[:, :, key_length - query_length : key_length, :key_length] + mask_value = torch.finfo(attn_weights.dtype).min + # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`. 
+ # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device` + mask_value = torch.full([], mask_value, dtype=attn_weights.dtype, device=attn_weights.device) + attn_weights = torch.where(causal_mask, attn_weights.to(attn_weights.dtype), mask_value) + + if attention_mask is not None: + # Apply the attention mask + causal_mask = attention_mask[:, :, :, : key.shape[-2]] + attn_weights = attn_weights + causal_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op otherwise + attn_weights = attn_weights.type(value.dtype) + attn_weights = module.attn_dropout(attn_weights) + + # Mask heads if we want to + if head_mask is not None: + attn_weights = attn_weights * head_mask + + attn_output = torch.matmul(attn_weights, value) + attn_output = attn_output.transpose(1, 2) + + return attn_output, attn_weights + + +# Copied from transformers.models.gpt2.modeling_gpt2.GPT2Attention with GPT2->DecisionTransformerGPT2 +class DecisionTransformerGPT2Attention(nn.Module): + def __init__(self, config, is_cross_attention=False, layer_idx=None): + super().__init__() + self.config = config + max_positions = config.max_position_embeddings + self.register_buffer( + "bias", + torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view( + 1, 1, max_positions, max_positions + ), + persistent=False, + ) + self.register_buffer("masked_bias", torch.tensor(-1e4), persistent=False) + + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + self.split_size = self.embed_dim + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"`embed_dim` must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." 
+ ) + + self.scale_attn_weights = config.scale_attn_weights + self.is_cross_attention = is_cross_attention + + # Layer-wise attention scaling, reordering, and upcasting + self.scale_attn_by_inverse_layer_idx = config.scale_attn_by_inverse_layer_idx + self.layer_idx = layer_idx + self.reorder_and_upcast_attn = config.reorder_and_upcast_attn + + if self.is_cross_attention: + self.c_attn = Conv1D(2 * self.embed_dim, self.embed_dim) + self.q_attn = Conv1D(self.embed_dim, self.embed_dim) + else: + self.c_attn = Conv1D(3 * self.embed_dim, self.embed_dim) + self.c_proj = Conv1D(self.embed_dim, self.embed_dim) + + self.attn_dropout = nn.Dropout(config.attn_pdrop) + self.resid_dropout = nn.Dropout(config.resid_pdrop) + self.is_causal = True + + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices(heads, self.num_heads, self.head_dim, self.pruned_heads) + index_attn = torch.cat([index, index + self.split_size, index + (2 * self.split_size)]) + + # Prune conv1d layers + self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1) + self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0) + + # Update hyper params + self.split_size = (self.split_size // self.num_heads) * (self.num_heads - len(heads)) + self.num_heads = self.num_heads - len(heads) + self.pruned_heads = self.pruned_heads.union(heads) + + def _upcast_and_reordered_attn(self, query, key, value, attention_mask=None, head_mask=None): + # Use `torch.baddbmm` (a bit more efficient w/ alpha param for scaling -- from Megatron-LM) + bsz, num_heads, q_seq_len, dk = query.size() + _, _, k_seq_len, _ = key.size() + + # Preallocate attn_weights for `baddbmm` + attn_weights = torch.empty(bsz * num_heads, q_seq_len, k_seq_len, dtype=torch.float32, device=query.device) + + # Compute Scale Factor + scale_factor = 1.0 + if self.scale_attn_weights: + scale_factor /= float(value.size(-1)) ** 0.5 + + if self.scale_attn_by_inverse_layer_idx: + scale_factor /= float(self.layer_idx + 1) + + # Upcast (turn off autocast) and reorder (Scale K by 1 / root(dk)) + with torch.autocast(query.device.type, enabled=False): + q, k = query.reshape(-1, q_seq_len, dk), key.transpose(-1, -2).reshape(-1, dk, k_seq_len) + attn_weights = torch.baddbmm(attn_weights, q.float(), k.float(), beta=0, alpha=scale_factor) + attn_weights = attn_weights.reshape(bsz, num_heads, q_seq_len, k_seq_len) + + if not self.is_cross_attention: + # if only "normal" attention layer implements causal mask + query_length, key_length = query.size(-2), key.size(-2) + causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length] + mask_value = torch.finfo(attn_weights.dtype).min + # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`. 
+ # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device` + mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype, device=attn_weights.device) + attn_weights = torch.where(causal_mask, attn_weights, mask_value) + + if attention_mask is not None: + # Apply the attention mask + attn_weights = attn_weights + attention_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op if otherwise + if attn_weights.dtype != torch.float32: + raise RuntimeError("Error with upcasting, attn_weights does not have dtype torch.float32") + attn_weights = attn_weights.type(value.dtype) + attn_weights = self.attn_dropout(attn_weights) + + # Mask heads if we want to + if head_mask is not None: + attn_weights = attn_weights * head_mask + + attn_output = torch.matmul(attn_weights, value) + attn_output = attn_output.transpose(1, 2) + + return attn_output, attn_weights + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: Optional[tuple[torch.FloatTensor]], + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + **kwargs, + ) -> tuple[Union[torch.Tensor, tuple[torch.Tensor]], ...]: + is_cross_attention = encoder_hidden_states is not None + if past_key_values is not None: + if isinstance(past_key_values, EncoderDecoderCache): + is_updated = past_key_values.is_updated.get(self.layer_idx) + if is_cross_attention: + # after the first generated id, we can subsequently re-use all key/value_layer from cache + curr_past_key_value = past_key_values.cross_attention_cache + else: + curr_past_key_value = past_key_values.self_attention_cache + else: + curr_past_key_value = past_key_values + + if is_cross_attention: + if not hasattr(self, "q_attn"): + raise ValueError( + "If class is used as cross attention, the weights `q_attn` have to be defined. " + "Please make sure to instantiate class with `DecisionTransformerGPT2Attention(..., is_cross_attention=True)`." 
+ ) + query_states = self.q_attn(hidden_states) + attention_mask = encoder_attention_mask + + # Try to get key/value states from cache if possible + if past_key_values is not None and is_updated: + key_states = curr_past_key_value.layers[self.layer_idx].keys + value_states = curr_past_key_value.layers[self.layer_idx].values + else: + key_states, value_states = self.c_attn(encoder_hidden_states).split(self.split_size, dim=2) + shape_kv = (*key_states.shape[:-1], -1, self.head_dim) + key_states = key_states.view(shape_kv).transpose(1, 2) + value_states = value_states.view(shape_kv).transpose(1, 2) + else: + query_states, key_states, value_states = self.c_attn(hidden_states).split(self.split_size, dim=2) + shape_kv = (*key_states.shape[:-1], -1, self.head_dim) + key_states = key_states.view(shape_kv).transpose(1, 2) + value_states = value_states.view(shape_kv).transpose(1, 2) + + shape_q = (*query_states.shape[:-1], -1, self.head_dim) + query_states = query_states.view(shape_q).transpose(1, 2) + + if (past_key_values is not None and not is_cross_attention) or ( + past_key_values is not None and is_cross_attention and not is_updated + ): + # save all key/value_layer to cache to be re-used for fast auto-regressive generation + cache_position = cache_position if not is_cross_attention else None + key_states, value_states = curr_past_key_value.update( + key_states, value_states, self.layer_idx, {"cache_position": cache_position} + ) + # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls + if is_cross_attention: + past_key_values.is_updated[self.layer_idx] = True + + is_causal = attention_mask is None and query_states.shape[-2] > 1 and not is_cross_attention + + using_eager = self.config._attn_implementation == "eager" + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + if using_eager and self.reorder_and_upcast_attn: + attn_output, attn_weights = self._upcast_and_reordered_attn( + query_states, key_states, value_states, attention_mask, head_mask + ) + else: + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + head_mask=head_mask, + dropout=self.attn_dropout.p if self.training else 0.0, + is_causal=is_causal, + **kwargs, + ) + + attn_output = attn_output.reshape(*attn_output.shape[:-2], -1).contiguous() + attn_output = self.c_proj(attn_output) + attn_output = self.resid_dropout(attn_output) + + return attn_output, attn_weights + + +# Copied from transformers.models.gpt2.modeling_gpt2.GPT2MLP with GPT2->DecisionTransformerGPT2 +class DecisionTransformerGPT2MLP(nn.Module): + def __init__(self, intermediate_size, config): + super().__init__() + embed_dim = config.hidden_size + self.c_fc = Conv1D(intermediate_size, embed_dim) + self.c_proj = Conv1D(embed_dim, intermediate_size) + self.act = ACT2FN[config.activation_function] + self.dropout = nn.Dropout(config.resid_pdrop) + + def forward(self, hidden_states: Optional[tuple[torch.FloatTensor]]) -> torch.FloatTensor: + hidden_states = self.c_fc(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.c_proj(hidden_states) + hidden_states = self.dropout(hidden_states) + return hidden_states + + +# Copied from transformers.models.gpt2.modeling_gpt2.GPT2Block with GPT2->DecisionTransformerGPT2 +class DecisionTransformerGPT2Block(GradientCheckpointingLayer): + # Ignore copy + def 
__init__(self, config, layer_idx=None): + super().__init__() + hidden_size = config.hidden_size + inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size + + self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) + self.attn = DecisionTransformerGPT2Attention(config, layer_idx=layer_idx) + self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) + + if config.add_cross_attention: + self.crossattention = DecisionTransformerGPT2Attention( + config, is_cross_attention=True, layer_idx=layer_idx + ) + self.ln_cross_attn = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) + + self.mlp = DecisionTransformerGPT2MLP(inner_dim, config) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: Optional[tuple[torch.FloatTensor]], + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = False, + output_attentions: Optional[bool] = False, + **kwargs, + ) -> Union[tuple[torch.Tensor], Optional[tuple[torch.Tensor, tuple[torch.FloatTensor, ...]]]]: + residual = hidden_states + hidden_states = self.ln_1(hidden_states) + attn_output, self_attn_weights = self.attn( + hidden_states, + past_key_values=past_key_values, + cache_position=cache_position, + attention_mask=attention_mask, + head_mask=head_mask, + use_cache=use_cache, + output_attentions=output_attentions, + **kwargs, + ) + # residual connection + hidden_states = attn_output + residual + + if encoder_hidden_states is not None: + # add one self-attention block for cross-attention + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with " + "cross-attention layers by setting `config.add_cross_attention=True`" + ) + residual = hidden_states + hidden_states = self.ln_cross_attn(hidden_states) + cross_attn_output, cross_attn_weights = self.crossattention( + hidden_states, + past_key_values=past_key_values, + attention_mask=attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + ) + # residual connection + hidden_states = residual + cross_attn_output + + residual = hidden_states + hidden_states = self.ln_2(hidden_states) + feed_forward_hidden_states = self.mlp(hidden_states) + # residual connection + hidden_states = residual + feed_forward_hidden_states + + outputs = (hidden_states,) + if output_attentions: + outputs += (self_attn_weights,) + if encoder_hidden_states is not None: + outputs += (cross_attn_weights,) + + return outputs + + +@auto_docstring +class DecisionTransformerGPT2PreTrainedModel(PreTrainedModel): + config: DecisionTransformerConfig + load_tf_weights = load_tf_weights_in_gpt2 + base_model_prefix = "transformer" + is_parallelizable = True + supports_gradient_checkpointing = True + + _can_compile_fullgraph = False + + def __init__(self, *inputs, **kwargs): + super().__init__(*inputs, **kwargs) + + def _init_weights(self, module): + """Initialize the weights.""" + if isinstance(module, (nn.Linear, Conv1D)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf 
https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme: + # > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale + # > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers. + # > -- GPT-2 :: https://openai.com/blog/better-language-models/ + # + # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py + for name, p in module.named_parameters(): + if "c_proj" in name and "weight" in name: + # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block + p.data.normal_(mean=0.0, std=(self.config.initializer_range / math.sqrt(2 * self.config.n_layer))) + + +class DecisionTransformerGPT2Model(DecisionTransformerGPT2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.embed_dim = config.hidden_size + + self.wte = nn.Embedding(config.vocab_size, self.embed_dim) + self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim) + + self.drop = nn.Dropout(config.embd_pdrop) + self.h = nn.ModuleList( + [DecisionTransformerGPT2Block(config, layer_idx=i) for i in range(config.num_hidden_layers)] + ) + self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon) + + # Model parallel + self.model_parallel = False + self.device_map = None + self.gradient_checkpointing = False + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.wte + + def set_input_embeddings(self, new_embeddings): + self.wte = new_embeddings + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = 
input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + batch_size = input_ids.shape[0] + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + batch_size = inputs_embeds.shape[0] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + device = input_ids.device if input_ids is not None else inputs_embeds.device + + if token_type_ids is not None: + token_type_ids = token_type_ids.view(-1, input_shape[-1]) + + # based on pattern from src/transformers/models/whisper/modeling_whisper.py::WhisperDecoder and similar addition in GPT2Model + if use_cache: + if past_key_values is None: + past_key_values = DynamicCache(config=self.config) + + if self.config.add_cross_attention and not isinstance(past_key_values, EncoderDecoderCache): + past_key_values = EncoderDecoderCache(past_key_values, DynamicCache(config=self.config)) + + if cache_position is None: + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + cache_position = torch.arange( + past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device + ) + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + # Attention mask. + if attention_mask is not None: + if batch_size <= 0: + raise ValueError("batch_size has to be defined and > 0") + attention_mask = attention_mask.view(batch_size, -1) + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. + attention_mask = attention_mask[:, None, None, :] + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and the dtype's smallest value for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. 
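            # A small worked illustration of the conversion performed just below
            # (float32 stands in for self.dtype here):
            #
            #   mask = torch.tensor([[1.0, 1.0, 0.0]])              # 1 = attend, 0 = masked
            #   additive = (1.0 - mask) * torch.finfo(torch.float32).min
            #   # -> tensor([[0.0, 0.0, -3.4028e+38]]), added to the raw scores before softmax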
+ attention_mask = attention_mask.to(dtype=self.dtype) # fp16 compatibility + attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.add_cross_attention and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # head_mask has shape n_layer x batch x n_heads x N x N + head_mask = self.get_head_mask(head_mask, self.config.n_layer) + + if inputs_embeds is None: + inputs_embeds = self.wte(input_ids) + position_embeds = self.wpe(position_ids) + hidden_states = inputs_embeds + position_embeds + + if token_type_ids is not None: + token_type_embeds = self.wte(token_type_ids) + hidden_states = hidden_states + token_type_embeds + + hidden_states = self.drop(hidden_states) + + output_shape = (-1,) + input_shape[1:] + (hidden_states.size(-1),) + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + all_hidden_states = () if output_hidden_states else None + for i, block in enumerate(self.h): + # Model parallel + if self.model_parallel: + torch.cuda.set_device(hidden_states.device) + # Ensure that attention_mask is always on the same device as hidden_states + if attention_mask is not None: + attention_mask = attention_mask.to(hidden_states.device) + if isinstance(head_mask, torch.Tensor): + head_mask = head_mask.to(hidden_states.device) + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + outputs = block( + hidden_states, + past_key_values if not (self.gradient_checkpointing and self.training) else None, + cache_position, + attention_mask, + head_mask[i], + encoder_hidden_states, # as a positional argument for gradient checkpointing + encoder_attention_mask=encoder_attention_mask, + use_cache=use_cache, + output_attentions=output_attentions, + ) + + hidden_states = outputs[0] + + if output_attentions: + all_self_attentions = all_self_attentions + (outputs[1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (outputs[2],) + + # Model Parallel: If it's the last layer for that device, put things on the next device + if self.model_parallel: + for k, v in self.device_map.items(): + if i == v[-1] and "cuda:" + str(k) != self.last_device: + hidden_states = hidden_states.to("cuda:" + str(k + 1)) + + hidden_states = self.ln_f(hidden_states) + + hidden_states = hidden_states.view(output_shape) + # Add last hidden state + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + past_key_values = past_key_values if use_cache else None + # no return to legacy cache + if not return_dict: + return tuple( + v + for v in [hidden_states, 
past_key_values, all_hidden_states, all_self_attentions, all_cross_attentions] + if v is not None + ) + + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=past_key_values, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for model's outputs that also contains a pooling of the last hidden states. + """ +) +class DecisionTransformerOutput(ModelOutput): + r""" + state_preds (`torch.FloatTensor` of shape `(batch_size, sequence_length, state_dim)`): + Environment state predictions + action_preds (`torch.FloatTensor` of shape `(batch_size, sequence_length, action_dim)`): + Model action predictions + return_preds (`torch.FloatTensor` of shape `(batch_size, sequence_length, 1)`): + Predicted returns for each state + """ + + state_preds: Optional[torch.FloatTensor] = None + action_preds: Optional[torch.FloatTensor] = None + return_preds: Optional[torch.FloatTensor] = None + hidden_states: Optional[torch.FloatTensor] = None + attentions: Optional[torch.FloatTensor] = None + last_hidden_state: Optional[torch.FloatTensor] = None + + +class DecisionTransformerPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config: DecisionTransformerConfig + base_model_prefix = "decision_transformer" + main_input_name = "states" + supports_gradient_checkpointing = False + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +@auto_docstring( + custom_intro=""" + The Decision Transformer Model + """ +) +class DecisionTransformerModel(DecisionTransformerPreTrainedModel): + """ + + The model builds upon the GPT2 architecture to perform autoregressive prediction of actions in an offline RL + setting. 
Refer to the paper for more details: https://huggingface.co/papers/2106.01345 + + """ + + def __init__(self, config): + super().__init__(config) + self.config = config + self.hidden_size = config.hidden_size + # note: the only difference between this GPT2Model and the default Huggingface version + # is that the positional embeddings are removed (since we'll add those ourselves) + self.encoder = DecisionTransformerGPT2Model(config) + + self.embed_timestep = nn.Embedding(config.max_ep_len, config.hidden_size) + self.embed_return = torch.nn.Linear(1, config.hidden_size) + self.embed_state = torch.nn.Linear(config.state_dim, config.hidden_size) + self.embed_action = torch.nn.Linear(config.act_dim, config.hidden_size) + + self.embed_ln = nn.LayerNorm(config.hidden_size) + + # note: we don't predict states or returns for the paper + self.predict_state = torch.nn.Linear(config.hidden_size, config.state_dim) + self.predict_action = nn.Sequential( + *([nn.Linear(config.hidden_size, config.act_dim)] + ([nn.Tanh()] if config.action_tanh else [])) + ) + self.predict_return = torch.nn.Linear(config.hidden_size, 1) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + states: Optional[torch.FloatTensor] = None, + actions: Optional[torch.FloatTensor] = None, + rewards: Optional[torch.FloatTensor] = None, + returns_to_go: Optional[torch.FloatTensor] = None, + timesteps: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.FloatTensor], DecisionTransformerOutput]: + r""" + states (`torch.FloatTensor` of shape `(batch_size, episode_length, state_dim)`): + The states for each step in the trajectory + actions (`torch.FloatTensor` of shape `(batch_size, episode_length, act_dim)`): + The actions taken by the "expert" policy for the current state, these are masked for auto regressive + prediction + rewards (`torch.FloatTensor` of shape `(batch_size, episode_length, 1)`): + The rewards for each state, action + returns_to_go (`torch.FloatTensor` of shape `(batch_size, episode_length, 1)`): + The returns for each state in the trajectory + timesteps (`torch.LongTensor` of shape `(batch_size, episode_length)`): + The timestep for each step in the trajectory + + Examples: + + ```python + >>> from transformers import DecisionTransformerModel + >>> import torch + + >>> model = DecisionTransformerModel.from_pretrained("edbeeching/decision-transformer-gym-hopper-medium") + >>> # evaluation + >>> model = model.to(device) + >>> model.eval() + + >>> env = gym.make("Hopper-v3") + >>> state_dim = env.observation_space.shape[0] + >>> act_dim = env.action_space.shape[0] + + >>> state = env.reset() + >>> states = torch.from_numpy(state).reshape(1, 1, state_dim).to(device=device, dtype=torch.float32) + >>> actions = torch.zeros((1, 1, act_dim), device=device, dtype=torch.float32) + >>> rewards = torch.zeros(1, 1, device=device, dtype=torch.float32) + >>> target_return = torch.tensor(TARGET_RETURN, dtype=torch.float32).reshape(1, 1) + >>> timesteps = torch.tensor(0, device=device, dtype=torch.long).reshape(1, 1) + >>> attention_mask = torch.zeros(1, 1, device=device, dtype=torch.float32) + + >>> # forward pass + >>> with torch.no_grad(): + ... state_preds, action_preds, return_preds = model( + ... states=states, + ... actions=actions, + ... rewards=rewards, + ... 
returns_to_go=target_return, + ... timesteps=timesteps, + ... attention_mask=attention_mask, + ... return_dict=False, + ... ) + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + batch_size, seq_length = states.shape[0], states.shape[1] + + if attention_mask is None: + # attention mask for GPT: 1 if can be attended to, 0 if not + attention_mask = torch.ones((batch_size, seq_length), dtype=torch.long) + + # embed each modality with a different head + state_embeddings = self.embed_state(states) + action_embeddings = self.embed_action(actions) + returns_embeddings = self.embed_return(returns_to_go) + time_embeddings = self.embed_timestep(timesteps) + + # time embeddings are treated similar to positional embeddings + state_embeddings = state_embeddings + time_embeddings + action_embeddings = action_embeddings + time_embeddings + returns_embeddings = returns_embeddings + time_embeddings + + # this makes the sequence look like (R_1, s_1, a_1, R_2, s_2, a_2, ...) + # which works nice in an autoregressive sense since states predict actions + stacked_inputs = ( + torch.stack((returns_embeddings, state_embeddings, action_embeddings), dim=1) + .permute(0, 2, 1, 3) + .reshape(batch_size, 3 * seq_length, self.hidden_size) + ) + stacked_inputs = self.embed_ln(stacked_inputs) + + # to make the attention mask fit the stacked inputs, have to stack it as well + stacked_attention_mask = ( + torch.stack((attention_mask, attention_mask, attention_mask), dim=1) + .permute(0, 2, 1) + .reshape(batch_size, 3 * seq_length) + ) + device = stacked_inputs.device + # we feed in the input embeddings (not word indices as in NLP) to the model + encoder_outputs = self.encoder( + inputs_embeds=stacked_inputs, + attention_mask=stacked_attention_mask, + position_ids=torch.zeros(stacked_attention_mask.shape, device=device, dtype=torch.long), + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + x = encoder_outputs[0] + + # reshape x so that the second dimension corresponds to the original + # returns (0), states (1), or actions (2); i.e. 
x[:,1,t] is the token for s_t + x = x.reshape(batch_size, seq_length, 3, self.hidden_size).permute(0, 2, 1, 3) + + # get predictions + return_preds = self.predict_return(x[:, 2]) # predict next return given state and action + state_preds = self.predict_state(x[:, 2]) # predict next state given state and action + action_preds = self.predict_action(x[:, 1]) # predict next action given state + if not return_dict: + return (state_preds, action_preds, return_preds) + + return DecisionTransformerOutput( + last_hidden_state=encoder_outputs.last_hidden_state, + state_preds=state_preds, + action_preds=action_preds, + return_preds=return_preds, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +__all__ = [ + "DecisionTransformerGPT2Model", + "DecisionTransformerGPT2PreTrainedModel", + "DecisionTransformerModel", + "DecisionTransformerPreTrainedModel", +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_v2/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_v2/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c9aaf5f0fccd2d155613c244c4f5391201712e44 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_v2/__init__.py @@ -0,0 +1,29 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_deepseek_v2 import * + from .modeling_deepseek_v2 import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_v2/configuration_deepseek_v2.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_v2/configuration_deepseek_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..b0b25625ea2313b764337b90b3fdd76e32c21152 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_v2/configuration_deepseek_v2.py @@ -0,0 +1,239 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/deepseek_v2/modular_deepseek_v2.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_deepseek_v2.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2025 HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
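# A schematic sketch (standalone toy tensors, not library code) of the
# stack/permute/reshape trick used in DecisionTransformerModel.forward above:
# per-timestep return, state and action embeddings are interleaved into one
# sequence of length 3 * seq_length, then split apart again after the encoder.
#
#   import torch
#   B, T, H = 2, 4, 8
#   r, s, a = (torch.randn(B, T, H) for _ in range(3))
#   stacked = torch.stack((r, s, a), dim=1).permute(0, 2, 1, 3).reshape(B, 3 * T, H)
#   # stacked[:, 0::3] are return tokens, [:, 1::3] states, [:, 2::3] actions
#   x = stacked.reshape(B, T, 3, H).permute(0, 2, 1, 3)
#   assert torch.equal(x[:, 1], s)  # x[:, 1, t] is the token for s_t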
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from ...configuration_utils import PretrainedConfig +from ...modeling_rope_utils import rope_config_validation + + +class DeepseekV2Config(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`DeepseekV2Model`]. It is used to instantiate a DeepSeek + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of DeepSeek-V2-Lite" [deepseek-ai/DeepSeek-V2-Lite"](https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite"). + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the DeepSeek model. Defines the number of different tokens that can be represented by the + `input_ids` passed when calling [`DeepseekV2Model`]. + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 11008): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer decoder. + num_key_value_heads (`int`, *optional*): + The number of key-value heads used to implement Grouped Query Attention (GQA). If + `num_key_value_heads=num_attention_heads`, the model will use Multi-Head Attention (MHA). If + `num_key_value_heads=1`, the model will use Multi-Query Attention (MQA). Otherwise, GQA is used. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated normal initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon value used by the RMS normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/value attentions (useful for inference optimization). + pad_token_id (`int`, *optional*): + Padding token ID. + bos_token_id (`int`, *optional*, defaults to 1): + Beginning-of-sequence token ID. + eos_token_id (`int`, *optional*, defaults to 2): + End-of-sequence token ID. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether to tie input and output embeddings. + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the Rotary Position Embeddings (RoPE). + rope_scaling (`Dict`, *optional*): + Configuration for scaling RoPE embeddings. Supports `linear` and `dynamic` scaling strategies. 
+ attention_bias (`bool`, *optional*, defaults to `False`): + Whether to use a bias in the query, key, value, and output projection layers during self-attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout probability applied to attention weights. + mlp_bias (`bool`, *optional*, defaults to `False`): + Whether to use a bias term in the MLP layers. + aux_loss_alpha (`float`, *optional*, defaults to 0.001): + Weight coefficient for auxiliary loss in Mixture of Experts (MoE) models. + first_k_dense_replace (`int`, *optional*, defaults to 0): + Number of dense layers in the shallow layers before switching to MoE layers. + kv_lora_rank (`int`, *optional*, defaults to 512): + Rank of the LoRA decomposition for key-value projections. + q_lora_rank (`int`, *optional*, defaults to 1536): + Rank of the LoRA decomposition for query projections. + Specifically, it determines the dimensionality to which the query (q) vectors are compressed before being expanded back to their original size. + It reduces computational overhead while maintaining model performance. + n_group (`int`, *optional*): + Number of groups for routed experts. + n_routed_experts (`int`, *optional*, defaults to 64): + Number of routed experts (None indicates a dense model). + n_shared_experts (`int`, *optional*, defaults to 2): + Number of shared experts (None indicates a dense model). + qk_nope_head_dim (`int`, *optional*, defaults to 128): + The head dimension for the QK (query-key) projections when using NOPE (Neural Operator Position Encoding). + qk_rope_head_dim (`int`, *optional*, defaults to 64): + The head dimension for QK projections when using RoPE. + routed_scaling_factor (`float`, *optional*, defaults to 1.0): + Scaling factor for routed experts in MoE models. + seq_aux (`bool`, *optional*, defaults to `True`): + Whether to compute the auxiliary loss for each individual sequence. + topk_group (`int`, *optional*): + Number of selected groups per token for expert selection. + topk_method (`str`, *optional*, defaults to `"greedy"`): + The method used for selecting top-k experts in the routed gate mechanism. + v_head_dim (`int`, *optional*, defaults to 128): + The dimension of value projections in the attention layers. + num_experts_per_tok (`int`, *optional*): + The number of experts selected per token. If `None`, the model behaves as a dense Transformer. + norm_topk_prob (`bool`, *optional*, defaults to `False`): + Whether to normalize the probability distribution over top-k selected experts. + moe_intermediate_size (`int`, *optional*, defaults to 1407): + Dimension of the MoE (Mixture of Experts) representations. 
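        As a rough illustration of how the attention sizes above combine (default
        values assumed): each attention head uses a query/key width of
        `qk_nope_head_dim + qk_rope_head_dim = 128 + 64 = 192` and a value width of
        `v_head_dim = 128`, while the query and key/value projections are factored
        through `q_lora_rank = 1536` and `kv_lora_rank = 512` dimensional bottlenecks
        before being expanded back to those widths.

        ```python
        >>> cfg = DeepseekV2Config()
        >>> cfg.qk_nope_head_dim + cfg.qk_rope_head_dim
        192
        >>> cfg.v_head_dim
        128
        ```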
+ + ```python + >>> from transformers import DeepseekV2Model, DeepseekV2Config + >>> # Initializing a DeepSeek-V2 style configuration + >>> configuration = DeepseekV2Config() + >>> # Accessing the model configuration + >>> model = DeepseekV2Model(configuration) + >>> print(model.config) + ``` + """ + + model_type = "deepseek_v2" + keys_to_ignore_at_inference = ["past_key_values"] + + base_model_tp_plan = { + "layers.*.self_attn.q_proj": "colwise", + "layers.*.self_attn.q_a_proj": "colwise", + "layers.*.self_attn.q_b_proj": "colwise", + "layers.*.self_attn.kv_b_proj": "colwise", + "layers.*.self_attn.o_proj": "rowwise", + "layers.*.mlp.gate_proj": "colwise", + "layers.*.mlp.up_proj": "colwise", + "layers.*.mlp.down_proj": "rowwise", + } + base_model_pp_plan = { + "embed_tokens": (["input_ids"], ["inputs_embeds"]), + "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), + "norm": (["hidden_states"], ["hidden_states"]), + } + + def __init__( + self, + vocab_size=32000, + hidden_size=4096, + intermediate_size=11008, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=None, + hidden_act="silu", + max_position_embeddings=2048, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=None, + bos_token_id=1, + eos_token_id=2, + tie_word_embeddings=False, + rope_theta=10000.0, + rope_scaling=None, + attention_bias=False, + attention_dropout=0.0, + mlp_bias=False, + aux_loss_alpha=0.001, + first_k_dense_replace=0, + kv_lora_rank=512, + q_lora_rank=1536, + n_group=None, + n_routed_experts=64, + n_shared_experts=2, + qk_nope_head_dim=128, + qk_rope_head_dim=64, + routed_scaling_factor=1.0, + seq_aux=True, + topk_group=None, + topk_method="greedy", + v_head_dim=128, + num_experts_per_tok=None, + norm_topk_prob=False, + moe_intermediate_size=1407, + **kwargs, + ): + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + self.mlp_bias = mlp_bias + self.head_dim = qk_rope_head_dim + # Validate the correctness of rotary position embeddings parameters + # BC: if there is a 'type' field, copy it it to 'rope_type'. 
+ if self.rope_scaling is not None and "type" in self.rope_scaling: + self.rope_scaling["rope_type"] = self.rope_scaling["type"] + rope_config_validation(self) + self.aux_loss_alpha = aux_loss_alpha + self.first_k_dense_replace = first_k_dense_replace + self.kv_lora_rank = kv_lora_rank + self.q_lora_rank = q_lora_rank + self.n_group = n_group + self.n_routed_experts = n_routed_experts + self.n_shared_experts = n_shared_experts + self.qk_nope_head_dim = qk_nope_head_dim + self.qk_rope_head_dim = qk_rope_head_dim + self.routed_scaling_factor = routed_scaling_factor + self.seq_aux = seq_aux + self.topk_group = topk_group + self.topk_method = topk_method + self.v_head_dim = v_head_dim + self.num_experts_per_tok = num_experts_per_tok + self.norm_topk_prob = norm_topk_prob + self.moe_intermediate_size = moe_intermediate_size + + +__all__ = ["DeepseekV2Config"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_v2/modeling_deepseek_v2.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_v2/modeling_deepseek_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..fa291e7689570d602b04e6efa3e9f7c921dcab10 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_v2/modeling_deepseek_v2.py @@ -0,0 +1,640 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/deepseek_v2/modular_deepseek_v2.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_deepseek_v2.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2025 HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
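# A hedged sketch of the rope_scaling back-compat handling in
# DeepseekV2Config.__init__ above: a legacy `{"type": ...}` entry is mirrored
# into `"rope_type"` before `rope_config_validation` runs (the concrete scaling
# dict below is an assumption for illustration).
#
#   cfg = DeepseekV2Config(rope_scaling={"type": "linear", "factor": 2.0})
#   assert cfg.rope_scaling["rope_type"] == "linear"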
+ +import warnings +from typing import Callable, Optional, Union + +import torch +import torch.nn.functional as F +from torch import nn + +from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache +from ...generation import GenerationMixin +from ...integrations import use_kernel_forward_from_hub +from ...masking_utils import create_causal_mask +from ...modeling_layers import GenericForSequenceClassification, GradientCheckpointingLayer +from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast +from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...processing_utils import Unpack +from ...utils import TransformersKwargs, auto_docstring, can_return_tuple +from ...utils.deprecation import deprecate_kwarg +from ...utils.generic import check_model_inputs +from .configuration_deepseek_v2 import DeepseekV2Config + + +class DeepseekV2MoEGate(nn.Module): + def __init__(self, config: DeepseekV2Config): + super().__init__() + self.config = config + self.top_k = config.num_experts_per_tok + self.num_experts = config.n_routed_experts + self.routed_scaling_factor = config.routed_scaling_factor + self.alpha = config.aux_loss_alpha + self.seq_aux = config.seq_aux + self.topk_method = config.topk_method + self.num_group = config.n_group + self.topk_group = config.topk_group + + # topk selection algorithm + self.norm_topk_prob = config.norm_topk_prob + self.gating_dim = config.hidden_size + self.weight = nn.Parameter(torch.empty((self.num_experts, self.gating_dim))) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + batch_size, seq_len, hidden_dim = hidden_states.shape + ### compute gating score + hidden_states = hidden_states.view(-1, hidden_dim) + logits = F.linear(hidden_states.type(torch.float32), self.weight.type(torch.float32), None) + scores = logits.softmax(dim=-1, dtype=torch.float32) + + # select top-k experts + # greedy method is used for DeepSeek-V2-Lite + # group_limited_greedy for DeepSeek-V2 and DeepSeek-V2-Chat + if self.topk_method == "greedy": + topk_weight, topk_idx = torch.topk(scores, k=self.top_k, dim=-1, sorted=False) + elif self.topk_method == "group_limited_greedy": + group_scores = scores.view(batch_size * seq_len, self.num_group, -1).max(dim=-1).values # [n, num_group] + group_idx = torch.topk(group_scores, k=self.topk_group, dim=-1, sorted=False)[1] # [n, top_k_group] + group_mask = torch.zeros_like(group_scores) # [n, num_group] + group_mask.scatter_(1, group_idx, 1) # [n, num_group] + score_mask = ( + group_mask.unsqueeze(-1) + .expand(batch_size * seq_len, self.num_group, self.num_experts // self.num_group) + .reshape(batch_size * seq_len, -1) + ) # [n, e] + tmp_scores = scores.masked_fill(~score_mask.bool(), 0.0) # [n, e] + topk_weight, topk_idx = torch.topk(tmp_scores, k=self.top_k, dim=-1, sorted=False) + + topk_weight = topk_weight * self.routed_scaling_factor + ### expert-level computation auxiliary loss + return topk_idx, topk_weight + + +class DeepseekV2MoE(nn.Module): + """ + A mixed expert module containing shared experts. 
+ """ + + def __init__(self, config: DeepseekV2Config): + super().__init__() + self.config = config + self.num_experts_per_tok = config.num_experts_per_tok + + self.experts = nn.ModuleList( + [ + (DeepseekV2MLP(config, intermediate_size=config.moe_intermediate_size)) + for _ in range(config.n_routed_experts) + ] + ) + self.gate = DeepseekV2MoEGate(config) + if config.n_shared_experts is not None: + intermediate_size = config.moe_intermediate_size * config.n_shared_experts + self.shared_experts = DeepseekV2MLP(config=config, intermediate_size=intermediate_size) + self.ep_rank = 0 + self.experts_per_rank = config.n_routed_experts + + def moe(self, hidden_states: torch.Tensor, topk_ids: torch.Tensor, topk_weight: torch.Tensor) -> torch.Tensor: + cnts = topk_ids.new_zeros((topk_ids.shape[0], len(self.experts))) + cnts.scatter_(1, topk_ids, 1) + tokens_per_expert = cnts.sum(dim=0) + indices = topk_ids.view(-1).argsort() + sorted_tokens = hidden_states[indices // topk_ids.shape[1]] + + # Process experts + outputs = [] + start_idx = 0 + for i, num_tokens in enumerate(tokens_per_expert): + if num_tokens == 0: + continue + end_idx = start_idx + num_tokens + expert = self.experts[i + self.ep_rank * self.experts_per_rank] + tokens_for_this_expert = sorted_tokens[start_idx:end_idx] + expert_out = expert(tokens_for_this_expert) + outputs.append(expert_out) + start_idx = end_idx + + outs = torch.cat(outputs, dim=0) if outputs else sorted_tokens.new_empty(0) + + # Reorder and combine outputs + new_x = torch.empty_like(outs) + new_x[indices] = outs + hidden_states = ( + new_x.view(*topk_ids.shape, -1) + .type(topk_weight.dtype) + .mul_(topk_weight.unsqueeze(dim=-1)) + .sum(dim=1) + .type(new_x.dtype) + ) + return hidden_states + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + residuals = hidden_states + orig_shape = hidden_states.shape + topk_indices, topk_weights = self.gate(hidden_states) + hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) + hidden_states = self.moe(hidden_states, topk_indices, topk_weights).view(*orig_shape) + hidden_states = hidden_states + self.shared_experts(residuals) + return hidden_states + + +class DeepseekV2MLP(nn.Module): + def __init__(self, config: DeepseekV2Config, hidden_size=None, intermediate_size=None): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size if hidden_size is None else hidden_size + self.intermediate_size = config.intermediate_size if intermediate_size is None else intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + return down_proj + + +@use_kernel_forward_from_hub("RMSNorm") +class DeepseekV2RMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + DeepseekV2RMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * 
hidden_states.to(input_dtype) + + def extra_repr(self): + return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" + + +class DeepseekV2RotaryEmbedding(nn.Module): + inv_freq: torch.Tensor # fix linting for `register_buffer` + + def __init__(self, config: DeepseekV2Config, device=None): + super().__init__() + # BC: "rope_type" was originally "type" + self.rope_type = ( + config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) + if config.rope_scaling is not None + else "default" + ) + + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + + inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = self.inv_freq + + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) + def forward(self, x, position_ids): + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + position_ids_expanded = position_ids[:, None, :].float() + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.to(x.device) @ position_ids_expanded).transpose(1, 2) + freqs_cis = torch.polar(torch.ones_like(freqs), freqs) # Convert to complex representation + freqs_cis = freqs_cis * self.attention_scaling + + return freqs_cis + + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). 
The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + dropout: float = 0.0, + **kwargs: Unpack[TransformersKwargs], +): + key_states = repeat_kv(key, module.num_key_value_groups) + value_states = repeat_kv(value, module.num_key_value_groups) + + attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling + if attention_mask is not None: + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + attn_output = torch.matmul(attn_weights, value_states) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +def apply_rotary_emb( + xq: torch.Tensor, + xk: torch.Tensor, + freqs_cis: torch.Tensor, +) -> tuple[torch.Tensor, torch.Tensor]: + xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2)) + xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2)) + + # Broadcast to [1, 1, seq_len, dim // 2] + freqs_cis = freqs_cis.unsqueeze(1).to(xq_.device) + + xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3).type_as(xq) + xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3).type_as(xk) + return xq_out, xk_out + + +class DeepseekV2Attention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: DeepseekV2Config, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + self.attention_dropout = config.attention_dropout + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = config.head_dim + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.q_lora_rank = config.q_lora_rank + self.qk_rope_head_dim = config.qk_rope_head_dim + self.kv_lora_rank = config.kv_lora_rank + self.v_head_dim = config.v_head_dim + self.qk_nope_head_dim = config.qk_nope_head_dim + self.qk_head_dim = config.qk_nope_head_dim + config.qk_rope_head_dim + self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads + + self.is_causal = True + + if self.q_lora_rank is None: + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.qk_head_dim, bias=False) + else: + self.q_a_proj = nn.Linear(self.hidden_size, config.q_lora_rank, bias=config.attention_bias) + self.q_a_layernorm = DeepseekV2RMSNorm(config.q_lora_rank) + self.q_b_proj = nn.Linear(config.q_lora_rank, self.num_heads * self.qk_head_dim, bias=False) + + self.kv_a_proj_with_mqa = nn.Linear( + self.hidden_size, + config.kv_lora_rank + config.qk_rope_head_dim, + bias=config.attention_bias, + ) + self.kv_a_layernorm = DeepseekV2RMSNorm(config.kv_lora_rank) + self.kv_b_proj = nn.Linear( + config.kv_lora_rank, + self.num_heads * (self.qk_head_dim - self.qk_rope_head_dim 
+ self.v_head_dim), + bias=False, + ) + + self.o_proj = nn.Linear( + self.num_heads * self.v_head_dim, + self.hidden_size, + bias=config.attention_bias, + ) + + self.scaling = self.qk_head_dim ** (-0.5) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, + position_ids: Optional[torch.Tensor] = None, + **kwargs, + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + batch_size, seq_length = hidden_states.shape[:-1] + query_shape = (batch_size, seq_length, -1, self.qk_head_dim) + key_shape = (batch_size, seq_length, -1, self.qk_nope_head_dim + self.v_head_dim) + + if self.q_lora_rank is None: + q = self.q_proj(hidden_states) + else: + q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states))) + q = q.view(query_shape).transpose(1, 2) + q_nope, q_pe = torch.split(q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1) + + compressed_kv = self.kv_a_proj_with_mqa(hidden_states) + k_nope, k_pe = torch.split(compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1) + k_nope = self.kv_b_proj(self.kv_a_layernorm(k_nope)).view(key_shape).transpose(1, 2) + k_nope, value_states = torch.split(k_nope, [self.qk_nope_head_dim, self.v_head_dim], dim=-1) + + k_pe = k_pe.view(batch_size, 1, seq_length, self.qk_rope_head_dim) + q_pe, k_pe = apply_rotary_emb(q_pe, k_pe, position_embeddings.to(q_pe.device)) + + k_pe = k_pe.expand(*k_nope.shape[:-1], -1) + query_states = torch.cat((q_nope, q_pe), dim=-1) + key_states = torch.cat((k_nope, k_pe), dim=-1) + + if past_key_values is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"cache_position": cache_position} + key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs) + + if self.config._attn_implementation == "flash_attention_2" and self.qk_head_dim != self.v_head_dim: + value_states = F.pad(value_states, [0, self.qk_head_dim - self.v_head_dim]) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=0.0 if not self.training else self.attention_dropout, + scaling=self.scaling, + **kwargs, + ) + + if self.config._attn_implementation == "flash_attention_2" and self.qk_head_dim != self.v_head_dim: + attn_output = attn_output[:, :, :, : self.v_head_dim] + + attn_output = attn_output.reshape(batch_size, seq_length, -1).contiguous() + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights + + +class DeepseekV2DecoderLayer(GradientCheckpointingLayer): + def __init__(self, config: DeepseekV2Config, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + + self.self_attn = DeepseekV2Attention(config=config, layer_idx=layer_idx) + self.mlp = DeepseekV2MoE(config) if layer_idx >= config.first_k_dense_replace else 
DeepseekV2MLP(config) + + self.input_layernorm = DeepseekV2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = DeepseekV2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + **kwargs: Unpack[TransformersKwargs], + ) -> torch.Tensor: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + # Self Attention + hidden_states, _ = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + **kwargs, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + return hidden_states + + +@auto_docstring +class DeepseekV2PreTrainedModel(PreTrainedModel): + config: DeepseekV2Config + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["DeepseekV2DecoderLayer"] + _skip_keys_device_placement = ["past_key_values"] + _supports_flash_attn = True + _supports_sdpa = True + _supports_flex_attn = True + _can_compile_fullgraph = False + _supports_attention_backend = True + _can_record_outputs = { + "hidden_states": DeepseekV2DecoderLayer, + "attentions": DeepseekV2Attention, + } + + def _init_weights(self, module): + super()._init_weights(module) + if isinstance(module, DeepseekV2MoEGate): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + + +@auto_docstring +class DeepseekV2Model(DeepseekV2PreTrainedModel): + def __init__(self, config: DeepseekV2Config): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList( + [DeepseekV2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.norm = DeepseekV2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.rotary_emb = DeepseekV2RotaryEmbedding(config=config) + self.gradient_checkpointing = False + + # Initialize weights and apply final processing + self.post_init() + + @check_model_inputs + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + cache_position: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> BaseModelOutputWithPast: + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds: torch.Tensor = self.embed_tokens(input_ids) + + if use_cache and past_key_values is None: 
+ past_key_values = DynamicCache(config=self.config) + + if cache_position is None: + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + cache_position: torch.Tensor = torch.arange( + past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device + ) + + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + causal_mask = create_causal_mask( + config=self.config, + input_embeds=inputs_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + past_key_values=past_key_values, + position_ids=position_ids, + ) + + hidden_states = inputs_embeds + position_embeddings = self.rotary_emb(hidden_states, position_ids) + + for decoder_layer in self.layers[: self.config.num_hidden_layers]: + hidden_states = decoder_layer( + hidden_states, + attention_mask=causal_mask, + position_ids=position_ids, + past_key_values=past_key_values, + cache_position=cache_position, + position_embeddings=position_embeddings, + **kwargs, + ) + + hidden_states = self.norm(hidden_states) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=past_key_values, + ) + + +@auto_docstring +class DeepseekV2ForCausalLM(DeepseekV2PreTrainedModel, GenerationMixin): + _tied_weights_keys = ["lm_head.weight"] + _tp_plan = {"lm_head": "colwise_rep"} + _pp_plan = {"lm_head": (["hidden_states"], ["logits"])} + + def __init__(self, config): + super().__init__(config) + self.model = DeepseekV2Model(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **kwargs: Unpack[TransformersKwargs], + ) -> CausalLMOutputWithPast: + r""" + Example: + + ```python + >>> from transformers import AutoTokenizer, DeepseekV2ForCausalLM + + >>> model = DeepseekV2ForCausalLM.from_pretrained("meta-deepseek_v2/DeepseekV2-2-7b-hf") + >>> tokenizer = AutoTokenizer.from_pretrained("meta-deepseek_v2/DeepseekV2-2-7b-hf") + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." 
+ ```""" + outputs: BaseModelOutputWithPast = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs.last_hidden_state + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) + + loss = None + if labels is not None: + loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs) + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class DeepseekV2ForSequenceClassification(GenericForSequenceClassification, DeepseekV2PreTrainedModel): + pass + + +__all__ = [ + "DeepseekV2PreTrainedModel", + "DeepseekV2Model", + "DeepseekV2ForCausalLM", + "DeepseekV2ForSequenceClassification", +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_v2/modular_deepseek_v2.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_v2/modular_deepseek_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..80fa2c8860a3442a2904ec1061514a020290132f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_v2/modular_deepseek_v2.py @@ -0,0 +1,533 @@ +# coding=utf-8 +# Copyright 2025 HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings +from typing import Callable, Optional + +import torch +import torch.nn.functional as F +from torch import nn + +from ...cache_utils import Cache +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...utils import ( + logging, +) +from ...utils.deprecation import deprecate_kwarg +from ..llama.configuration_llama import LlamaConfig +from ..llama.modeling_llama import ( + LlamaDecoderLayer, + LlamaForCausalLM, + LlamaForSequenceClassification, + LlamaMLP, + LlamaModel, + LlamaPreTrainedModel, + LlamaRMSNorm, + eager_attention_forward, +) +from ..llama4.modeling_llama4 import Llama4TextRotaryEmbedding + + +logger = logging.get_logger(__name__) + + +class DeepseekV2Config(LlamaConfig): + r""" + This is the configuration class to store the configuration of a [`DeepseekV2Model`]. It is used to instantiate a DeepSeek + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of DeepSeek-V2-Lite" [deepseek-ai/DeepSeek-V2-Lite"](https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite"). + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. 
Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the DeepSeek model. Defines the number of different tokens that can be represented by the + `input_ids` passed when calling [`DeepseekV2Model`]. + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 11008): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer decoder. + num_key_value_heads (`int`, *optional*): + The number of key-value heads used to implement Grouped Query Attention (GQA). If + `num_key_value_heads=num_attention_heads`, the model will use Multi-Head Attention (MHA). If + `num_key_value_heads=1`, the model will use Multi-Query Attention (MQA). Otherwise, GQA is used. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated normal initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon value used by the RMS normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/value attentions (useful for inference optimization). + pad_token_id (`int`, *optional*): + Padding token ID. + bos_token_id (`int`, *optional*, defaults to 1): + Beginning-of-sequence token ID. + eos_token_id (`int`, *optional*, defaults to 2): + End-of-sequence token ID. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether to tie input and output embeddings. + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the Rotary Position Embeddings (RoPE). + rope_scaling (`Dict`, *optional*): + Configuration for scaling RoPE embeddings. Supports `linear` and `dynamic` scaling strategies. + attention_bias (`bool`, *optional*, defaults to `False`): + Whether to use a bias in the query, key, value, and output projection layers during self-attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout probability applied to attention weights. + mlp_bias (`bool`, *optional*, defaults to `False`): + Whether to use a bias term in the MLP layers. + aux_loss_alpha (`float`, *optional*, defaults to 0.001): + Weight coefficient for auxiliary loss in Mixture of Experts (MoE) models. + first_k_dense_replace (`int`, *optional*, defaults to 0): + Number of dense layers in the shallow layers before switching to MoE layers. + kv_lora_rank (`int`, *optional*, defaults to 512): + Rank of the LoRA decomposition for key-value projections. + q_lora_rank (`int`, *optional*, defaults to 1536): + Rank of the LoRA decomposition for query projections. + Specifically, it determines the dimensionality to which the query (q) vectors are compressed before being expanded back to their original size. + It reduces computational overhead while maintaining model performance. + n_group (`int`, *optional*): + Number of groups for routed experts. 
+ n_routed_experts (`int`, *optional*, defaults to 64): + Number of routed experts (None indicates a dense model). + n_shared_experts (`int`, *optional*, defaults to 2): + Number of shared experts (None indicates a dense model). + qk_nope_head_dim (`int`, *optional*, defaults to 128): + The head dimension for the QK (query-key) projections when using NOPE (Neural Operator Position Encoding). + qk_rope_head_dim (`int`, *optional*, defaults to 64): + The head dimension for QK projections when using RoPE. + routed_scaling_factor (`float`, *optional*, defaults to 1.0): + Scaling factor for routed experts in MoE models. + seq_aux (`bool`, *optional*, defaults to `True`): + Whether to compute the auxiliary loss for each individual sequence. + topk_group (`int`, *optional*): + Number of selected groups per token for expert selection. + topk_method (`str`, *optional*, defaults to `"greedy"`): + The method used for selecting top-k experts in the routed gate mechanism. + v_head_dim (`int`, *optional*, defaults to 128): + The dimension of value projections in the attention layers. + num_experts_per_tok (`int`, *optional*): + The number of experts selected per token. If `None`, the model behaves as a dense Transformer. + norm_topk_prob (`bool`, *optional*, defaults to `False`): + Whether to normalize the probability distribution over top-k selected experts. + moe_intermediate_size (`int`, *optional*, defaults to 1407): + Dimension of the MoE (Mixture of Experts) representations. + + ```python + >>> from transformers import DeepseekV2Model, DeepseekV2Config + >>> # Initializing a DeepSeek-V2 style configuration + >>> configuration = DeepseekV2Config() + >>> # Accessing the model configuration + >>> model = DeepseekV2Model(configuration) + >>> print(model.config) + ``` + """ + + base_model_tp_plan = { + "layers.*.self_attn.q_proj": "colwise", + "layers.*.self_attn.q_a_proj": "colwise", + "layers.*.self_attn.q_b_proj": "colwise", + "layers.*.self_attn.kv_b_proj": "colwise", + "layers.*.self_attn.o_proj": "rowwise", + "layers.*.mlp.gate_proj": "colwise", + "layers.*.mlp.up_proj": "colwise", + "layers.*.mlp.down_proj": "rowwise", + } + + model_type = "deepseek_v2" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=32000, + hidden_size=4096, + intermediate_size=11008, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=None, + hidden_act="silu", + max_position_embeddings=2048, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=None, + bos_token_id=1, + eos_token_id=2, + tie_word_embeddings=False, + rope_theta=10000.0, + rope_scaling=None, + attention_bias=False, + attention_dropout=0.0, + mlp_bias=False, + aux_loss_alpha=0.001, + first_k_dense_replace=0, + kv_lora_rank=512, + q_lora_rank=1536, + n_group=None, + n_routed_experts=64, + n_shared_experts=2, + qk_nope_head_dim=128, + qk_rope_head_dim=64, + routed_scaling_factor=1.0, + seq_aux=True, + topk_group=None, + topk_method="greedy", + v_head_dim=128, + num_experts_per_tok=None, + norm_topk_prob=False, + moe_intermediate_size=1407, + **kwargs, + ): + super().__init__(**kwargs) + + del self.pretraining_tp + self.aux_loss_alpha = aux_loss_alpha + self.first_k_dense_replace = first_k_dense_replace + self.kv_lora_rank = kv_lora_rank + self.q_lora_rank = q_lora_rank + self.n_group = n_group + self.n_routed_experts = n_routed_experts + self.n_shared_experts = n_shared_experts + self.qk_nope_head_dim = qk_nope_head_dim + self.qk_rope_head_dim = qk_rope_head_dim + 
self.routed_scaling_factor = routed_scaling_factor + self.seq_aux = seq_aux + self.topk_group = topk_group + self.topk_method = topk_method + self.v_head_dim = v_head_dim + self.num_experts_per_tok = num_experts_per_tok + self.norm_topk_prob = norm_topk_prob + self.moe_intermediate_size = moe_intermediate_size + self.head_dim = qk_rope_head_dim + + +def apply_rotary_emb( + xq: torch.Tensor, + xk: torch.Tensor, + freqs_cis: torch.Tensor, +) -> tuple[torch.Tensor, torch.Tensor]: + xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2)) + xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2)) + + # Broadcast to [1, 1, seq_len, dim // 2] + freqs_cis = freqs_cis.unsqueeze(1).to(xq_.device) + + xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3).type_as(xq) + xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3).type_as(xk) + return xq_out, xk_out + + +class DeepseekV2MoEGate(nn.Module): + def __init__(self, config: DeepseekV2Config): + super().__init__() + self.config = config + self.top_k = config.num_experts_per_tok + self.num_experts = config.n_routed_experts + self.routed_scaling_factor = config.routed_scaling_factor + self.alpha = config.aux_loss_alpha + self.seq_aux = config.seq_aux + self.topk_method = config.topk_method + self.num_group = config.n_group + self.topk_group = config.topk_group + + # topk selection algorithm + self.norm_topk_prob = config.norm_topk_prob + self.gating_dim = config.hidden_size + self.weight = nn.Parameter(torch.empty((self.num_experts, self.gating_dim))) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + batch_size, seq_len, hidden_dim = hidden_states.shape + ### compute gating score + hidden_states = hidden_states.view(-1, hidden_dim) + logits = F.linear(hidden_states.type(torch.float32), self.weight.type(torch.float32), None) + scores = logits.softmax(dim=-1, dtype=torch.float32) + + # select top-k experts + # greedy method is used for DeepSeek-V2-Lite + # group_limited_greedy for DeepSeek-V2 and DeepSeek-V2-Chat + if self.topk_method == "greedy": + topk_weight, topk_idx = torch.topk(scores, k=self.top_k, dim=-1, sorted=False) + elif self.topk_method == "group_limited_greedy": + group_scores = scores.view(batch_size * seq_len, self.num_group, -1).max(dim=-1).values # [n, num_group] + group_idx = torch.topk(group_scores, k=self.topk_group, dim=-1, sorted=False)[1] # [n, top_k_group] + group_mask = torch.zeros_like(group_scores) # [n, num_group] + group_mask.scatter_(1, group_idx, 1) # [n, num_group] + score_mask = ( + group_mask.unsqueeze(-1) + .expand(batch_size * seq_len, self.num_group, self.num_experts // self.num_group) + .reshape(batch_size * seq_len, -1) + ) # [n, e] + tmp_scores = scores.masked_fill(~score_mask.bool(), 0.0) # [n, e] + topk_weight, topk_idx = torch.topk(tmp_scores, k=self.top_k, dim=-1, sorted=False) + + topk_weight = topk_weight * self.routed_scaling_factor + ### expert-level computation auxiliary loss + return topk_idx, topk_weight + + +class DeepseekV2MoE(nn.Module): + """ + A mixed expert module containing shared experts. 
+ """ + + def __init__(self, config: DeepseekV2Config): + super().__init__() + self.config = config + self.num_experts_per_tok = config.num_experts_per_tok + + self.experts = nn.ModuleList( + [ + (DeepseekV2MLP(config, intermediate_size=config.moe_intermediate_size)) + for _ in range(config.n_routed_experts) + ] + ) + self.gate = DeepseekV2MoEGate(config) + if config.n_shared_experts is not None: + intermediate_size = config.moe_intermediate_size * config.n_shared_experts + self.shared_experts = DeepseekV2MLP(config=config, intermediate_size=intermediate_size) + self.ep_rank = 0 + self.experts_per_rank = config.n_routed_experts + + def moe(self, hidden_states: torch.Tensor, topk_ids: torch.Tensor, topk_weight: torch.Tensor) -> torch.Tensor: + cnts = topk_ids.new_zeros((topk_ids.shape[0], len(self.experts))) + cnts.scatter_(1, topk_ids, 1) + tokens_per_expert = cnts.sum(dim=0) + indices = topk_ids.view(-1).argsort() + sorted_tokens = hidden_states[indices // topk_ids.shape[1]] + + # Process experts + outputs = [] + start_idx = 0 + for i, num_tokens in enumerate(tokens_per_expert): + if num_tokens == 0: + continue + end_idx = start_idx + num_tokens + expert = self.experts[i + self.ep_rank * self.experts_per_rank] + tokens_for_this_expert = sorted_tokens[start_idx:end_idx] + expert_out = expert(tokens_for_this_expert) + outputs.append(expert_out) + start_idx = end_idx + + outs = torch.cat(outputs, dim=0) if outputs else sorted_tokens.new_empty(0) + + # Reorder and combine outputs + new_x = torch.empty_like(outs) + new_x[indices] = outs + hidden_states = ( + new_x.view(*topk_ids.shape, -1) + .type(topk_weight.dtype) + .mul_(topk_weight.unsqueeze(dim=-1)) + .sum(dim=1) + .type(new_x.dtype) + ) + return hidden_states + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + residuals = hidden_states + orig_shape = hidden_states.shape + topk_indices, topk_weights = self.gate(hidden_states) + hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) + hidden_states = self.moe(hidden_states, topk_indices, topk_weights).view(*orig_shape) + hidden_states = hidden_states + self.shared_experts(residuals) + return hidden_states + + +class DeepseekV2MLP(LlamaMLP): + def __init__(self, config: DeepseekV2Config, hidden_size=None, intermediate_size=None): + super().__init__(config) + self.hidden_size = config.hidden_size if hidden_size is None else hidden_size + self.intermediate_size = config.intermediate_size if intermediate_size is None else intermediate_size + + +class DeepseekV2RMSNorm(LlamaRMSNorm): + pass + + +class DeepseekV2RotaryEmbedding(Llama4TextRotaryEmbedding): + def __init__(self, config: DeepseekV2Config, device=None): + super().__init__(config=config, device=device) + # BC: "rope_type" was originally "type" + self.rope_type = ( + config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) + if config.rope_scaling is not None + else "default" + ) + + +class DeepseekV2Attention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: DeepseekV2Config, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + self.attention_dropout = config.attention_dropout + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = config.head_dim + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.q_lora_rank = config.q_lora_rank + self.qk_rope_head_dim = 
config.qk_rope_head_dim + self.kv_lora_rank = config.kv_lora_rank + self.v_head_dim = config.v_head_dim + self.qk_nope_head_dim = config.qk_nope_head_dim + self.qk_head_dim = config.qk_nope_head_dim + config.qk_rope_head_dim + self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads + + self.is_causal = True + + if self.q_lora_rank is None: + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.qk_head_dim, bias=False) + else: + self.q_a_proj = nn.Linear(self.hidden_size, config.q_lora_rank, bias=config.attention_bias) + self.q_a_layernorm = DeepseekV2RMSNorm(config.q_lora_rank) + self.q_b_proj = nn.Linear(config.q_lora_rank, self.num_heads * self.qk_head_dim, bias=False) + + self.kv_a_proj_with_mqa = nn.Linear( + self.hidden_size, + config.kv_lora_rank + config.qk_rope_head_dim, + bias=config.attention_bias, + ) + self.kv_a_layernorm = DeepseekV2RMSNorm(config.kv_lora_rank) + self.kv_b_proj = nn.Linear( + config.kv_lora_rank, + self.num_heads * (self.qk_head_dim - self.qk_rope_head_dim + self.v_head_dim), + bias=False, + ) + + self.o_proj = nn.Linear( + self.num_heads * self.v_head_dim, + self.hidden_size, + bias=config.attention_bias, + ) + + self.scaling = self.qk_head_dim ** (-0.5) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, + position_ids: Optional[torch.Tensor] = None, + **kwargs, + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" + ) + batch_size, seq_length = hidden_states.shape[:-1] + query_shape = (batch_size, seq_length, -1, self.qk_head_dim) + key_shape = (batch_size, seq_length, -1, self.qk_nope_head_dim + self.v_head_dim) + + if self.q_lora_rank is None: + q = self.q_proj(hidden_states) + else: + q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states))) + q = q.view(query_shape).transpose(1, 2) + q_nope, q_pe = torch.split(q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1) + + compressed_kv = self.kv_a_proj_with_mqa(hidden_states) + k_nope, k_pe = torch.split(compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1) + k_nope = self.kv_b_proj(self.kv_a_layernorm(k_nope)).view(key_shape).transpose(1, 2) + k_nope, value_states = torch.split(k_nope, [self.qk_nope_head_dim, self.v_head_dim], dim=-1) + + k_pe = k_pe.view(batch_size, 1, seq_length, self.qk_rope_head_dim) + q_pe, k_pe = apply_rotary_emb(q_pe, k_pe, position_embeddings.to(q_pe.device)) + + k_pe = k_pe.expand(*k_nope.shape[:-1], -1) + query_states = torch.cat((q_nope, q_pe), dim=-1) + key_states = torch.cat((k_nope, k_pe), dim=-1) + + if past_key_values is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"cache_position": cache_position} + key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs) + + if self.config._attn_implementation == "flash_attention_2" and self.qk_head_dim != self.v_head_dim: + value_states = F.pad(value_states, [0, self.qk_head_dim - self.v_head_dim]) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=0.0 if not self.training else self.attention_dropout, + scaling=self.scaling, + **kwargs, + ) + + if self.config._attn_implementation == "flash_attention_2" and self.qk_head_dim != self.v_head_dim: + attn_output = attn_output[:, :, :, : self.v_head_dim] + + attn_output = attn_output.reshape(batch_size, seq_length, -1).contiguous() + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights + + +class DeepseekV2DecoderLayer(LlamaDecoderLayer): + def __init__(self, config: DeepseekV2Config, layer_idx: int): + super().__init__(config, layer_idx) + + self.self_attn = DeepseekV2Attention(config=config, layer_idx=layer_idx) + self.mlp = DeepseekV2MoE(config) if layer_idx >= config.first_k_dense_replace else DeepseekV2MLP(config) + + self.input_layernorm = DeepseekV2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = DeepseekV2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + +class DeepseekV2PreTrainedModel(LlamaPreTrainedModel): + _can_compile_fullgraph = False + + def _init_weights(self, module): + PreTrainedModel._init_weights(self, module) + if isinstance(module, DeepseekV2MoEGate): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + + +class DeepseekV2Model(LlamaModel): + pass + + +class DeepseekV2ForCausalLM(LlamaForCausalLM): + pass + + +class DeepseekV2ForSequenceClassification(LlamaForSequenceClassification): + pass + + +__all__ = [ + "DeepseekV2PreTrainedModel", + "DeepseekV2Model", + "DeepseekV2ForCausalLM", + "DeepseekV2ForSequenceClassification", + "DeepseekV2Config", +] diff --git 
a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_v3/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_v3/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..298f4c968375e6d90728a3d4e78c5046e4591c01 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_v3/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_deepseek_v3 import * + from .modeling_deepseek_v3 import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_v3/configuration_deepseek_v3.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_v3/configuration_deepseek_v3.py new file mode 100644 index 0000000000000000000000000000000000000000..0b885f8d5ac6e22a1bf3da5e736a3aad26226a21 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_v3/configuration_deepseek_v3.py @@ -0,0 +1,253 @@ +# coding=utf-8 +# Copyright 2025 bzantium and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on the DeepSeekV3 implementations from the DeepSeek AI team. (https://huggingface.co/deepseek-ai/DeepSeek-V3) + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""DeepSeekV3 model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...modeling_rope_utils import rope_config_validation + + +DEEPSEEK_PRETRAINED_CONFIG_ARCHIVE_MAP = {} + + +class DeepseekV3Config(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`DeepseekV3Model`]. It is used to instantiate an DeepSeek + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the DeepSeek-V3. + e.g. [bzantium/tiny-deepseek-v3](https://huggingface.co/bzantium/tiny-deepseek-v3) + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. 
+ + + Args: + vocab_size (`int`, *optional*, defaults to 129280): + Vocabulary size of the Deep model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`DeepseekV3Model`] + hidden_size (`int`, *optional*, defaults to 7168): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 18432): + Dimension of the MLP representations. + moe_intermediate_size (`int`, *optional*, defaults to 2048): + Dimension of the MoE representations. + num_hidden_layers (`int`, *optional*, defaults to 61): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 128): + Number of attention heads for each attention layer in the Transformer decoder. + num_key_value_heads (`int`, *optional*, defaults to 128): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details, check out [this + paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to + `num_attention_heads`. + n_shared_experts (`int`, *optional*, defaults to 1): + Number of shared experts. + n_routed_experts (`int`, *optional*, defaults to 256): + Number of routed experts. + routed_scaling_factor (`float`, *optional*, defaults to 2.5): + Scaling factor or routed experts. + kv_lora_rank (`int`, *optional*, defaults to 512): + Rank of the LoRA matrices for key and value projections. + q_lora_rank (`int`, *optional*, defaults to 1536): + Rank of the LoRA matrices for query projections. + qk_rope_head_dim (`int`, *optional*, defaults to 64): + Dimension of the query/key heads that use rotary position embeddings. + v_head_dim (`int`, *optional*, defaults to 128): + Dimension of the value heads. + qk_nope_head_dim (`int`, *optional*, defaults to 128): + Dimension of the query/key heads that don't use rotary position embeddings. + n_group (`int`, *optional*, defaults to 8): + Number of groups for routed experts. + topk_group (`int`, *optional*, defaults to 4): + Number of selected groups for each token(for each token, ensuring the selected experts is only within `topk_group` groups). + num_experts_per_tok (`int`, *optional*, defaults to 8): + Number of selected experts, None means dense model. + first_k_dense_replace (`int`, *optional*, defaults to 3): + Number of dense layers in shallow layers(embed->dense->dense->...->dense->moe->moe...->lm_head). + \--k dense layers--/ + norm_topk_prob (`bool`, *optional*, defaults to `True`): + Whether to normalize the weights of the routed experts. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 4096): + The maximum sequence length that this model might ever be used with. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. 
+ use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + pad_token_id (`int`, *optional*): + Padding token id. + bos_token_id (`int`, *optional*, defaults to 0): + Beginning of stream token id. + eos_token_id (`int`, *optional*, defaults to 1): + End of stream token id. + pretraining_tp (`int`, *optional*, defaults to 1): + Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this + document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is + necessary to ensure exact reproducibility of the pretraining results. Please refer to [this + issue](https://github.com/pytorch/pytorch/issues/76232). + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + rope_scaling (`Dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling + strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is + `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update + `max_position_embeddings` to the expected new maximum. + rope_interleave (`bool`, *optional*, defaults to `True`): + Whether to interleave the rotary position embeddings. + attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): + Whether to use a bias in the query, key, value and output projection layers during self-attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. 
+ + ```python + >>> from transformers import DeepseekV3Model, DeepseekV3Config + + >>> # Initializing a Deepseek-V3 style configuration + >>> configuration = DeepseekV3Config() + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "deepseek_v3" + keys_to_ignore_at_inference = ["past_key_values"] + base_model_tp_plan = { # TODO: only replicate attention layers when > first_k_dense_replace + "layers.*.mlp.experts.*.gate_proj": "local_colwise", + "layers.*.mlp.experts.*.up_proj": "local_colwise", + "layers.*.mlp.experts.*.down_proj": "local_rowwise", + "layers.*.mlp.experts.*": "local", # each expert is wrapped in a module list + "layers.*.mlp.shared_experts.gate_proj": "local_colwise", + "layers.*.mlp.shared_experts.up_proj": "local_colwise", + "layers.*.mlp.shared_experts.down_proj": "local_rowwise", + "layers.*.mlp.shared_experts": "local", + "layers.*.mlp.gate_proj": "local_colwise", + "layers.*.mlp.up_proj": "local_colwise", + "layers.*.mlp.down_proj": "local_rowwise", + "layers.*.mlp": "gather", # This is the only moment where results are gathered + } + base_model_pp_plan = { + "embed_tokens": (["input_ids"], ["inputs_embeds"]), + "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), + "norm": (["hidden_states"], ["hidden_states"]), + } + + def __init__( + self, + vocab_size=129280, + hidden_size=7168, + intermediate_size=18432, + moe_intermediate_size=2048, + num_hidden_layers=61, + num_attention_heads=128, + num_key_value_heads=128, + n_shared_experts=1, + n_routed_experts=256, + routed_scaling_factor=2.5, + kv_lora_rank=512, + q_lora_rank=1536, + qk_rope_head_dim=64, + v_head_dim=128, + qk_nope_head_dim=128, + n_group=8, + topk_group=4, + num_experts_per_tok=8, + first_k_dense_replace=3, + norm_topk_prob=True, + hidden_act="silu", + max_position_embeddings=4096, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=None, + bos_token_id=0, + eos_token_id=1, + pretraining_tp=1, + tie_word_embeddings=False, + rope_theta=10000.0, + rope_scaling=None, + rope_interleave=True, + attention_bias=False, + attention_dropout=0.0, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.moe_intermediate_size = moe_intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.n_shared_experts = n_shared_experts + self.n_routed_experts = n_routed_experts + self.routed_scaling_factor = routed_scaling_factor + self.kv_lora_rank = kv_lora_rank + self.q_lora_rank = q_lora_rank + self.qk_rope_head_dim = qk_rope_head_dim + self.v_head_dim = v_head_dim + self.qk_nope_head_dim = qk_nope_head_dim + self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim + self.head_dim = qk_rope_head_dim + self.n_group = n_group + self.topk_group = topk_group + self.num_experts_per_tok = num_experts_per_tok + self.first_k_dense_replace = first_k_dense_replace + self.norm_topk_prob = norm_topk_prob + self.rope_interleave = rope_interleave + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.pretraining_tp = pretraining_tp + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + 
self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + # Validate the correctness of rotary position embeddings parameters + # BC: if there is a 'type' field, copy it it to 'rope_type'. + if self.rope_scaling is not None and "type" in self.rope_scaling: + self.rope_scaling["rope_type"] = self.rope_scaling["type"] + + if self.rope_scaling is not None: + for key in ["beta_fast", "beta_slow", "factor"]: + if key in self.rope_scaling: + self.rope_scaling[key] = float(self.rope_scaling[key]) + + rope_config_validation(self) + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + +__all__ = ["DeepseekV3Config"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_v3/modeling_deepseek_v3.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_v3/modeling_deepseek_v3.py new file mode 100644 index 0000000000000000000000000000000000000000..c4552fb218eec0550322702103d645ac286fa0b0 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_v3/modeling_deepseek_v3.py @@ -0,0 +1,693 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/deepseek_v3/modular_deepseek_v3.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_deepseek_v3.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +import math +from typing import Callable, Optional, Union + +import torch +import torch.nn.functional as F +from torch import nn + +from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache +from ...generation import GenerationMixin +from ...integrations import use_kernel_forward_from_hub +from ...masking_utils import create_causal_mask +from ...modeling_flash_attention_utils import FlashAttentionKwargs +from ...modeling_layers import ( + GenericForSequenceClassification, + GenericForTokenClassification, + GradientCheckpointingLayer, +) +from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast +from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...processing_utils import Unpack +from ...utils import TransformersKwargs, auto_docstring, can_return_tuple +from ...utils.deprecation import deprecate_kwarg +from ...utils.generic import check_model_inputs +from .configuration_deepseek_v3 import DeepseekV3Config + + +@use_kernel_forward_from_hub("RMSNorm") +class DeepseekV3RMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + DeepseekV3RMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + def extra_repr(self): + return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" + + +class DeepseekV3RotaryEmbedding(nn.Module): + inv_freq: torch.Tensor # fix linting for 
`register_buffer` + + def __init__(self, config: DeepseekV3Config, device=None): + super().__init__() + # BC: "rope_type" was originally "type" + if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): + self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) + else: + self.rope_type = "default" + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + + inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = self.inv_freq + + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) + def forward(self, x, position_ids): + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) + position_ids_expanded = position_ids[:, None, :].float() + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() * self.attention_scaling + sin = emb.sin() * self.attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +class DeepseekV3MLP(nn.Module): + def __init__(self, config, hidden_size=None, intermediate_size=None): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size if hidden_size is None else hidden_size + self.intermediate_size = config.intermediate_size if intermediate_size is None else intermediate_size + + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + return down_proj + + +class DeepseekV3TopkRouter(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.top_k = config.num_experts_per_tok + self.n_routed_experts = config.n_routed_experts + self.routed_scaling_factor = config.routed_scaling_factor + self.n_group = config.n_group + self.topk_group = config.topk_group + self.norm_topk_prob = config.norm_topk_prob + + self.weight = nn.Parameter(torch.empty((self.n_routed_experts, config.hidden_size))) + self.register_buffer("e_score_correction_bias", torch.zeros(self.n_routed_experts)) + + @torch.no_grad() + def get_topk_indices(self, scores): + scores_for_choice = scores.view(-1, self.n_routed_experts) + self.e_score_correction_bias.unsqueeze(0) + group_scores = ( + scores_for_choice.view(-1, self.n_group, self.n_routed_experts // self.n_group) + .topk(2, dim=-1)[0] + .sum(dim=-1) + ) + group_idx = torch.topk(group_scores, k=self.topk_group, dim=-1, sorted=False)[1] + group_mask = torch.zeros_like(group_scores) + group_mask.scatter_(1, group_idx, 1) + score_mask = ( + group_mask.unsqueeze(-1) + .expand(-1, self.n_group, self.n_routed_experts // self.n_group) + .reshape(-1, self.n_routed_experts) + ) + scores_for_choice = scores_for_choice.masked_fill(~score_mask.bool(), 0.0) + topk_indices = 
torch.topk(scores_for_choice, k=self.top_k, dim=-1, sorted=False)[1] + return topk_indices + + def forward(self, hidden_states): + hidden_states = hidden_states.view(-1, self.config.hidden_size) + router_logits = F.linear(hidden_states.type(torch.float32), self.weight.type(torch.float32)) + scores = router_logits.sigmoid() + topk_indices = self.get_topk_indices(scores) + topk_weights = scores.gather(1, topk_indices) + if self.norm_topk_prob: + denominator = topk_weights.sum(dim=-1, keepdim=True) + 1e-20 + topk_weights /= denominator + topk_weights = topk_weights * self.routed_scaling_factor + return topk_indices, topk_weights + + +class DeepseekV3MoE(nn.Module): + """ + A mixed expert module containing shared experts. + """ + + def __init__(self, config): + super().__init__() + self.config = config + self.experts = nn.ModuleList( + [ + DeepseekV3MLP(config, intermediate_size=config.moe_intermediate_size) + for _ in range(config.n_routed_experts) + ] + ) + self.gate = DeepseekV3TopkRouter(config) + self.shared_experts = DeepseekV3MLP( + config=config, intermediate_size=config.moe_intermediate_size * config.n_shared_experts + ) + + def moe(self, hidden_states: torch.Tensor, topk_indices: torch.Tensor, topk_weights: torch.Tensor): + r""" + CALL FOR CONTRIBUTION! I don't have time to optimise this right now, but expert weights need to be fused + to not have to do a loop here (deepseek has 256 experts soooo yeah). + """ + final_hidden_states = torch.zeros_like(hidden_states, dtype=topk_weights.dtype) + expert_mask = torch.nn.functional.one_hot(topk_indices, num_classes=len(self.experts)) + expert_mask = expert_mask.permute(2, 0, 1) + + for expert_idx in range(len(self.experts)): + expert = self.experts[expert_idx] + mask = expert_mask[expert_idx] + token_indices, weight_indices = torch.where(mask) + + if token_indices.numel() > 0: + expert_weights = topk_weights[token_indices, weight_indices] + expert_input = hidden_states[token_indices] + expert_output = expert(expert_input) + weighted_output = expert_output * expert_weights.unsqueeze(-1) + final_hidden_states.index_add_(0, token_indices, weighted_output) + + # in original deepseek, the output of the experts are gathered once we leave this module + # thus the moe module is itelsf an IsolatedParallel module + # and all expert are "local" meaning we shard but we don't gather + return final_hidden_states.type(hidden_states.dtype) + + def forward(self, hidden_states): + residuals = hidden_states + orig_shape = hidden_states.shape + topk_indices, topk_weights = self.gate(hidden_states) + hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) + hidden_states = self.moe(hidden_states, topk_indices, topk_weights).view(*orig_shape) + hidden_states = hidden_states + self.shared_experts(residuals) + return hidden_states + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. 
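The `DeepseekV3TopkRouter.get_topk_indices` method above implements group-limited routing: experts are partitioned into `n_group` groups, each group is scored by the sum of its two best bias-corrected expert scores, only the `topk_group` best groups survive, and the final `top_k` experts are picked inside those surviving groups. A standalone toy re-run of that selection with made-up sizes (no learned weights, random scores):

```python
import torch

n_routed_experts, n_group, topk_group, top_k = 8, 4, 2, 2
scores = torch.rand(3, n_routed_experts)              # (tokens, experts), as if from sigmoid
e_score_correction_bias = torch.zeros(n_routed_experts)

scores_for_choice = scores + e_score_correction_bias.unsqueeze(0)

# Stage 1: score each group by its top-2 experts and keep the best topk_group groups.
group_scores = (
    scores_for_choice.view(-1, n_group, n_routed_experts // n_group)
    .topk(2, dim=-1)[0]
    .sum(dim=-1)
)                                                      # (tokens, n_group)
group_idx = torch.topk(group_scores, k=topk_group, dim=-1, sorted=False)[1]
group_mask = torch.zeros_like(group_scores).scatter_(1, group_idx, 1)

# Stage 2: mask out experts in discarded groups, then take the per-token top_k.
score_mask = (
    group_mask.unsqueeze(-1)
    .expand(-1, n_group, n_routed_experts // n_group)
    .reshape(-1, n_routed_experts)
)
masked = scores_for_choice.masked_fill(~score_mask.bool(), 0.0)
topk_indices = torch.topk(masked, k=top_k, dim=-1, sorted=False)[1]
print(topk_indices.shape)  # torch.Size([3, 2])
```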
+ unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + dropout: float = 0.0, + **kwargs: Unpack[TransformersKwargs], +): + key_states = repeat_kv(key, module.num_key_value_groups) + value_states = repeat_kv(value, module.num_key_value_groups) + + attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling + if attention_mask is not None: + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + attn_output = torch.matmul(attn_weights, value_states) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +def apply_rotary_pos_emb_interleave(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + r""" + TODO let's just use the original freqcis computation to not have the view + transpose + reshape! This is not optimized! + Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`): + The position indices of the tokens corresponding to the query and key tensors. For example, this can be + used to pass offsetted position ids when working with a KV-cache. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. 
For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + + b, h, s, d = q.shape + q = q.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d) + + b, h, s, d = k.shape + k = k.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d) + + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +def yarn_get_mscale(scale=1, mscale=1): + if scale <= 1: + return 1.0 + return 0.1 * mscale * math.log(scale) + 1.0 + + +class DeepseekV3Attention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: DeepseekV3Config, layer_idx: int): + super().__init__() + self.config = config + self.layer_idx = layer_idx + self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads + self.attention_dropout = config.attention_dropout + self.num_heads = config.num_attention_heads + self.rope_theta = config.rope_theta + self.q_lora_rank = config.q_lora_rank + self.qk_rope_head_dim = config.qk_rope_head_dim + self.kv_lora_rank = config.kv_lora_rank + self.v_head_dim = config.v_head_dim + self.qk_nope_head_dim = config.qk_nope_head_dim + self.qk_head_dim = config.qk_head_dim + + self.is_causal = True + if self.q_lora_rank is None: + self.q_proj = nn.Linear(config.hidden_size, self.num_heads * self.qk_head_dim, bias=False) + else: + self.q_a_proj = nn.Linear(config.hidden_size, config.q_lora_rank, bias=config.attention_bias) + self.q_a_layernorm = DeepseekV3RMSNorm(config.q_lora_rank) + self.q_b_proj = nn.Linear(config.q_lora_rank, self.num_heads * self.qk_head_dim, bias=False) + + self.kv_a_proj_with_mqa = nn.Linear( + config.hidden_size, + self.kv_lora_rank + self.qk_rope_head_dim, + bias=config.attention_bias, + ) + self.kv_a_layernorm = DeepseekV3RMSNorm(self.kv_lora_rank) + self.kv_b_proj = nn.Linear( + self.kv_lora_rank, + self.num_heads * (self.qk_nope_head_dim + self.v_head_dim), + bias=False, + ) + + self.o_proj = nn.Linear( + self.num_heads * self.v_head_dim, + config.hidden_size, + bias=config.attention_bias, + ) + + self.scaling = self.qk_head_dim ** (-0.5) + if self.config.rope_scaling is not None: + mscale_all_dim = self.config.rope_scaling.get("mscale_all_dim", 0) + scaling_factor = self.config.rope_scaling["factor"] + if mscale_all_dim: + mscale = yarn_get_mscale(scaling_factor, mscale_all_dim) + self.scaling = self.scaling * mscale * mscale + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + attention_mask: Optional[torch.Tensor], + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[FlashAttentionKwargs], + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: + batch_size, seq_length = hidden_states.shape[:-1] + query_shape = (batch_size, seq_length, -1, self.qk_head_dim) + 
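When YaRN-style `rope_scaling` is configured, the attention constructor above does not use the plain `1/sqrt(qk_head_dim)` softmax scale: it multiplies it by `mscale**2`, where `mscale = yarn_get_mscale(factor, mscale_all_dim)`. A small numeric sketch with assumed, purely illustrative scaling values:

```python
import math

def yarn_get_mscale(scale=1, mscale=1):
    if scale <= 1:
        return 1.0
    return 0.1 * mscale * math.log(scale) + 1.0

qk_head_dim = 192                    # 128 (nope) + 64 (rope), the default head dims
factor, mscale_all_dim = 40.0, 1.0   # illustrative rope_scaling values, not from a checkpoint

scaling = qk_head_dim ** -0.5        # ~0.072
mscale = yarn_get_mscale(factor, mscale_all_dim)
scaling = scaling * mscale * mscale  # ~0.135 with the values above
print(mscale, scaling)
```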
key_shape = (batch_size, seq_length, -1, self.qk_nope_head_dim + self.v_head_dim) + + if self.q_lora_rank is None: + q_states = self.q_proj(hidden_states) + else: + q_states = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states))) + q_states = q_states.view(query_shape).transpose(1, 2) + q_pass, q_rot = torch.split(q_states, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1) + + compressed_kv = self.kv_a_proj_with_mqa(hidden_states) + k_pass, k_rot = torch.split(compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1) + + k_pass = self.kv_b_proj(self.kv_a_layernorm(k_pass)).view(key_shape).transpose(1, 2) + k_pass, value_states = torch.split(k_pass, [self.qk_nope_head_dim, self.v_head_dim], dim=-1) + + k_rot = k_rot.view(batch_size, 1, seq_length, self.qk_rope_head_dim) + + cos, sin = position_embeddings + if self.config.rope_interleave: # support using interleaved weights for efficiency + q_rot, k_rot = apply_rotary_pos_emb_interleave(q_rot, k_rot, cos, sin) + else: + q_rot, k_rot = apply_rotary_pos_emb(q_rot, k_rot, cos, sin) + k_rot = k_rot.expand(*k_pass.shape[:-1], -1) + + query_states = torch.cat((q_pass, q_rot), dim=-1) + key_states = torch.cat((k_pass, k_rot), dim=-1) + + if past_key_values is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs) + + if self.config._attn_implementation == "flash_attention_2" and self.qk_head_dim != self.v_head_dim: + value_states = F.pad(value_states, [0, self.qk_head_dim - self.v_head_dim]) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=0.0 if not self.training else self.attention_dropout, + scaling=self.scaling, + **kwargs, + ) + + if self.config._attn_implementation == "flash_attention_2" and self.qk_head_dim != self.v_head_dim: + attn_output = attn_output[:, :, :, : self.v_head_dim] + + attn_output = attn_output.reshape(batch_size, seq_length, -1).contiguous() + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights + + +class DeepseekV3DecoderLayer(GradientCheckpointingLayer): + def __init__(self, config: DeepseekV3Config, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + + self.self_attn = DeepseekV3Attention(config=config, layer_idx=layer_idx) + + if layer_idx >= config.first_k_dense_replace: + self.mlp = DeepseekV3MoE(config) + else: + self.mlp = DeepseekV3MLP(config) + + self.input_layernorm = DeepseekV3RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = DeepseekV3RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + **kwargs: Unpack[TransformersKwargs], + ) -> 
torch.Tensor: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + # Self Attention + hidden_states, _ = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + **kwargs, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + return hidden_states + + +@auto_docstring +class DeepseekV3PreTrainedModel(PreTrainedModel): + config: DeepseekV3Config + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["DeepseekV3DecoderLayer"] + _skip_keys_device_placement = ["past_key_values"] + _supports_flash_attn = True + _supports_sdpa = True + _supports_flex_attn = True + _can_compile_fullgraph = False + _supports_attention_backend = True + _can_record_outputs = { + "hidden_states": DeepseekV3DecoderLayer, + "attentions": DeepseekV3Attention, + } + + def _init_weights(self, module): + super()._init_weights(module) + if isinstance(module, DeepseekV3TopkRouter): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + + +@auto_docstring +class DeepseekV3Model(DeepseekV3PreTrainedModel): + _keys_to_ignore_on_load_unexpected = [r"model\.layers\.61.*"] + + def __init__(self, config: DeepseekV3Config): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList( + [DeepseekV3DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.norm = DeepseekV3RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.rotary_emb = DeepseekV3RotaryEmbedding(config=config) + self.gradient_checkpointing = False + + # Initialize weights and apply final processing + self.post_init() + + @check_model_inputs + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + cache_position: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> BaseModelOutputWithPast: + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds: torch.Tensor = self.embed_tokens(input_ids) + + if use_cache and past_key_values is None: + past_key_values = DynamicCache(config=self.config) + + if cache_position is None: + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + cache_position: torch.Tensor = torch.arange( + past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device + ) + + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + causal_mask = create_causal_mask( + config=self.config, + input_embeds=inputs_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + past_key_values=past_key_values, + position_ids=position_ids, + ) + + hidden_states = inputs_embeds + 
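In the attention forward above, each query head carries a non-rotary slice plus a rotary slice, while the compressed KV projection yields a low-rank latent (later expanded by `kv_b_proj`) and a single shared rotary key that gets broadcast across heads. A shape walk-through with the default dimensions, using random stand-in tensors instead of the learned projections:

```python
import torch

batch_size, seq_length, num_heads = 2, 5, 128
qk_nope_head_dim, qk_rope_head_dim, v_head_dim = 128, 64, 128
qk_head_dim = qk_nope_head_dim + qk_rope_head_dim
kv_lora_rank = 512

# Stand-in for q_proj / q_b_proj output, then the nope/rope split per head.
q_states = torch.randn(batch_size, seq_length, num_heads * qk_head_dim)
q_states = q_states.view(batch_size, seq_length, -1, qk_head_dim).transpose(1, 2)
q_pass, q_rot = torch.split(q_states, [qk_nope_head_dim, qk_rope_head_dim], dim=-1)

# Stand-in for kv_a_proj_with_mqa output: low-rank latent + one shared rotary key.
compressed_kv = torch.randn(batch_size, seq_length, kv_lora_rank + qk_rope_head_dim)
k_pass, k_rot = torch.split(compressed_kv, [kv_lora_rank, qk_rope_head_dim], dim=-1)

print(q_pass.shape)  # torch.Size([2, 128, 5, 128])
print(q_rot.shape)   # torch.Size([2, 128, 5, 64])
print(k_rot.shape)   # torch.Size([2, 5, 64]); viewed as (2, 1, 5, 64) and broadcast to all heads
```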
position_embeddings = self.rotary_emb(hidden_states, position_ids) + + for decoder_layer in self.layers[: self.config.num_hidden_layers]: + hidden_states = decoder_layer( + hidden_states, + attention_mask=causal_mask, + position_ids=position_ids, + past_key_values=past_key_values, + cache_position=cache_position, + position_embeddings=position_embeddings, + **kwargs, + ) + + hidden_states = self.norm(hidden_states) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=past_key_values, + ) + + +@auto_docstring +class DeepseekV3ForCausalLM(DeepseekV3PreTrainedModel, GenerationMixin): + _tied_weights_keys = ["lm_head.weight"] + _tp_plan = {"lm_head": "colwise_rep"} + _pp_plan = {"lm_head": (["hidden_states"], ["logits"])} + + def __init__(self, config): + super().__init__(config) + self.model = DeepseekV3Model(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **kwargs: Unpack[TransformersKwargs], + ) -> CausalLMOutputWithPast: + r""" + Example: + + ```python + >>> from transformers import AutoTokenizer, DeepseekV3ForCausalLM + + >>> model = DeepseekV3ForCausalLM.from_pretrained("meta-deepseek_v3/DeepseekV3-2-7b-hf") + >>> tokenizer = AutoTokenizer.from_pretrained("meta-deepseek_v3/DeepseekV3-2-7b-hf") + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." 
+ ```""" + outputs: BaseModelOutputWithPast = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs.last_hidden_state + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) + + loss = None + if labels is not None: + loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs) + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class DeepseekV3ForSequenceClassification(GenericForSequenceClassification, DeepseekV3PreTrainedModel): + pass + + +class DeepseekV3ForTokenClassification(GenericForTokenClassification, DeepseekV3PreTrainedModel): + pass + + +__all__ = [ + "DeepseekV3PreTrainedModel", + "DeepseekV3Model", + "DeepseekV3ForCausalLM", + "DeepseekV3ForSequenceClassification", + "DeepseekV3ForTokenClassification", +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_v3/modular_deepseek_v3.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_v3/modular_deepseek_v3.py new file mode 100644 index 0000000000000000000000000000000000000000..fc3dc0c4ce3b382213917f215171d1bffaf914bc --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_v3/modular_deepseek_v3.py @@ -0,0 +1,373 @@ +import math +from typing import Callable, Optional + +import torch +import torch.nn.functional as F +from torch import nn + +from ...activations import ACT2FN +from ...cache_utils import Cache +from ...modeling_flash_attention_utils import FlashAttentionKwargs +from ...modeling_layers import GenericForSequenceClassification, GenericForTokenClassification +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...processing_utils import Unpack +from ...utils import logging +from ...utils.deprecation import deprecate_kwarg +from ..llama.modeling_llama import ( + LlamaDecoderLayer, + LlamaForCausalLM, + LlamaModel, + LlamaPreTrainedModel, + LlamaRMSNorm, + LlamaRotaryEmbedding, + apply_rotary_pos_emb, + eager_attention_forward, + rotate_half, +) +from .configuration_deepseek_v3 import DeepseekV3Config + + +logger = logging.get_logger(__name__) + + +class DeepseekV3RMSNorm(LlamaRMSNorm): + pass + + +class DeepseekV3RotaryEmbedding(LlamaRotaryEmbedding): + pass + + +def apply_rotary_pos_emb_interleave(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + r""" + TODO let's just use the original freqcis computation to not have the view + transpose + reshape! This is not optimized! + Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`): + The position indices of the tokens corresponding to the query and key tensors. For example, this can be + used to pass offsetted position ids when working with a KV-cache. 
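The `DeepseekV3ForCausalLM.forward` above only projects the last `logits_to_keep` positions through `lm_head` when no loss is being computed, which saves memory during generation. A small sketch of that slicing with random stand-in tensors (the default of `0` keeps every position):

```python
import torch

hidden_states = torch.randn(2, 10, 16)        # (batch, seq, hidden), stand-in for the model output
lm_head = torch.nn.Linear(16, 32, bias=False)  # stand-in vocabulary projection

logits_to_keep = 1  # generation typically only needs the last position
slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
logits = lm_head(hidden_states[:, slice_indices, :])
print(logits.shape)  # torch.Size([2, 1, 32])
```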
+ unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + + b, h, s, d = q.shape + q = q.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d) + + b, h, s, d = k.shape + k = k.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d) + + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +def yarn_get_mscale(scale=1, mscale=1): + if scale <= 1: + return 1.0 + return 0.1 * mscale * math.log(scale) + 1.0 + + +class DeepseekV3MLP(nn.Module): + def __init__(self, config, hidden_size=None, intermediate_size=None): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size if hidden_size is None else hidden_size + self.intermediate_size = config.intermediate_size if intermediate_size is None else intermediate_size + + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + return down_proj + + +class DeepseekV3TopkRouter(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.top_k = config.num_experts_per_tok + self.n_routed_experts = config.n_routed_experts + self.routed_scaling_factor = config.routed_scaling_factor + self.n_group = config.n_group + self.topk_group = config.topk_group + self.norm_topk_prob = config.norm_topk_prob + + self.weight = nn.Parameter(torch.empty((self.n_routed_experts, config.hidden_size))) + self.register_buffer("e_score_correction_bias", torch.zeros(self.n_routed_experts)) + + @torch.no_grad() + def get_topk_indices(self, scores): + scores_for_choice = scores.view(-1, self.n_routed_experts) + self.e_score_correction_bias.unsqueeze(0) + group_scores = ( + scores_for_choice.view(-1, self.n_group, self.n_routed_experts // self.n_group) + .topk(2, dim=-1)[0] + .sum(dim=-1) + ) + group_idx = torch.topk(group_scores, k=self.topk_group, dim=-1, sorted=False)[1] + group_mask = torch.zeros_like(group_scores) + group_mask.scatter_(1, group_idx, 1) + score_mask = ( + group_mask.unsqueeze(-1) + .expand(-1, self.n_group, self.n_routed_experts // self.n_group) + .reshape(-1, self.n_routed_experts) + ) + scores_for_choice = scores_for_choice.masked_fill(~score_mask.bool(), 0.0) + topk_indices = torch.topk(scores_for_choice, k=self.top_k, dim=-1, sorted=False)[1] + return topk_indices + + def forward(self, hidden_states): + hidden_states = hidden_states.view(-1, self.config.hidden_size) + router_logits = 
F.linear(hidden_states.type(torch.float32), self.weight.type(torch.float32)) + scores = router_logits.sigmoid() + topk_indices = self.get_topk_indices(scores) + topk_weights = scores.gather(1, topk_indices) + if self.norm_topk_prob: + denominator = topk_weights.sum(dim=-1, keepdim=True) + 1e-20 + topk_weights /= denominator + topk_weights = topk_weights * self.routed_scaling_factor + return topk_indices, topk_weights + + +class DeepseekV3MoE(nn.Module): + """ + A mixed expert module containing shared experts. + """ + + def __init__(self, config): + super().__init__() + self.config = config + self.experts = nn.ModuleList( + [ + DeepseekV3MLP(config, intermediate_size=config.moe_intermediate_size) + for _ in range(config.n_routed_experts) + ] + ) + self.gate = DeepseekV3TopkRouter(config) + self.shared_experts = DeepseekV3MLP( + config=config, intermediate_size=config.moe_intermediate_size * config.n_shared_experts + ) + + def moe(self, hidden_states: torch.Tensor, topk_indices: torch.Tensor, topk_weights: torch.Tensor): + r""" + CALL FOR CONTRIBUTION! I don't have time to optimise this right now, but expert weights need to be fused + to not have to do a loop here (deepseek has 256 experts soooo yeah). + """ + final_hidden_states = torch.zeros_like(hidden_states, dtype=topk_weights.dtype) + expert_mask = torch.nn.functional.one_hot(topk_indices, num_classes=len(self.experts)) + expert_mask = expert_mask.permute(2, 0, 1) + + for expert_idx in range(len(self.experts)): + expert = self.experts[expert_idx] + mask = expert_mask[expert_idx] + token_indices, weight_indices = torch.where(mask) + + if token_indices.numel() > 0: + expert_weights = topk_weights[token_indices, weight_indices] + expert_input = hidden_states[token_indices] + expert_output = expert(expert_input) + weighted_output = expert_output * expert_weights.unsqueeze(-1) + final_hidden_states.index_add_(0, token_indices, weighted_output) + + # in original deepseek, the output of the experts are gathered once we leave this module + # thus the moe module is itelsf an IsolatedParallel module + # and all expert are "local" meaning we shard but we don't gather + return final_hidden_states.type(hidden_states.dtype) + + def forward(self, hidden_states): + residuals = hidden_states + orig_shape = hidden_states.shape + topk_indices, topk_weights = self.gate(hidden_states) + hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) + hidden_states = self.moe(hidden_states, topk_indices, topk_weights).view(*orig_shape) + hidden_states = hidden_states + self.shared_experts(residuals) + return hidden_states + + +class DeepseekV3Attention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: DeepseekV3Config, layer_idx: int): + super().__init__() + self.config = config + self.layer_idx = layer_idx + self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads + self.attention_dropout = config.attention_dropout + self.num_heads = config.num_attention_heads + self.rope_theta = config.rope_theta + self.q_lora_rank = config.q_lora_rank + self.qk_rope_head_dim = config.qk_rope_head_dim + self.kv_lora_rank = config.kv_lora_rank + self.v_head_dim = config.v_head_dim + self.qk_nope_head_dim = config.qk_nope_head_dim + self.qk_head_dim = config.qk_head_dim + + self.is_causal = True + if self.q_lora_rank is None: + self.q_proj = nn.Linear(config.hidden_size, self.num_heads * self.qk_head_dim, bias=False) + else: + self.q_a_proj = nn.Linear(config.hidden_size, 
config.q_lora_rank, bias=config.attention_bias) + self.q_a_layernorm = DeepseekV3RMSNorm(config.q_lora_rank) + self.q_b_proj = nn.Linear(config.q_lora_rank, self.num_heads * self.qk_head_dim, bias=False) + + self.kv_a_proj_with_mqa = nn.Linear( + config.hidden_size, + self.kv_lora_rank + self.qk_rope_head_dim, + bias=config.attention_bias, + ) + self.kv_a_layernorm = DeepseekV3RMSNorm(self.kv_lora_rank) + self.kv_b_proj = nn.Linear( + self.kv_lora_rank, + self.num_heads * (self.qk_nope_head_dim + self.v_head_dim), + bias=False, + ) + + self.o_proj = nn.Linear( + self.num_heads * self.v_head_dim, + config.hidden_size, + bias=config.attention_bias, + ) + + self.scaling = self.qk_head_dim ** (-0.5) + if self.config.rope_scaling is not None: + mscale_all_dim = self.config.rope_scaling.get("mscale_all_dim", 0) + scaling_factor = self.config.rope_scaling["factor"] + if mscale_all_dim: + mscale = yarn_get_mscale(scaling_factor, mscale_all_dim) + self.scaling = self.scaling * mscale * mscale + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + attention_mask: Optional[torch.Tensor], + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[FlashAttentionKwargs], + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: + batch_size, seq_length = hidden_states.shape[:-1] + query_shape = (batch_size, seq_length, -1, self.qk_head_dim) + key_shape = (batch_size, seq_length, -1, self.qk_nope_head_dim + self.v_head_dim) + + if self.q_lora_rank is None: + q_states = self.q_proj(hidden_states) + else: + q_states = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states))) + q_states = q_states.view(query_shape).transpose(1, 2) + q_pass, q_rot = torch.split(q_states, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1) + + compressed_kv = self.kv_a_proj_with_mqa(hidden_states) + k_pass, k_rot = torch.split(compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1) + + k_pass = self.kv_b_proj(self.kv_a_layernorm(k_pass)).view(key_shape).transpose(1, 2) + k_pass, value_states = torch.split(k_pass, [self.qk_nope_head_dim, self.v_head_dim], dim=-1) + + k_rot = k_rot.view(batch_size, 1, seq_length, self.qk_rope_head_dim) + + cos, sin = position_embeddings + if self.config.rope_interleave: # support using interleaved weights for efficiency + q_rot, k_rot = apply_rotary_pos_emb_interleave(q_rot, k_rot, cos, sin) + else: + q_rot, k_rot = apply_rotary_pos_emb(q_rot, k_rot, cos, sin) + k_rot = k_rot.expand(*k_pass.shape[:-1], -1) + + query_states = torch.cat((q_pass, q_rot), dim=-1) + key_states = torch.cat((k_pass, k_rot), dim=-1) + + if past_key_values is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs) + + if self.config._attn_implementation == "flash_attention_2" and self.qk_head_dim != self.v_head_dim: + value_states = F.pad(value_states, [0, self.qk_head_dim - self.v_head_dim]) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + 
key_states, + value_states, + attention_mask, + dropout=0.0 if not self.training else self.attention_dropout, + scaling=self.scaling, + **kwargs, + ) + + if self.config._attn_implementation == "flash_attention_2" and self.qk_head_dim != self.v_head_dim: + attn_output = attn_output[:, :, :, : self.v_head_dim] + + attn_output = attn_output.reshape(batch_size, seq_length, -1).contiguous() + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights + + +class DeepseekV3DecoderLayer(LlamaDecoderLayer): + def __init__(self, config: DeepseekV3Config, layer_idx: int): + nn.Module.__init__(self) + self.hidden_size = config.hidden_size + + self.self_attn = DeepseekV3Attention(config=config, layer_idx=layer_idx) + + if layer_idx >= config.first_k_dense_replace: + self.mlp = DeepseekV3MoE(config) + else: + self.mlp = DeepseekV3MLP(config) + + self.input_layernorm = DeepseekV3RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = DeepseekV3RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + +class DeepseekV3PreTrainedModel(LlamaPreTrainedModel): + _can_compile_fullgraph = False + + def _init_weights(self, module): + PreTrainedModel._init_weights(self, module) + if isinstance(module, DeepseekV3TopkRouter): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + + +class DeepseekV3Model(LlamaModel): + _keys_to_ignore_on_load_unexpected = [r"model\.layers\.61.*"] + + +class DeepseekV3ForCausalLM(LlamaForCausalLM): + pass + + +class DeepseekV3ForSequenceClassification(GenericForSequenceClassification, DeepseekV3PreTrainedModel): + pass + + +class DeepseekV3ForTokenClassification(GenericForTokenClassification, DeepseekV3PreTrainedModel): + pass + + +__all__ = [ + "DeepseekV3PreTrainedModel", + "DeepseekV3Model", + "DeepseekV3ForCausalLM", + "DeepseekV3ForSequenceClassification", + "DeepseekV3ForTokenClassification", +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_vl/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_vl/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2422b31e31050b378f7373dd6ed73830ee1b9f0d --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_vl/__init__.py @@ -0,0 +1,30 @@ +# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
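As the decoder layer constructor above shows (in both the generated and the modular file), `first_k_dense_replace` controls where the dense-to-MoE switch happens: layers with `layer_idx < first_k_dense_replace` keep a plain `DeepseekV3MLP`, and every later layer uses `DeepseekV3MoE`. A quick illustration of the resulting layout with a shortened layer count:

```python
first_k_dense_replace = 3   # DeepSeek-V3 default
num_hidden_layers = 8       # shortened for the example; the full model has 61 layers

layout = [
    "DeepseekV3MLP" if layer_idx < first_k_dense_replace else "DeepseekV3MoE"
    for layer_idx in range(num_hidden_layers)
]
print(layout)
# ['DeepseekV3MLP', 'DeepseekV3MLP', 'DeepseekV3MLP', 'DeepseekV3MoE', 'DeepseekV3MoE', ...]
```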
+from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_deepseek_vl import * + from .image_processing_deepseek_vl import * + from .image_processing_deepseek_vl_fast import * + from .modeling_deepseek_vl import * + from .processing_deepseek_vl import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_vl/configuration_deepseek_vl.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_vl/configuration_deepseek_vl.py new file mode 100644 index 0000000000000000000000000000000000000000..b3abae5af0a7f7613432e71ef13a1c3b9774176c --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_vl/configuration_deepseek_vl.py @@ -0,0 +1,97 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/deepseek_vl/modular_deepseek_vl.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_deepseek_vl.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional + +from ...configuration_utils import PretrainedConfig +from ...utils import logging +from ..auto import CONFIG_MAPPING, AutoConfig + + +logger = logging.get_logger(__name__) + + +class DeepseekVLConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`DeepseekVLModel`]. It is used to instantiate a + DeepseekVL model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the DeepseekVL + [deepseek-community/deepseek-vl-1.3b-chat](https://huggingface.co/deepseek-community/deepseek-vl-1.3b-chat) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `LlamaConfig`): + The config object or dictionary of the text backbone. + vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `SiglipVisionConfig`): + The config object or dictionary of the vision backbone. + image_token_id (`int`, *optional*, defaults to 100015): + The index representing image tokens in the model's token vocabulary. 
+ + Example: + + ```python + >>> from transformers import DeepseekVLConfig, DeepseekVLModel + + >>> # Initializing a DeepseekVL deepseek-community/deepseek-vl-1.3b-chat style configuration + >>> configuration = DeepseekVLConfig() + + >>> # Initializing a model (with random weights) from the deepseek-community/deepseek-vl-1.3b-chat style configuration + >>> model = DeepseekVLModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "deepseek_vl" + sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} + + def __init__( + self, + text_config: Optional[AutoConfig] = None, + vision_config: Optional[AutoConfig] = None, + image_token_id: int = 100015, + **kwargs, + ): + super().__init__(**kwargs) + + if text_config is None: + text_config = {} + logger.info("`text_config` is `None`. Initializing the `LlamaConfig` with default values.") + + if vision_config is None: + vision_config = {} + logger.info("`vision_config` is `None`. Initializing the `SiglipVisionConfig` with default values.") + + if isinstance(text_config, dict): + text_config["model_type"] = text_config.get("model_type", "llama") + text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) + + if isinstance(vision_config, dict): + vision_config["model_type"] = vision_config.get("model_type", "siglip_vision_model") + vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) + + self.text_config = text_config + self.vision_config = vision_config + self.image_token_id = image_token_id + + +__all__ = ["DeepseekVLConfig"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_vl/image_processing_deepseek_vl.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_vl/image_processing_deepseek_vl.py new file mode 100644 index 0000000000000000000000000000000000000000..12aa7caf892e52262c5d612551a19e8f9bd4f58d --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_vl/image_processing_deepseek_vl.py @@ -0,0 +1,425 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/deepseek_vl/modular_deepseek_vl.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_deepseek_vl.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
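The `DeepseekVLConfig` constructor above accepts either config objects or plain dicts for the two backbones; dicts are promoted through `CONFIG_MAPPING`, defaulting to `llama` for the text side and `siglip_vision_model` for the vision side when no `model_type` is given. A hedged sketch of that promotion, assuming `DeepseekVLConfig` is exported by this transformers build and using small illustrative sub-config values:

```python
from transformers import DeepseekVLConfig

cfg = DeepseekVLConfig(
    text_config={"hidden_size": 256, "num_hidden_layers": 2, "num_attention_heads": 4},
    vision_config={"hidden_size": 128, "num_hidden_layers": 2, "num_attention_heads": 4},
)

# Both dicts were turned into full config objects via CONFIG_MAPPING.
print(type(cfg.text_config).__name__)    # LlamaConfig
print(type(cfg.vision_config).__name__)  # SiglipVisionConfig
print(cfg.image_token_id)                # 100015 (default)
```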
+ +from typing import Optional, Union + +import numpy as np + +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ...image_transforms import convert_to_rgb, resize, to_channel_dimension_format +from ...image_utils import ( + OPENAI_CLIP_MEAN, + OPENAI_CLIP_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + get_image_size, + infer_channel_dimension_format, + is_scaled_image, + make_flat_list_of_images, + to_numpy_array, + valid_images, + validate_preprocess_arguments, +) +from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging + + +if is_vision_available(): + import PIL + + +logger = logging.get_logger(__name__) + + +class DeepseekVLImageProcessor(BaseImageProcessor): + r""" + Constructs a DEEPSEEK_VL image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the + `do_resize` parameter in the `preprocess` method. + size (`dict`, *optional*, defaults to `{"height": 384, "width": 384}`): + Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess` + method. + min_size (`int`, *optional*, defaults to 14): + The minimum allowed size for the resized image. Ensures that neither the height nor width + falls below this value after resizing. + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): + Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be + overridden by the `resample` parameter in the `preprocess` method. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the + `do_rescale` parameter in the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Only has an effect if `do_rescale` is set to `True`. Can be + overridden by the `rescale_factor` parameter in the `preprocess` method. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` + method. Can be overridden by the `do_normalize` parameter in the `preprocess` method. + image_mean (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. Can be + overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. + Can be overridden by the `image_std` parameter in the `preprocess` method. + do_convert_rgb (`bool`, *optional*, defaults to `True`): + Whether to convert the image to RGB. + do_pad (`bool`, *optional*, defaults to `True`): + Whether to pad the image to square or not. 
+ """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: Optional[dict[str, int]] = None, + min_size: int = 14, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + do_convert_rgb: Optional[bool] = None, + do_pad: Optional[bool] = True, + **kwargs, + ) -> None: + super().__init__(**kwargs) + size = size if size is not None else {"height": 384, "width": 384} + size = get_size_dict(size, default_to_square=True) + + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN + self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD + self.do_convert_rgb = do_convert_rgb + + self.do_pad = do_pad + self.min_size = min_size + if image_mean is None: + self.background_color = (127, 127, 127) + else: + self.background_color = tuple(int(x * 255) for x in image_mean) + + def resize( + self, + image: np.ndarray, + size: Union[dict[str, int], int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize an image to dynamically calculated size. + + Args: + image (`np.ndarray`): + Image to resize. + size (`dict[str, int]` or `int`): + The size to resize the image to. If a dictionary, it should have the keys `"height"` and `"width"`. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`. + data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `None`: will be inferred from input + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + + Returns: + `np.ndarray`: The resized image. + """ + if input_data_format is None: + input_data_format = infer_channel_dimension_format(image) + + height, width = get_image_size(image, input_data_format) + max_size = max(height, width) + + size = get_size_dict(size, default_to_square=True) + if size["height"] != size["width"]: + raise ValueError( + f"Output height and width must be the same. Got height={size['height']} and width={size['width']}" + ) + size = size["height"] + + delta = size / max_size + # Largest side becomes `size` and the other side is scaled according to the aspect ratio. 
+ output_size_nonpadded = [ + max(int(height * delta), self.min_size), + max(int(width * delta), self.min_size), + ] + + image = resize( + image, + size=output_size_nonpadded, + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + **kwargs, + ) + return image + + @filter_out_non_signature_kwargs() + def preprocess( + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Optional[dict[str, int]] = None, + resample: Optional[PILImageResampling] = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + do_convert_rgb: Optional[bool] = None, + background_color: Optional[Union[int, tuple[int, int, int]]] = None, + do_pad: Optional[bool] = None, + data_format: ChannelDimension = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> PIL.Image.Image: + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_rescale=False`. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`dict[str, int]`, *optional*, defaults to `self.size`): + Controls the size of the image after `resize`. The shortest edge of the image is resized to + `size["shortest_edge"]` whilst preserving the aspect ratio. If the longest edge of this resized image + is > `int(size["shortest_edge"] * (1333 / 800))`, then the image is resized again to make the longest + edge equal to `int(size["shortest_edge"] * (1333 / 800))`. + resample (`PILImageResampling`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image values between [0 - 1]. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`): + Image mean to normalize the image by if `do_normalize` is set to `True`. + image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to normalize the image by if `do_normalize` is set to `True`. + do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): + Whether to convert the image to RGB. + background_color (`tuple[int, int, int]`): + The background color to use for the padding. + do_pad (`bool`, *optional*, defaults to `self.do_pad`): + Whether to pad the image to square or not. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. 
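The `resize` method above preserves aspect ratio: the longest side is scaled to `size` (384 by default) and the shorter side follows the same factor, clamped to `min_size`. Worked numbers for a 480x640 input:

```python
height, width = 480, 640
size, min_size = 384, 14

delta = size / max(height, width)        # 384 / 640 = 0.6
output_size_nonpadded = [
    max(int(height * delta), min_size),  # 288
    max(int(width * delta), min_size),   # 384
]
print(output_size_nonpadded)             # [288, 384]
```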
+ data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + """ + do_resize = do_resize if do_resize is not None else self.do_resize + resample = resample if resample is not None else self.resample + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + do_pad = do_pad if do_pad is not None else self.do_pad + background_color = background_color if background_color is not None else self.background_color + + size = size if size is not None else self.size + size = get_size_dict(size, default_to_square=False) + images = self.fetch_images(images) + images = make_flat_list_of_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + + validate_preprocess_arguments( + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_resize=do_resize, + size=size, + resample=resample, + ) + # PIL RGBA images are converted to RGB + if do_convert_rgb: + images = [convert_to_rgb(image) for image in images] + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if do_rescale and is_scaled_image(images[0]): + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. 
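+            # (The format is inferred once, from the first image, and reused for the whole batch.)
+            # Summary of the steps applied below (descriptive comment): resize (longest side ->
+            # size["height"]) -> pad_to_square -> rescale -> normalize -> to_channel_dimension_format,
+            # each step gated by its corresponding do_* flag.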
+ input_data_format = infer_channel_dimension_format(images[0]) + + if do_resize: + images = [ + self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format) + for image in images + ] + + if do_pad: + # Expand and pad the images to obtain a square image of dimensions `size x size` + images = [ + self.pad_to_square( + image=image, + background_color=background_color, + input_data_format=input_data_format, + ) + for image in images + ] + + if do_rescale: + images = [ + self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) + for image in images + ] + + if do_normalize: + images = [ + self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format) + for image in images + ] + + images = [ + to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images + ] + + encoded_outputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors) + + return encoded_outputs + + def pad_to_square( + self, + image: np.ndarray, + background_color: Union[int, tuple[int, int, int]] = 0, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> np.ndarray: + """ + Pads an image to a square based on the longest edge. + + Args: + image (`np.ndarray`): + The image to pad. + background_color (`int` or `tuple[int, int, int]`, *optional*, defaults to 0): + The color to use for the padding. Can be an integer for single channel or a + tuple of integers representing for multi-channel images. If passed as integer + in multi-channel mode, it will default to `0` in subsequent channels. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + If unset, will use same as the input image. + input_data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + + Returns: + `np.ndarray`: The padded image. 
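+
+        Example (illustrative, not from the original docstring): a channels-first `3x192x384`
+            image is centred in a `3x384x384` canvas, with rows `96:288` holding the original
+            pixels and every other pixel set to `background_color`.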
+ """ + height, width = get_image_size(image, input_data_format) + num_channels = image.shape[0] if input_data_format == ChannelDimension.FIRST else image.shape[-1] + + if height == width: + image = ( + to_channel_dimension_format(image, data_format, input_data_format) + if data_format is not None + else image + ) + return image + + max_dim = max(height, width) + + # Ensure background_color is the correct shape + if isinstance(background_color, int): + background_color = [background_color] + elif len(background_color) != num_channels: + raise ValueError( + f"background_color must have no more than {num_channels} elements to match the number of channels" + ) + + if input_data_format == ChannelDimension.FIRST: + result = np.zeros((num_channels, max_dim, max_dim), dtype=image.dtype) + for i, color in enumerate(background_color): + result[i, :, :] = color + if width > height: + start = (max_dim - height) // 2 + result[:, start : start + height, :] = image + else: + start = (max_dim - width) // 2 + result[:, :, start : start + width] = image + else: + result = np.zeros((max_dim, max_dim, num_channels), dtype=image.dtype) + for i, color in enumerate(background_color): + result[:, :, i] = color + if width > height: + start = (max_dim - height) // 2 + result[start : start + height, :, :] = image + else: + start = (max_dim - width) // 2 + result[:, start : start + width, :] = image + + return result + + +__all__ = ["DeepseekVLImageProcessor"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..896e91f0692c13f99d53ca475ddbebb60dd8affe --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py @@ -0,0 +1,193 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/deepseek_vl/modular_deepseek_vl.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_deepseek_vl.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Optional, Union + +import torch +import torch.nn.functional as F + +from ...image_processing_utils import BatchFeature +from ...image_processing_utils_fast import ( + BaseImageProcessorFast, + DefaultFastImageProcessorKwargs, + group_images_by_shape, + reorder_images, +) +from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, PILImageResampling, SizeDict +from ...processing_utils import Unpack +from ...utils import TensorType, auto_docstring + + +class DeepseekVLFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): + r""" + min_size (`int`, *optional*, defaults to 14): + The minimum allowed size for the resized image. Ensures that neither the height nor width + falls below this value after resizing. + """ + + min_size: int + + +@auto_docstring +class DeepseekVLImageProcessorFast(BaseImageProcessorFast): + resample = PILImageResampling.BICUBIC + image_mean = OPENAI_CLIP_MEAN + image_std = OPENAI_CLIP_STD + size = {"height": 384, "width": 384} + min_size = 14 + do_resize = True + do_rescale = True + do_normalize = True + do_pad = True + valid_kwargs = DeepseekVLFastImageProcessorKwargs + + def __init__(self, **kwargs: Unpack[DeepseekVLFastImageProcessorKwargs]): + super().__init__(**kwargs) + if kwargs.get("image_mean") is None: + background_color = (127, 127, 127) + else: + background_color = tuple(int(x * 255) for x in kwargs.get("image_mean")) + self.background_color = tuple(background_color) + + def resize( + self, + image: "torch.Tensor", + size: SizeDict, + min_size: int, + interpolation: Optional["F.InterpolationMode"] = None, + antialias: bool = True, + **kwargs, + ) -> "torch.Tensor": + if size.height is None or size.width is None or size.height != size.width: + raise ValueError( + f"Output height and width must be the same. Got height={size['height']} and width={size['width']}" + ) + size = size.height + + height, width = image.shape[-2:] + max_size = max(height, width) + + delta = size / max_size + # Largest side becomes `size` and the other side is scaled according to the aspect ratio. + output_size_nonpadded = SizeDict( + height=max(int(height * delta), min_size), + width=max(int(width * delta), min_size), + ) + + return super().resize(image, size=output_size_nonpadded, interpolation=interpolation, antialias=antialias) + + def pad_to_square( + self, + images: "torch.Tensor", + background_color: Union[int, tuple[int, int, int]] = 0, + ) -> "torch.Tensor": + """ + Pads an image to a square based on the longest edge. + + Args: + images (`torch.Tensor`): + The images to pad. + background_color (`int` or `tuple[int, int, int]`, *optional*, defaults to 0): + The color to use for the padding. Can be an integer for single channel or a + tuple of integers representing for multi-channel images. If passed as integer + in multi-channel mode, it will default to `0` in subsequent channels. + + Returns: + `torch.Tensor`: The padded images. 
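+
+        Example (illustrative): a stacked batch of shape `(2, 3, 192, 384)` comes back as
+            `(2, 3, 384, 384)`, with the originals centred along the height axis and the rest
+            filled with `background_color`.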
+ """ + height, width = images.shape[-2:] + num_channels = images.shape[1] + batch_size = images.shape[0] + + if height == width: + return images + + max_dim = max(height, width) + + # Ensure background_color is the correct shape + if isinstance(background_color, int): + background_color = [background_color] + elif len(background_color) != num_channels: + raise ValueError( + f"background_color must have no more than {num_channels} elements to match the number of channels" + ) + + padded_images = torch.zeros( + (batch_size, num_channels, max_dim, max_dim), dtype=images.dtype, device=images.device + ) + for i, color in enumerate(background_color): + padded_images[:, i, :, :] = color + if width > height: + start = (max_dim - height) // 2 + padded_images[:, :, start : start + height, :] = images + else: + start = (max_dim - width) // 2 + padded_images[:, :, :, start : start + width] = images + + return padded_images + + def _preprocess( + self, + images: list["torch.Tensor"], + do_resize: bool, + size: SizeDict, + min_size: int, + interpolation: Optional["F.InterpolationMode"], + do_rescale: bool, + rescale_factor: float, + do_normalize: bool, + image_mean: Optional[Union[float, list[float]]], + image_std: Optional[Union[float, list[float]]], + disable_grouping: Optional[bool], + return_tensors: Optional[Union[str, TensorType]], + do_pad: bool = True, + **kwargs, + ) -> BatchFeature: + # Group images by size for batched resizing + grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping) + resized_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + if do_resize: + stacked_images = self.resize( + image=stacked_images, size=size, min_size=min_size, interpolation=interpolation + ) + resized_images_grouped[shape] = stacked_images + resized_images = reorder_images(resized_images_grouped, grouped_images_index) + + # Group images by size for further processing + # Needed in case do_resize is False, or resize returns images with different sizes + grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping) + processed_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + if do_pad: + stacked_images = self.pad_to_square(stacked_images, background_color=self.background_color) + # Fused rescale and normalize + stacked_images = self.rescale_and_normalize( + stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std + ) + processed_images_grouped[shape] = stacked_images + + processed_images = reorder_images(processed_images_grouped, grouped_images_index) + processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images + + return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors) + + +__all__ = ["DeepseekVLImageProcessorFast"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_vl/modeling_deepseek_vl.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_vl/modeling_deepseek_vl.py new file mode 100644 index 0000000000000000000000000000000000000000..ce884da8d08bb8680152bd2a58e537f876919f0f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_vl/modeling_deepseek_vl.py @@ -0,0 +1,350 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/deepseek_vl/modular_deepseek_vl.py. 
+# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_deepseek_vl.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import Optional, Union + +import torch +import torch.nn as nn + +from ...cache_utils import Cache +from ...generation import GenerationMixin +from ...modeling_outputs import ModelOutput +from ...modeling_utils import PreTrainedModel +from ...processing_utils import Unpack +from ...utils import TransformersKwargs, auto_docstring, can_return_tuple +from ..auto import AutoModel +from .configuration_deepseek_vl import DeepseekVLConfig + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for DeepseekVL model's outputs that may also contain a past key/values (to speed up sequential decoding). + """ +) +class DeepseekVLBaseModelOutputWithPast(ModelOutput): + r""" + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + + If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1, + hidden_size)` is output. + past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache). + + Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if + `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values` + input) to speed up sequential decoding. + image_hidden_states (`tuple(torch.FloatTensor)`, *optional*): + Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images, + sequence_length, hidden_size)`. + + image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver + """ + + last_hidden_state: Optional[torch.FloatTensor] = None + past_key_values: Optional[Cache] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + image_hidden_states: Optional[tuple[torch.FloatTensor]] = None + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for DeepseekVL causal language model (or autoregressive) outputs. + """ +) +class DeepseekVLCausalLMOutputWithPast(ModelOutput): + r""" + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). 
+ logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache). + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + image_hidden_states (`tuple(torch.FloatTensor)`, *optional*): + Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images, + sequence_length, hidden_size)`. + + image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver + """ + + loss: Optional[torch.FloatTensor] = None + logits: Optional[torch.FloatTensor] = None + past_key_values: Optional[Cache] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + image_hidden_states: Optional[tuple[torch.FloatTensor]] = None + + +class DeepseekVLAligner(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + + in_features = config.vision_config.hidden_size + out_features = config.text_config.hidden_size + + self.linear1 = nn.Linear(in_features, out_features) + self.activation = nn.GELU() + self.linear2 = nn.Linear(out_features, out_features) + + def forward(self, vision_encodings: torch.Tensor) -> torch.Tensor: + x = self.linear1(vision_encodings) + x = self.activation(x) + x = self.linear2(x) + return x + + +@auto_docstring +class DeepseekVLPreTrainedModel(PreTrainedModel): + config: DeepseekVLConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["LlamaDecoderLayer"] + _skip_keys_device_placement = ["past_key_values", "causal_mask"] + _supports_flash_attn = True + _supports_sdpa = True + + _can_compile_fullgraph = True + _supports_param_buffer_assignment = False + + def _init_weights(self, module): + """Initialize the weights""" + # Required only for Linear layer in DeepseekVLAligner + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=self.config.text_config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + + +@auto_docstring +class DeepseekVLModel(DeepseekVLPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.config = config + + self.vision_model = AutoModel.from_config(config.vision_config) + self.aligner = DeepseekVLAligner(config) + + self.language_model = AutoModel.from_config(config=config.text_config) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing. 
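+        # Rough data flow through the modules above (descriptive comment; exact sizes depend on the
+        # sub-configs): pixel_values (B, 3, 384, 384) -> vision_model -> (B, 576, vision_hidden)
+        # -> aligner (Linear-GELU-Linear) -> (B, 576, text_hidden), which is then scattered into
+        # the image placeholder positions (config.image_token_id) of the text embeddings in forward().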
+ self.post_init() + + def get_input_embeddings(self): + return self.language_model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.language_model.set_input_embeddings(value) + + def get_image_features(self, pixel_values): + image_embeds = self.vision_model(pixel_values) + image_embeds = self.aligner(image_embeds.last_hidden_state) + return image_embeds + + def get_placeholder_mask( + self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor + ): + """ + Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is + equal to the length of multimodal features. If the lengths are different, an error is raised. + """ + if input_ids is None: + special_image_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + special_image_mask = special_image_mask.all(-1) + else: + special_image_mask = input_ids == self.config.image_token_id + + n_image_tokens = special_image_mask.sum() + special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + if inputs_embeds[special_image_mask].numel() != image_features.numel(): + n_image_features = image_features.shape[0] * image_features.shape[1] + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" + ) + return special_image_mask + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **kwargs, + ): + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError( + "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" + ) + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings()(input_ids) + + if pixel_values is not None: + image_embeds = self.get_image_features(pixel_values) + image_features = image_embeds.reshape(-1, inputs_embeds.shape[-1]) + image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) + image_attention_mask = self.get_placeholder_mask( + input_ids, inputs_embeds=inputs_embeds, image_features=image_features + ) + inputs_embeds = inputs_embeds.masked_scatter(image_attention_mask, image_features) + + lm_output = self.language_model( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + use_cache=use_cache, + cache_position=cache_position, + logits_to_keep=logits_to_keep, + **kwargs, + ) + + return DeepseekVLBaseModelOutputWithPast( + last_hidden_state=lm_output.last_hidden_state, + past_key_values=lm_output.past_key_values, + hidden_states=lm_output.hidden_states, + attentions=lm_output.attentions, + image_hidden_states=image_embeds if pixel_values is not None else None, + ) + + +class DeepseekVLForConditionalGeneration(DeepseekVLPreTrainedModel, GenerationMixin): + _tied_weights_keys = ["model.language_model.embed_tokens.weight", "lm_head.weight"] + _can_compile_fullgraph = True + + def __init__(self, config: 
DeepseekVLConfig): + super().__init__(config) + self.config = config + self.model = DeepseekVLModel(config) + self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False) + + # Initialize weights and apply final processing. + self.post_init() + + def get_input_embeddings(self): + return self.model.language_model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.model.language_model.set_input_embeddings(value) + + def prepare_embeddings_for_image_generation(self) -> torch.Tensor: + raise AttributeError("Not needed for DeepseekVL") + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **kwargs: Unpack[TransformersKwargs], + ): + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + """ + outputs = self.model( + input_ids=input_ids, + pixel_values=pixel_values, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + hidden_states = outputs.last_hidden_state + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) + + loss = None + if labels is not None: + loss = self.loss_function( + logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs + ) + + return DeepseekVLCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + image_hidden_states=outputs.image_hidden_states, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + pixel_values=None, + past_key_values=None, + attention_mask=None, + inputs_embeds=None, + cache_position=None, + logits_to_keep=None, + **kwargs, + ): + # Overwritten -- extra custom processing + + model_inputs = super().prepare_inputs_for_generation( + input_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + logits_to_keep=logits_to_keep, + **kwargs, + ) + + # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore + # Otherwise we need pixel values to be passed to model + if cache_position[0] == 0: + model_inputs["pixel_values"] = pixel_values + + return model_inputs + + +__all__ = ["DeepseekVLPreTrainedModel", "DeepseekVLModel", "DeepseekVLForConditionalGeneration"] diff --git 
a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_vl/modular_deepseek_vl.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_vl/modular_deepseek_vl.py new file mode 100644 index 0000000000000000000000000000000000000000..5bfc0ae7d74c3b48f9a009edcc7daba60ccc3382 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_vl/modular_deepseek_vl.py @@ -0,0 +1,333 @@ +# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional, Union + +import torch +import torch.nn as nn + +from ...configuration_utils import PretrainedConfig +from ...image_processing_utils import BatchFeature +from ...image_utils import ImageInput +from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack +from ...tokenization_utils_base import ( + PreTokenizedInput, + TextInput, +) +from ...utils import ( + auto_docstring, + logging, +) +from ..auto import CONFIG_MAPPING, AutoConfig, AutoModel +from ..idefics.modeling_idefics import IdeficsBaseModelOutputWithPast, IdeficsCausalLMOutputWithPast +from ..janus.image_processing_janus import JanusImageProcessor +from ..janus.image_processing_janus_fast import JanusImageProcessorFast +from ..janus.modeling_janus import JanusForConditionalGeneration, JanusModel, JanusPreTrainedModel + + +logger = logging.get_logger(__name__) + + +class DeepseekVLConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`DeepseekVLModel`]. It is used to instantiate a + DeepseekVL model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the DeepseekVL + [deepseek-community/deepseek-vl-1.3b-chat](https://huggingface.co/deepseek-community/deepseek-vl-1.3b-chat) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `LlamaConfig`): + The config object or dictionary of the text backbone. + vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `SiglipVisionConfig`): + The config object or dictionary of the vision backbone. + image_token_id (`int`, *optional*, defaults to 100015): + The index representing image tokens in the model's token vocabulary. 
+ + Example: + + ```python + >>> from transformers import DeepseekVLConfig, DeepseekVLModel + + >>> # Initializing a DeepseekVL deepseek-community/deepseek-vl-1.3b-chat style configuration + >>> configuration = DeepseekVLConfig() + + >>> # Initializing a model (with random weights) from the deepseek-community/deepseek-vl-1.3b-chat style configuration + >>> model = DeepseekVLModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "deepseek_vl" + sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} + + def __init__( + self, + text_config: Optional[AutoConfig] = None, + vision_config: Optional[AutoConfig] = None, + image_token_id: int = 100015, + **kwargs, + ): + super().__init__(**kwargs) + + if text_config is None: + text_config = {} + logger.info("`text_config` is `None`. Initializing the `LlamaConfig` with default values.") + + if vision_config is None: + vision_config = {} + logger.info("`vision_config` is `None`. Initializing the `SiglipVisionConfig` with default values.") + + if isinstance(text_config, dict): + text_config["model_type"] = text_config.get("model_type", "llama") + text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) + + if isinstance(vision_config, dict): + vision_config["model_type"] = vision_config.get("model_type", "siglip_vision_model") + vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) + + self.text_config = text_config + self.vision_config = vision_config + self.image_token_id = image_token_id + + +class DeepseekVLBaseModelOutputWithPast(IdeficsBaseModelOutputWithPast): + pass + + +class DeepseekVLCausalLMOutputWithPast(IdeficsCausalLMOutputWithPast): + pass + + +class DeepseekVLAligner(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + + in_features = config.vision_config.hidden_size + out_features = config.text_config.hidden_size + + self.linear1 = nn.Linear(in_features, out_features) + self.activation = nn.GELU() + self.linear2 = nn.Linear(out_features, out_features) + + def forward(self, vision_encodings: torch.Tensor) -> torch.Tensor: + x = self.linear1(vision_encodings) + x = self.activation(x) + x = self.linear2(x) + return x + + +class DeepseekVLPreTrainedModel(JanusPreTrainedModel): + _no_split_modules = ["LlamaDecoderLayer"] + + def _init_weights(self, module): + """Initialize the weights""" + # Required only for Linear layer in DeepseekVLAligner + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=self.config.text_config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + + +@auto_docstring +class DeepseekVLModel(JanusModel): + def __init__(self, config): + super().__init__(config) + self.config = config + + self.vision_model = AutoModel.from_config(config.vision_config) + self.aligner = DeepseekVLAligner(config) + + self.language_model = AutoModel.from_config(config=config.text_config) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing. 
+ self.post_init() + + del self.vqmodel + del self.generation_embeddings + del self.generation_aligner + del self.generation_head + + +class DeepseekVLForConditionalGeneration(JanusForConditionalGeneration): + def prepare_embeddings_for_image_generation(self): + raise AttributeError("Not needed for DeepseekVL") + + def decode_image_tokens(self): + raise AttributeError("Not needed for DeepseekVL") + + def generate(self): + raise AttributeError("Not needed for DeepseekVL") + + +class DeepseekVLImageProcessor(JanusImageProcessor): + def __init__(self, **super_kwargs): + super().__init__(**super_kwargs) + + def postprocess(self): + raise AttributeError("Not needed for DeepseekVL") + + def unnormalize(self): + raise AttributeError("Not needed for DeepseekVL") + + +class DeepseekVLImageProcessorFast(JanusImageProcessorFast): + def __init__(self, **super_kwargs): + super().__init__(**super_kwargs) + + def postprocess(self): + raise AttributeError("Not needed for DeepseekVL") + + +class DeepseekVLProcessorKwargs(ProcessingKwargs, total=False): + _defaults = { + "text_kwargs": {"padding": False}, + "common_kwargs": {"return_tensors": "pt"}, + } + + +class DeepseekVLProcessor(ProcessorMixin): + r""" + Constructs a DeepseekVL processor which wraps a DeepseekVL Image Processor and a Llama tokenizer into a single processor. + + [`DeepseekVLProcessor`] offers all the functionalities of [`DeepseekVLImageProcessor`] and [`LlamaTokenizerFast`]. See the + [`~DeepseekVLProcessor.__call__`] and [`~DeepseekVLProcessor.decode`] for more information. + + Args: + image_processor ([`DeepseekVLImageProcessor`]): + The image processor is a required input. + tokenizer ([`LlamaTokenizerFast`]): + The tokenizer is a required input. + chat_template (`str`, *optional*): + A Jinja template which will be used to convert lists of messages + in a chat into a tokenizable string. + num_image_tokens (`int`, *optional*, defaults to 576): + The number of special image tokens used as placeholders for visual content in text sequences. + """ + + attributes = ["image_processor", "tokenizer"] + valid_kwargs = ["chat_template", "num_image_tokens"] + image_processor_class = "AutoImageProcessor" + tokenizer_class = "AutoTokenizer" + + def __init__( + self, + image_processor, + tokenizer, + chat_template=None, + num_image_tokens=576, + ): + self.image_token = tokenizer.image_token + self.num_image_tokens = num_image_tokens + + super().__init__(image_processor, tokenizer, chat_template=chat_template) + + def __call__( + self, + text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, + images: Optional[ImageInput] = None, + **kwargs: Unpack[DeepseekVLProcessorKwargs], + ) -> BatchFeature: + """ + Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` + and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode + the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to + DeepseekVLImageProcessor's [`~DeepseekVLImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring + of the above two methods for more information. + + Args: + text (`str`, `List[str]`, `List[List[str]]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). 
If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. Both channels-first and channels-last formats are supported. + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + """ + output_kwargs = self._merge_kwargs( + DeepseekVLProcessorKwargs, tokenizer_init_kwargs=self.tokenizer.init_kwargs, **kwargs + ) + if text is None and images is None: + raise ValueError("You must specify either text or images.") + + if text is not None: + if isinstance(text, str): + text = [text] + elif not (isinstance(text, (list, tuple)) and all(isinstance(t, str) for t in text)): + raise ValueError("Invalid input text. Please provide a string, or a list of strings") + + prompt_strings = [] + one_img_tokens = self.image_token * self.num_image_tokens + for prompt in text: + prompt = prompt.replace(self.image_token, one_img_tokens) + prompt_strings.append(prompt) + + data = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"]) + + # process images if pixel_values are provided + if images is not None: + data["pixel_values"] = self.image_processor(images, **output_kwargs["images_kwargs"])["pixel_values"] + + return BatchFeature(data=data) + + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. + """ + return self.tokenizer.batch_decode(*args, **kwargs) + + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to + the docstring of this method for more information. 
+ """ + return self.tokenizer.decode(*args, **kwargs) + + @property + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) + + +__all__ = [ + "DeepseekVLConfig", + "DeepseekVLPreTrainedModel", + "DeepseekVLModel", + "DeepseekVLForConditionalGeneration", + "DeepseekVLImageProcessor", + "DeepseekVLImageProcessorFast", + "DeepseekVLProcessor", +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_vl/processing_deepseek_vl.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_vl/processing_deepseek_vl.py new file mode 100644 index 0000000000000000000000000000000000000000..26d59d85a2955022c2a428509dcd29eab5c7c5d4 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_vl/processing_deepseek_vl.py @@ -0,0 +1,156 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/deepseek_vl/modular_deepseek_vl.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_deepseek_vl.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional, Union + +from ...image_processing_utils import BatchFeature +from ...image_utils import ImageInput +from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack +from ...tokenization_utils_base import PreTokenizedInput, TextInput + + +class DeepseekVLProcessorKwargs(ProcessingKwargs, total=False): + _defaults = { + "text_kwargs": {"padding": False}, + "common_kwargs": {"return_tensors": "pt"}, + } + + +class DeepseekVLProcessor(ProcessorMixin): + r""" + Constructs a DeepseekVL processor which wraps a DeepseekVL Image Processor and a Llama tokenizer into a single processor. + + [`DeepseekVLProcessor`] offers all the functionalities of [`DeepseekVLImageProcessor`] and [`LlamaTokenizerFast`]. See the + [`~DeepseekVLProcessor.__call__`] and [`~DeepseekVLProcessor.decode`] for more information. + + Args: + image_processor ([`DeepseekVLImageProcessor`]): + The image processor is a required input. + tokenizer ([`LlamaTokenizerFast`]): + The tokenizer is a required input. + chat_template (`str`, *optional*): + A Jinja template which will be used to convert lists of messages + in a chat into a tokenizable string. + num_image_tokens (`int`, *optional*, defaults to 576): + The number of special image tokens used as placeholders for visual content in text sequences. 
+ """ + + attributes = ["image_processor", "tokenizer"] + valid_kwargs = ["chat_template", "num_image_tokens"] + image_processor_class = "AutoImageProcessor" + tokenizer_class = "AutoTokenizer" + + def __init__( + self, + image_processor, + tokenizer, + chat_template=None, + num_image_tokens=576, + ): + self.image_token = tokenizer.image_token + self.num_image_tokens = num_image_tokens + + super().__init__(image_processor, tokenizer, chat_template=chat_template) + + def __call__( + self, + text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, + images: Optional[ImageInput] = None, + **kwargs: Unpack[DeepseekVLProcessorKwargs], + ) -> BatchFeature: + """ + Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` + and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode + the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to + DeepseekVLImageProcessor's [`~DeepseekVLImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring + of the above two methods for more information. + + Args: + text (`str`, `List[str]`, `List[List[str]]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. Both channels-first and channels-last formats are supported. + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + """ + output_kwargs = self._merge_kwargs( + DeepseekVLProcessorKwargs, tokenizer_init_kwargs=self.tokenizer.init_kwargs, **kwargs + ) + if text is None and images is None: + raise ValueError("You must specify either text or images.") + + if text is not None: + if isinstance(text, str): + text = [text] + elif not (isinstance(text, (list, tuple)) and all(isinstance(t, str) for t in text)): + raise ValueError("Invalid input text. 
Please provide a string, or a list of strings") + + prompt_strings = [] + one_img_tokens = self.image_token * self.num_image_tokens + for prompt in text: + prompt = prompt.replace(self.image_token, one_img_tokens) + prompt_strings.append(prompt) + + data = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"]) + + # process images if pixel_values are provided + if images is not None: + data["pixel_values"] = self.image_processor(images, **output_kwargs["images_kwargs"])["pixel_values"] + + return BatchFeature(data=data) + + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. + """ + return self.tokenizer.batch_decode(*args, **kwargs) + + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to + the docstring of this method for more information. + """ + return self.tokenizer.decode(*args, **kwargs) + + @property + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) + + +__all__ = ["DeepseekVLProcessor"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_vl_hybrid/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_vl_hybrid/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..da85178ccc84d9385fd3473a25eb51757dc9d34b --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_vl_hybrid/__init__.py @@ -0,0 +1,31 @@ +# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_deepseek_vl_hybrid import * + from .image_processing_deepseek_vl_fast_hybrid import * + from .image_processing_deepseek_vl_hybrid import * + from .image_processing_deepseek_vl_hybrid_fast import * + from .modeling_deepseek_vl_hybrid import * + from .processing_deepseek_vl_hybrid import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py new file mode 100644 index 0000000000000000000000000000000000000000..e8c6e2df6ea31b295fa4fd277a9e7257e976ca8f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py @@ -0,0 +1,110 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_deepseek_vl_hybrid.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional + +from ...configuration_utils import PretrainedConfig +from ...utils import logging +from ..auto import CONFIG_MAPPING, AutoConfig + + +logger = logging.get_logger(__name__) + + +class DeepseekVLHybridConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`DeepseekVLHybridModel`]. It is used to instantiate a + DeepseekVLHybrid model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the DeepseekVLHybrid + [deepseek-community/deepseek-vl-7b-chat](https://huggingface.co/deepseek-community/deepseek-vl-7b-chat) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `LlamaConfig`): + The config object or dictionary of the text backbone. + vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `SiglipVisionConfig`): + The config object or dictionary of the vision backbone. 
+ high_res_vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `SamVisionConfig`): + The config object or dictionary of the high resolution vision backbone. + image_token_id (`int`, *optional*, defaults to 100015): + The index representing image tokens in the model's token vocabulary. + + Example: + + ```python + >>> from transformers import DeepseekVLHybridConfig, DeepseekVLHybridModel + + >>> # Initializing a DeepseekVLHybrid deepseek-community/deepseek-vl-7b-chat style configuration + >>> configuration = DeepseekVLHybridConfig() + + >>> # Initializing a model (with random weights) from the deepseek-community/deepseek-vl-7b-chat style configuration + >>> model = DeepseekVLHybridModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "deepseek_vl_hybrid" + sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig, "high_res_vision_config": AutoConfig} + + def __init__( + self, + text_config: Optional[AutoConfig] = None, + vision_config: Optional[AutoConfig] = None, + high_res_vision_config: Optional[AutoConfig] = None, + image_token_id: int = 100015, + **kwargs, + ): + super().__init__(**kwargs) + + if text_config is None: + text_config = {} + logger.info("`text_config` is `None`. Initializing the `LlamaConfig` with default values.") + + if vision_config is None: + vision_config = {} + logger.info("`vision_config` is `None`. Initializing the `SiglipVisionConfig` with default values.") + + if isinstance(text_config, dict): + text_config["model_type"] = text_config.get("model_type", "llama") + text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) + + if isinstance(vision_config, dict): + vision_config["model_type"] = vision_config.get("model_type", "siglip_vision_model") + vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) + + self.text_config = text_config + self.vision_config = vision_config + self.image_token_id = image_token_id + + if high_res_vision_config is None: + high_res_vision_config = {} + logger.info("`high_res_vision_config` is `None`. Initializing the `SamVisionConfig` with default values.") + + if isinstance(high_res_vision_config, dict): + high_res_vision_config["model_type"] = high_res_vision_config.get("model_type", "sam_vision_model") + high_res_vision_config = CONFIG_MAPPING[high_res_vision_config["model_type"]](**high_res_vision_config) + + self.high_res_vision_config = high_res_vision_config + + +__all__ = ["DeepseekVLHybridConfig"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py new file mode 100644 index 0000000000000000000000000000000000000000..865e13fa964fac4cc3d630ca80ada376a3556cf1 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py @@ -0,0 +1,498 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_deepseek_vl_hybrid.py file directly. One of our CI enforces this. 
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional, Union + +import numpy as np + +from ...image_processing_utils import BaseImageProcessor +from ...image_processing_utils_fast import BatchFeature, get_size_dict +from ...image_transforms import convert_to_rgb, resize, to_channel_dimension_format +from ...image_utils import ( + OPENAI_CLIP_MEAN, + OPENAI_CLIP_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + get_image_size, + infer_channel_dimension_format, + is_scaled_image, + make_flat_list_of_images, + to_numpy_array, + valid_images, + validate_preprocess_arguments, +) +from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging + + +if is_vision_available(): + import PIL + + +logger = logging.get_logger(__name__) + + +class DeepseekVLHybridImageProcessor(BaseImageProcessor): + r""" + Constructs a DEEPSEEK_VL_HYBRID image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the + `do_resize` parameter in the `preprocess` method. + size (`dict`, *optional*, defaults to `{"height": 384, "width": 384}`): + Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess` + method. + high_res_size (`dict`, *optional*, defaults to `{"height": 1024, "width": 1024}`): + Size of the high resolution output image after resizing. Can be overridden by the `high_res_size` parameter in the `preprocess` + method. + min_size (`int`, *optional*, defaults to 14): + The minimum allowed size for the resized image. Ensures that neither the height nor width + falls below this value after resizing. + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): + Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be + overridden by the `resample` parameter in the `preprocess` method. + high_res_resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): + Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be + overridden by the `high_res_resample` parameter in the `preprocess` method. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the + `do_rescale` parameter in the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Only has an effect if `do_rescale` is set to `True`. Can be + overridden by the `rescale_factor` parameter in the `preprocess` method. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` + method. 
+ image_mean (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`):
+ Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+ channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
+ image_std (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_STD`):
+ Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+ number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
+ high_res_image_mean (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`):
+ Mean to use if normalizing the high resolution image. This is a float or list of floats the length of the number of
+ channels in the image. Can be overridden by the `high_res_image_mean` parameter in the `preprocess` method.
+ high_res_image_std (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_STD`):
+ Standard deviation to use if normalizing the high resolution image. This is a float or list of floats the length of the
+ number of channels in the image. Can be overridden by the `high_res_image_std` parameter in the `preprocess` method.
+ do_convert_rgb (`bool`, *optional*, defaults to `True`):
+ Whether to convert the image to RGB.
+ do_pad (`bool`, *optional*, defaults to `True`):
+ Whether to pad the image to square or not.
+ """
+
+ model_input_names = ["pixel_values", "high_res_pixel_values"]
+
+ def __init__(
+ self,
+ do_resize: bool = True,
+ size: Optional[dict[str, int]] = None,
+ high_res_size: Optional[dict[str, int]] = None,
+ min_size: int = 14,
+ resample: PILImageResampling = PILImageResampling.BICUBIC,
+ high_res_resample: PILImageResampling = PILImageResampling.BICUBIC,
+ do_rescale: bool = True,
+ rescale_factor: Union[int, float] = 1 / 255,
+ do_normalize: bool = True,
+ image_mean: Optional[Union[float, list[float]]] = None,
+ image_std: Optional[Union[float, list[float]]] = None,
+ high_res_image_mean: Optional[Union[float, list[float]]] = None,
+ high_res_image_std: Optional[Union[float, list[float]]] = None,
+ do_convert_rgb: Optional[bool] = None,
+ do_pad: bool = True,
+ **kwargs,
+ ) -> None:
+ super().__init__(**kwargs)
+ high_res_size = high_res_size if high_res_size is not None else {"height": 1024, "width": 1024}
+ high_res_size = get_size_dict(high_res_size, default_to_square=True)
+
+ self.high_res_size = high_res_size
+ self.high_res_image_mean = high_res_image_mean if high_res_image_mean is not None else OPENAI_CLIP_MEAN
+ self.high_res_image_std = high_res_image_std if high_res_image_std is not None else OPENAI_CLIP_STD
+
+ self.resample = resample
+ self.high_res_resample = high_res_resample
+ size = size if size is not None else {"height": 384, "width": 384}
+ size = get_size_dict(size, default_to_square=True)
+
+ self.do_resize = do_resize
+ self.size = size
+ self.resample = resample
+ self.do_rescale = do_rescale
+ self.rescale_factor = rescale_factor
+ self.do_normalize = do_normalize
+ self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
+ self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
+ self.do_convert_rgb = do_convert_rgb
+
+ self.do_pad = do_pad
+ self.min_size =
min_size + if image_mean is None: + self.background_color = (127, 127, 127) + else: + self.background_color = tuple(int(x * 255) for x in image_mean) + + if high_res_image_mean is None: + self.high_res_background_color = (127, 127, 127) + else: + self.high_res_background_color = tuple(int(x * 255) for x in high_res_image_mean) + + def resize( + self, + image: np.ndarray, + size: Union[dict[str, int], int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize an image to dynamically calculated size. + + Args: + image (`np.ndarray`): + Image to resize. + size (`dict[str, int]` or `int`): + The size to resize the image to. If a dictionary, it should have the keys `"height"` and `"width"`. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`. + data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `None`: will be inferred from input + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + + Returns: + `np.ndarray`: The resized image. + """ + if input_data_format is None: + input_data_format = infer_channel_dimension_format(image) + + height, width = get_image_size(image, input_data_format) + max_size = max(height, width) + + size = get_size_dict(size, default_to_square=True) + if size["height"] != size["width"]: + raise ValueError( + f"Output height and width must be the same. Got height={size['height']} and width={size['width']}" + ) + size = size["height"] + + delta = size / max_size + # Largest side becomes `size` and the other side is scaled according to the aspect ratio. 
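+ # For example (illustrative values): with height=512, width=1024, size={"height": 384, "width": 384} and
+ # min_size=14, delta = 384 / 1024 = 0.375, so the non-padded output size is (192, 384); when `do_pad` is set,
+ # `preprocess` then pads the resized image to a 384 x 384 square.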
+ output_size_nonpadded = [ + max(int(height * delta), self.min_size), + max(int(width * delta), self.min_size), + ] + + image = resize( + image, + size=output_size_nonpadded, + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + **kwargs, + ) + return image + + @filter_out_non_signature_kwargs() + def preprocess( + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Optional[dict[str, int]] = None, + high_res_size: Optional[dict[str, int]] = None, + resample: Optional[PILImageResampling] = None, + high_res_resample: Optional[PILImageResampling] = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + high_res_image_mean: Optional[Union[float, list[float]]] = None, + high_res_image_std: Optional[Union[float, list[float]]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + do_convert_rgb: Optional[bool] = None, + do_pad: Optional[bool] = None, + background_color: Optional[tuple[int, int, int]] = None, + ) -> PIL.Image.Image: + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_rescale=False`. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to `self.size`): + Dictionary in the format `{"height": h, "width": w}` specifying the size of the output image after + resizing. + high_res_size (`Dict[str, int]`, *optional*, defaults to `self.high_res_size`): + Dictionary in the format `{"height": h, "width": w}` specifying the size of the high resolution output image after + resizing. + resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`): + `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has + an effect if `do_resize` is set to `True`. + high_res_resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`): + `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BICUBIC`. Only has + an effect if `do_resize` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image values between [0 - 1]. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): + Image mean to use if `do_normalize` is set to `True`. + image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to use if `do_normalize` is set to `True`. + high_res_image_mean (`float` or `List[float]`, *optional*, defaults to `self.high_res_image_mean`): + Image mean to use if `do_normalize` is set to `True`. + high_res_image_std (`float` or `List[float]`, *optional*, defaults to `self.high_res_image_std`): + Image standard deviation to use if `do_normalize` is set to `True`. 
+ return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): + Whether to convert the image to RGB. + do_pad (`bool`, *optional*, defaults to `self.do_pad`): + Whether to pad the image to square or not. + background_color (`tuple[int, int, int]`): + The background color to use for the padding. + """ + do_resize = do_resize if do_resize is not None else self.do_resize + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + resample = resample if resample is not None else self.resample + high_res_resample = high_res_resample if high_res_resample is not None else self.high_res_resample + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + high_res_image_mean = high_res_image_mean if high_res_image_mean is not None else self.high_res_image_mean + high_res_image_std = high_res_image_std if high_res_image_std is not None else self.high_res_image_std + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + do_pad = do_pad if do_pad is not None else self.do_pad + background_color = background_color if background_color is not None else self.background_color + + size = size if size is not None else self.size + size_dict = get_size_dict(size) + high_res_size = high_res_size if high_res_size is not None else self.high_res_size + high_res_size_dict = get_size_dict(high_res_size) + + images = self.fetch_images(images) + images = make_flat_list_of_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." 
+ ) + validate_preprocess_arguments( + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_resize=do_resize, + size=size, + resample=resample, + ) + + if do_convert_rgb: + images = [convert_to_rgb(image) for image in images] + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if do_rescale and is_scaled_image(images[0]): + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. + input_data_format = infer_channel_dimension_format(images[0]) + + all_images = [] + all_high_res_images = [] + for image in images: + # high_res_image: resize (high) -> rescale -> normalize (high) + # low_res_image: resize (high) -> rescale -> resize (low) -> normalize (low) + high_res_image = image + if do_resize: + high_res_image = self.resize( + image=high_res_image, + size=high_res_size_dict, + resample=high_res_resample, + input_data_format=input_data_format, + ) + if do_pad: + # Expand and pad the images to obtain a square image of dimensions `size x size` + high_res_image = self.pad_to_square( + image=high_res_image, + background_color=background_color, + input_data_format=input_data_format, + ) + image = self.resize( + image=high_res_image, + size=size_dict, + resample=resample, + input_data_format=input_data_format, + ) + if do_pad: + image = self.pad_to_square( + image=image, + background_color=background_color, + input_data_format=input_data_format, + ) + + if do_rescale: + image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) + high_res_image = self.rescale( + image=high_res_image, scale=rescale_factor, input_data_format=input_data_format + ) + + if do_normalize: + image = self.normalize( + image=image, mean=image_mean, std=image_std, input_data_format=input_data_format + ) + high_res_image = self.normalize( + image=high_res_image, + mean=high_res_image_mean, + std=high_res_image_std, + input_data_format=input_data_format, + ) + + image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) + high_res_image = to_channel_dimension_format( + high_res_image, data_format, input_channel_dim=input_data_format + ) + + all_images.append(image) + all_high_res_images.append(high_res_image) + + data = {"pixel_values": all_images, "high_res_pixel_values": all_high_res_images} + return BatchFeature(data=data, tensor_type=return_tensors) + + def pad_to_square( + self, + image: np.ndarray, + background_color: Union[int, tuple[int, int, int]] = 0, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> np.ndarray: + """ + Pads an image to a square based on the longest edge. + + Args: + image (`np.ndarray`): + The image to pad. + background_color (`int` or `tuple[int, int, int]`, *optional*, defaults to 0): + The color to use for the padding. Can be an integer for single channel or a + tuple of integers representing for multi-channel images. If passed as integer + in multi-channel mode, it will default to `0` in subsequent channels. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. 
Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + If unset, will use same as the input image. + input_data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + + Returns: + `np.ndarray`: The padded image. + """ + height, width = get_image_size(image, input_data_format) + num_channels = image.shape[0] if input_data_format == ChannelDimension.FIRST else image.shape[-1] + + if height == width: + image = ( + to_channel_dimension_format(image, data_format, input_data_format) + if data_format is not None + else image + ) + return image + + max_dim = max(height, width) + + # Ensure background_color is the correct shape + if isinstance(background_color, int): + background_color = [background_color] + elif len(background_color) != num_channels: + raise ValueError( + f"background_color must have no more than {num_channels} elements to match the number of channels" + ) + + if input_data_format == ChannelDimension.FIRST: + result = np.zeros((num_channels, max_dim, max_dim), dtype=image.dtype) + for i, color in enumerate(background_color): + result[i, :, :] = color + if width > height: + start = (max_dim - height) // 2 + result[:, start : start + height, :] = image + else: + start = (max_dim - width) // 2 + result[:, :, start : start + width] = image + else: + result = np.zeros((max_dim, max_dim, num_channels), dtype=image.dtype) + for i, color in enumerate(background_color): + result[:, :, i] = color + if width > height: + start = (max_dim - height) // 2 + result[start : start + height, :, :] = image + else: + start = (max_dim - width) // 2 + result[:, start : start + width, :] = image + + return result + + +__all__ = ["DeepseekVLHybridImageProcessor"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..c04e006e358d5fcc157cedabc3ba18137cf2eea6 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py @@ -0,0 +1,327 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_deepseek_vl_hybrid.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional, Union + +import torch +from torchvision.transforms.v2 import functional as F + +from ...image_processing_utils_fast import ( + BaseImageProcessorFast, + BatchFeature, + DefaultFastImageProcessorKwargs, + get_size_dict, + group_images_by_shape, + reorder_images, +) +from ...image_utils import ( + OPENAI_CLIP_MEAN, + OPENAI_CLIP_STD, + ChannelDimension, + PILImageResampling, + SizeDict, + pil_torch_interpolation_mapping, +) +from ...processing_utils import Unpack +from ...utils import TensorType, auto_docstring + + +class DeepseekVLHybridFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): + r""" + min_size (`int`, *optional*, defaults to 14): + The minimum allowed size for the resized image. Ensures that neither the height nor width + falls below this value after resizing. + high_res_size (`dict`, *optional*, defaults to `{"height": 1024, "width": 1024}`): + Size of the high resolution output image after resizing. Can be overridden by the `high_res_size` parameter in the `preprocess` + method. + high_res_resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): + Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be + overridden by the `high_res_resample` parameter in the `preprocess` method. + high_res_image_mean (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`): + Mean to use if normalizing the high resolution image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `high_res_image_mean` parameter in the `preprocess` method. + high_res_image_std (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_STD`): + Standard deviation to use if normalizing the high resolution image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `high_res_image_std` parameter in the `preprocess` method. 
+ """ + + min_size: int + high_res_size: dict + high_res_resample: "PILImageResampling" + high_res_image_mean: list[float] + high_res_image_std: list[float] + + +@auto_docstring +class DeepseekVLHybridImageProcessorFast(BaseImageProcessorFast): + resample = PILImageResampling.BICUBIC + image_mean = OPENAI_CLIP_MEAN + image_std = OPENAI_CLIP_STD + size = {"height": 384, "width": 384} + min_size = 14 + do_resize = True + do_rescale = True + do_normalize = True + do_pad = True + valid_kwargs = DeepseekVLHybridFastImageProcessorKwargs + high_res_image_mean = OPENAI_CLIP_MEAN + high_res_image_std = OPENAI_CLIP_STD + high_res_size = {"height": 1024, "width": 1024} + high_res_resample = PILImageResampling.BICUBIC + model_input_names = ["pixel_values", "high_res_pixel_values"] + + def __init__(self, **kwargs: Unpack[DeepseekVLHybridFastImageProcessorKwargs]): + if kwargs.get("image_mean") is None: + background_color = (127, 127, 127) + else: + background_color = tuple([int(x * 255) for x in kwargs.get("image_mean")]) + if kwargs.get("high_res_image_mean") is None: + high_res_background_color = (127, 127, 127) + else: + high_res_background_color = tuple(int(x * 255) for x in kwargs.get("high_res_image_mean")) + super().__init__(**kwargs) + self.background_color = tuple(background_color) + self.high_res_background_color = tuple(high_res_background_color) + + def resize( + self, + image: "torch.Tensor", + size: SizeDict, + min_size: int, + interpolation: Optional["F.InterpolationMode"] = None, + antialias: bool = True, + **kwargs, + ) -> "torch.Tensor": + if size.height is None or size.width is None or size.height != size.width: + raise ValueError( + f"Output height and width must be the same. Got height={size['height']} and width={size['width']}" + ) + size = size.height + + height, width = image.shape[-2:] + max_size = max(height, width) + + delta = size / max_size + # Largest side becomes `size` and the other side is scaled according to the aspect ratio. + output_size_nonpadded = SizeDict( + height=max(int(height * delta), min_size), + width=max(int(width * delta), min_size), + ) + + return super().resize(image, size=output_size_nonpadded, interpolation=interpolation, antialias=antialias) + + def pad_to_square( + self, + images: "torch.Tensor", + background_color: Union[int, tuple[int, int, int]] = 0, + ) -> "torch.Tensor": + """ + Pads an image to a square based on the longest edge. + + Args: + images (`torch.Tensor`): + The images to pad. + background_color (`int` or `tuple[int, int, int]`, *optional*, defaults to 0): + The color to use for the padding. Can be an integer for single channel or a + tuple of integers representing for multi-channel images. If passed as integer + in multi-channel mode, it will default to `0` in subsequent channels. + + Returns: + `torch.Tensor`: The padded images. 
+ """ + height, width = images.shape[-2:] + num_channels = images.shape[1] + batch_size = images.shape[0] + + if height == width: + return images + + max_dim = max(height, width) + + # Ensure background_color is the correct shape + if isinstance(background_color, int): + background_color = [background_color] + elif len(background_color) != num_channels: + raise ValueError( + f"background_color must have no more than {num_channels} elements to match the number of channels" + ) + + padded_images = torch.zeros( + (batch_size, num_channels, max_dim, max_dim), dtype=images.dtype, device=images.device + ) + for i, color in enumerate(background_color): + padded_images[:, i, :, :] = color + if width > height: + start = (max_dim - height) // 2 + padded_images[:, :, start : start + height, :] = images + else: + start = (max_dim - width) // 2 + padded_images[:, :, :, start : start + width] = images + + return padded_images + + def _preprocess( + self, + images: list["torch.Tensor"], + do_resize: bool, + size: SizeDict, + high_res_size: SizeDict, + min_size: int, + interpolation: Optional["F.InterpolationMode"], + high_res_interpolation: Optional["F.InterpolationMode"], + do_rescale: bool, + rescale_factor: float, + do_normalize: bool, + image_mean: Optional[Union[float, list[float]]], + image_std: Optional[Union[float, list[float]]], + high_res_image_mean: Optional[Union[float, list[float]]], + high_res_image_std: Optional[Union[float, list[float]]], + disable_grouping: Optional[bool], + return_tensors: Optional[Union[str, TensorType]], + do_pad: bool = True, + **kwargs, + ) -> BatchFeature: + # Group images by size for batched resizing + grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping) + high_res_resized_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + if do_resize: + stacked_high_res_images = self.resize( + image=stacked_images, size=high_res_size, min_size=min_size, interpolation=high_res_interpolation + ) + high_res_resized_images_grouped[shape] = stacked_high_res_images + high_res_resized_images = reorder_images(high_res_resized_images_grouped, grouped_images_index) + + # Group images by size for further processing + # Needed in case do_resize is False, or resize returns images with different sizes + grouped_high_res_images, grouped_high_res_images_index = group_images_by_shape( + high_res_resized_images, disable_grouping=disable_grouping + ) + high_res_padded_images = {} + high_res_processed_images_grouped = {} + for shape, stacked_high_res_images in grouped_high_res_images.items(): + if do_pad: + stacked_high_res_images = self.pad_to_square( + stacked_high_res_images, background_color=self.high_res_background_color + ) + high_res_padded_images[shape] = stacked_high_res_images + # Fused rescale and normalize + stacked_high_res_images = self.rescale_and_normalize( + stacked_high_res_images, + do_rescale, + rescale_factor, + do_normalize, + high_res_image_mean, + high_res_image_std, + ) + high_res_processed_images_grouped[shape] = stacked_high_res_images + high_res_processed_images = reorder_images(high_res_processed_images_grouped, grouped_high_res_images_index) + high_res_processed_images = ( + torch.stack(high_res_processed_images, dim=0) if return_tensors else high_res_processed_images + ) + + resized_images_grouped = {} + for shape, stacked_high_res_padded_images in high_res_padded_images.items(): + if do_resize: + stacked_images = self.resize( + image=stacked_high_res_padded_images, size=size, 
min_size=min_size, interpolation=interpolation + ) + resized_images_grouped[shape] = stacked_images + resized_images = reorder_images(resized_images_grouped, grouped_high_res_images_index) + + grouped_resized_images, grouped_resized_images_index = group_images_by_shape( + resized_images, disable_grouping=disable_grouping + ) + processed_images_grouped = {} + for shape, stacked_images in grouped_resized_images.items(): + if do_pad: + stacked_images = self.pad_to_square(stacked_images, background_color=self.background_color) + # Fused rescale and normalize + stacked_images = self.rescale_and_normalize( + stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std + ) + processed_images_grouped[shape] = stacked_images + processed_images = reorder_images(processed_images_grouped, grouped_resized_images_index) + processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images + + return BatchFeature( + data={"pixel_values": processed_images, "high_res_pixel_values": high_res_processed_images}, + tensor_type=return_tensors, + ) + + def _further_process_kwargs( + self, + size: Optional[SizeDict] = None, + high_res_size: Optional[SizeDict] = None, + default_to_square: Optional[bool] = None, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + high_res_image_mean: Optional[Union[float, list[float]]] = None, + high_res_image_std: Optional[Union[float, list[float]]] = None, + data_format: Optional[ChannelDimension] = None, + **kwargs, + ) -> dict: + """ + Update kwargs that need further processing before being validated + Can be overridden by subclasses to customize the processing of kwargs. + """ + if kwargs is None: + kwargs = {} + if size is not None: + size = SizeDict(**get_size_dict(size=size, default_to_square=default_to_square)) + if high_res_size is not None: + high_res_size = SizeDict(**get_size_dict(size=high_res_size, default_to_square=default_to_square)) + if isinstance(image_mean, list): + image_mean = tuple(image_mean) + if isinstance(image_std, list): + image_std = tuple(image_std) + if isinstance(high_res_image_mean, list): + high_res_image_mean = tuple(high_res_image_mean) + if isinstance(high_res_image_std, list): + high_res_image_std = tuple(high_res_image_std) + if data_format is None: + data_format = ChannelDimension.FIRST + + high_res_resample = kwargs.pop("high_res_resample") + kwargs["high_res_interpolation"] = ( + pil_torch_interpolation_mapping[high_res_resample] + if isinstance(high_res_resample, (int, PILImageResampling)) + else high_res_resample + ) + + low_res_resample = kwargs.pop("resample") + kwargs["interpolation"] = ( + pil_torch_interpolation_mapping[low_res_resample] + if isinstance(low_res_resample, (int, PILImageResampling)) + else low_res_resample + ) + + kwargs["size"] = size + kwargs["high_res_size"] = high_res_size + kwargs["image_mean"] = image_mean + kwargs["image_std"] = image_std + kwargs["high_res_image_mean"] = high_res_image_mean + kwargs["high_res_image_std"] = high_res_image_std + kwargs["data_format"] = data_format + + return kwargs + + +__all__ = ["DeepseekVLHybridImageProcessorFast"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py new file mode 100644 index 0000000000000000000000000000000000000000..d9a85654e901576c61c83c6b3ce3c3ea54d3aeaf 
--- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py @@ -0,0 +1,497 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_deepseek_vl_hybrid.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import Optional, Union + +import torch +import torch.nn as nn + +from ...cache_utils import Cache +from ...generation import GenerationMixin +from ...modeling_outputs import ModelOutput +from ...modeling_utils import PreTrainedModel +from ...processing_utils import Unpack +from ...utils import TransformersKwargs, auto_docstring, can_return_tuple +from ..auto import AutoModel +from .configuration_deepseek_vl_hybrid import DeepseekVLHybridConfig + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for DeepseekVLHybrid model's outputs that may also contain a past key/values (to speed up sequential decoding). + """ +) +class DeepseekVLHybridBaseModelOutputWithPast(ModelOutput): + r""" + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + + If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1, + hidden_size)` is output. + past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache). + + Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if + `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values` + input) to speed up sequential decoding. + image_hidden_states (`tuple(torch.FloatTensor)`, *optional*): + Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images, + sequence_length, hidden_size)`. + + image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver + """ + + last_hidden_state: Optional[torch.FloatTensor] = None + past_key_values: Optional[Cache] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + image_hidden_states: Optional[tuple[torch.FloatTensor]] = None + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for DeepseekVLHybrid causal language model (or autoregressive) outputs. 
+ """ +) +class DeepseekVLHybridCausalLMOutputWithPast(ModelOutput): + r""" + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache). + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + image_hidden_states (`tuple(torch.FloatTensor)`, *optional*): + Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images, + sequence_length, hidden_size)`. + + image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver + """ + + loss: Optional[torch.FloatTensor] = None + logits: Optional[torch.FloatTensor] = None + past_key_values: Optional[Cache] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + image_hidden_states: Optional[tuple[torch.FloatTensor]] = None + + +class DeepseekVLHybridLayerNorm(nn.LayerNorm): + r"""LayerNorm that supports two data formats: channels_last (default) or channels_first. + The ordering of the dimensions in the inputs. channels_last corresponds to inputs with shape (batch_size, height, + width, channels) while channels_first corresponds to inputs with shape (batch_size, channels, height, width). 
+ """ + + def __init__(self, normalized_shape, *, eps=1e-6, data_format="channels_last", **kwargs): + super().__init__(normalized_shape, eps=eps, **kwargs) + if data_format not in ["channels_last", "channels_first"]: + raise NotImplementedError(f"Unsupported data format: {data_format}") + self.data_format = data_format + + def forward(self, features: torch.Tensor) -> torch.Tensor: + """ + Args: + features: Tensor of shape (batch_size, channels, height, width) OR (batch_size, height, width, channels) + """ + if self.data_format == "channels_first": + features = features.permute(0, 2, 3, 1) + features = super().forward(features) + features = features.permute(0, 3, 1, 2) + else: + features = super().forward(features) + return features + + +class DeepseekVLSamVisionNeck(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + + self.conv1 = nn.Conv2d(config.hidden_size, config.output_channels, kernel_size=1, bias=False) + self.layer_norm1 = DeepseekVLHybridLayerNorm(config.output_channels, data_format="channels_first") + self.conv2 = nn.Conv2d(config.output_channels, config.output_channels, kernel_size=3, padding=1, bias=False) + self.layer_norm2 = DeepseekVLHybridLayerNorm(config.output_channels, data_format="channels_first") + + def forward(self, hidden_states): + hidden_states = hidden_states.permute(0, 3, 1, 2) + hidden_states = self.conv1(hidden_states) + hidden_states = self.layer_norm1(hidden_states) + + hidden_states = self.conv2(hidden_states) + hidden_states = self.layer_norm2(hidden_states) + return hidden_states + + +class DeepseekVLSamVisionProj(nn.Module): + def __init__(self, config, output_size: int = 24): + super().__init__() + self.config = config + self.output_size = output_size + + self.conv1 = nn.Conv2d( + config.output_channels, config.output_channels * 2, kernel_size=3, stride=2, padding=1, bias=False + ) + self.conv2 = nn.Conv2d( + config.output_channels * 2, config.output_channels * 4, kernel_size=3, stride=2, padding=1, bias=False + ) + + def forward(self, features: torch.Tensor) -> torch.Tensor: + # interpolate Sam encodings to match Siglip encodings + features = torch.nn.functional.interpolate( + features, + size=(4 * self.output_size, 4 * self.output_size), + mode="bilinear", + align_corners=False, + ) + features = self.conv1(features) + features = self.conv2(features) + return features + + +class DeepseekVLHybridAligner(nn.Module): + def __init__(self, config: DeepseekVLHybridConfig): + super().__init__() + + in_channels = config.vision_config.hidden_size + high_res_in_channels = config.high_res_vision_config.output_channels * 4 + out_channels = config.text_config.hidden_size + + self.vision_proj = nn.Linear(in_channels, out_channels // 2) + self.high_res_vision_proj = nn.Linear(high_res_in_channels, out_channels // 2) + + self.act = nn.GELU() + self.proj = nn.Linear(out_channels, out_channels) + + def forward( + self, + vision_encodings: torch.Tensor, + high_res_vision_encodings: torch.Tensor, + ) -> torch.Tensor: + vision_encodings = self.vision_proj(vision_encodings) + high_res_vision_encodings = self.high_res_vision_proj(high_res_vision_encodings) + + encodings = torch.concat([high_res_vision_encodings, vision_encodings], dim=-1) + encodings = self.act(encodings) + encodings = self.proj(encodings) + + return encodings + + +@auto_docstring +class DeepseekVLHybridPreTrainedModel(PreTrainedModel): + config: DeepseekVLHybridConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = 
["LlamaDecoderLayer"] + _skip_keys_device_placement = ["past_key_values", "causal_mask"] + _supports_flash_attn = True + _supports_sdpa = True + + _can_compile_fullgraph = True + _supports_param_buffer_assignment = False + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=self.config.text_config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Conv2d): + nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu") + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, DeepseekVLHybridLayerNorm): + module.weight.data.fill_(1.0) + module.bias.data.zero_() + elif isinstance(module, DeepseekVLHybridModel): + module.high_res_vision_alpha.data.zero_() + + +DEEPSEEK_VL_COMMON_CUSTOM_ARGS = r""" + high_res_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size), *optional*): + The tensors corresponding to the input images. Pixel values can be obtained using + [`AutoImageProcessor`]. +""" + + +@auto_docstring +class DeepseekVLHybridModel(DeepseekVLHybridPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.output_size = config.vision_config.image_size // config.vision_config.patch_size + self.global_attn_index = config.high_res_vision_config.global_attn_indexes[0] + + self.high_res_vision_model = AutoModel.from_config(config.high_res_vision_config) + self.high_res_vision_neck = DeepseekVLSamVisionNeck(config.high_res_vision_config) + self.high_res_vision_proj = DeepseekVLSamVisionProj( + config.high_res_vision_config, output_size=self.output_size + ) + self.high_res_vision_alpha = nn.Parameter(torch.zeros(1)) + self.config = config + + self.vision_model = AutoModel.from_config(config.vision_config) + self.aligner = DeepseekVLHybridAligner(config) + + self.language_model = AutoModel.from_config(config=config.text_config) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing. + self.post_init() + + def get_input_embeddings(self): + return self.language_model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.language_model.set_input_embeddings(value) + + def get_image_features(self, pixel_values, high_res_pixel_values): + vision_encodings = self.get_low_res_image_features(pixel_values) + high_res_vision_encodings = self.get_high_res_image_features(high_res_pixel_values) + images_embeds = self.aligner(vision_encodings, high_res_vision_encodings) + return images_embeds + + def get_placeholder_mask( + self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor + ): + """ + Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is + equal to the length of multimodal features. If the lengths are different, an error is raised. 
+ """ + if input_ids is None: + special_image_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + special_image_mask = special_image_mask.all(-1) + else: + special_image_mask = input_ids == self.config.image_token_id + + n_image_tokens = special_image_mask.sum() + special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + if inputs_embeds[special_image_mask].numel() != image_features.numel(): + n_image_features = image_features.shape[0] * image_features.shape[1] + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" + ) + return special_image_mask + + @can_return_tuple + @auto_docstring(custom_args=DEEPSEEK_VL_COMMON_CUSTOM_ARGS) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + high_res_pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **kwargs, + ): + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError( + "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" + ) + + if pixel_values is not None and high_res_pixel_values is None: + raise ValueError("Both pixel_values and high_res_pixel_values should be specified at the same time") + + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings()(input_ids) + + if pixel_values is not None: + if input_ids is None: + image_attention_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + image_attention_mask = image_attention_mask.all(-1) + else: + image_attention_mask = input_ids == self.config.image_token_id + + image_attention_mask = image_attention_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + image_embeds = self.get_image_features(pixel_values, high_res_pixel_values) + image_features = image_embeds.reshape(-1, inputs_embeds.shape[-1]) + image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) + inputs_embeds = inputs_embeds.masked_scatter(image_attention_mask, image_features) + + lm_output = self.language_model( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + use_cache=use_cache, + cache_position=cache_position, + logits_to_keep=logits_to_keep, + **kwargs, + ) + + return DeepseekVLHybridBaseModelOutputWithPast( + last_hidden_state=lm_output.last_hidden_state, + past_key_values=lm_output.past_key_values, + hidden_states=lm_output.hidden_states, + attentions=lm_output.attentions, + image_hidden_states=image_embeds if pixel_values is not None else None, + ) + + def get_low_res_image_features(self, pixel_values): + output = self.vision_model(pixel_values) + output = output[0] + return output + + def get_high_res_image_features(self, pixel_values): + output = self.high_res_vision_model( + pixel_values=pixel_values, + output_hidden_states=True, + return_dict=True, + ) + last_hidden_state = output.last_hidden_state + last_hidden_state = 
self.high_res_vision_proj(last_hidden_state) + + hidden_states = output.hidden_states + global_hidden_state = hidden_states[self.global_attn_index + 1] # +1 for embedding layer + global_hidden_state = self.high_res_vision_neck(global_hidden_state) + global_hidden_state = self.high_res_vision_proj(global_hidden_state) + + output = last_hidden_state + global_hidden_state * self.high_res_vision_alpha + + # batch_size, hidden_size, height, width -> batch_size, seq_len, hidden_size + output = output.permute(0, 2, 3, 1) + output = output.reshape(output.shape[0], -1, output.shape[-1]) + + return output + + +class DeepseekVLHybridForConditionalGeneration(DeepseekVLHybridPreTrainedModel, GenerationMixin): + _tied_weights_keys = ["model.language_model.embed_tokens.weight", "lm_head.weight"] + _can_compile_fullgraph = True + + def __init__(self, config: DeepseekVLHybridConfig): + super().__init__(config) + self.config = config + self.model = DeepseekVLHybridModel(config) + self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False) + + # Initialize weights and apply final processing. + self.post_init() + + def get_input_embeddings(self): + return self.model.language_model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.model.language_model.set_input_embeddings(value) + + def prepare_embeddings_for_image_generation(self) -> torch.Tensor: + raise AttributeError("Not needed for DeepseekVLHybrid") + + @can_return_tuple + @auto_docstring(custom_args=DEEPSEEK_VL_COMMON_CUSTOM_ARGS) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + high_res_pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **kwargs: Unpack[TransformersKwargs], + ): + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. 
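+
+ Example (illustrative; assumes the Hub checkpoint ships a processor with a multimodal chat template):
+
+ ```python
+ >>> from transformers import AutoProcessor, DeepseekVLHybridForConditionalGeneration
+
+ >>> processor = AutoProcessor.from_pretrained("deepseek-community/deepseek-vl-7b-chat")
+ >>> model = DeepseekVLHybridForConditionalGeneration.from_pretrained("deepseek-community/deepseek-vl-7b-chat")
+
+ >>> messages = [
+ ...     {
+ ...         "role": "user",
+ ...         "content": [
+ ...             {"type": "image", "url": "http://images.cocodataset.org/val2017/000000039769.jpg"},
+ ...             {"type": "text", "text": "Describe this image."},
+ ...         ],
+ ...     }
+ ... ]
+ >>> inputs = processor.apply_chat_template(
+ ...     messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
+ ... )
+ >>> generated_ids = model.generate(**inputs, max_new_tokens=30)
+ >>> processor.batch_decode(generated_ids, skip_special_tokens=True)
+ ```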
+ """ + outputs = self.model( + input_ids=input_ids, + pixel_values=pixel_values, + high_res_pixel_values=high_res_pixel_values, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + hidden_states = outputs.last_hidden_state + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) + + loss = None + if labels is not None: + loss = self.loss_function( + logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs + ) + + return DeepseekVLHybridCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + image_hidden_states=outputs.image_hidden_states, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + inputs_embeds=None, + pixel_values=None, + high_res_pixel_values=None, + attention_mask=None, + cache_position=None, + logits_to_keep=None, + **kwargs, + ): + model_inputs = super().prepare_inputs_for_generation( + input_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + logits_to_keep=logits_to_keep, + **kwargs, + ) + + if cache_position[0] == 0: + # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore + # Otherwise we need pixel values to be passed to model + model_inputs["pixel_values"] = pixel_values + model_inputs["high_res_pixel_values"] = high_res_pixel_values + + return model_inputs + + +__all__ = ["DeepseekVLHybridPreTrainedModel", "DeepseekVLHybridModel", "DeepseekVLHybridForConditionalGeneration"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py new file mode 100644 index 0000000000000000000000000000000000000000..0da40603c2e939c0b9b85af929ff88a57240d5d5 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py @@ -0,0 +1,1007 @@ +# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Optional, Union + +import torch +import torch.nn as nn +from torchvision.transforms.v2 import functional as F + +from ...cache_utils import Cache +from ...image_processing_utils_fast import ( + BaseImageProcessorFast, + BatchFeature, + DefaultFastImageProcessorKwargs, + get_size_dict, + group_images_by_shape, + reorder_images, +) +from ...image_transforms import convert_to_rgb, to_channel_dimension_format +from ...image_utils import ( + OPENAI_CLIP_MEAN, + OPENAI_CLIP_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + SizeDict, + infer_channel_dimension_format, + is_scaled_image, + make_flat_list_of_images, + pil_torch_interpolation_mapping, + to_numpy_array, + valid_images, + validate_preprocess_arguments, +) +from ...processing_utils import Unpack +from ...tokenization_utils_base import ( + PreTokenizedInput, + TextInput, +) +from ...utils import ( + TensorType, + TransformersKwargs, + auto_docstring, + can_return_tuple, + filter_out_non_signature_kwargs, + logging, +) +from ..auto import CONFIG_MAPPING, AutoConfig, AutoModel +from ..deepseek_vl.configuration_deepseek_vl import DeepseekVLConfig +from ..deepseek_vl.image_processing_deepseek_vl import DeepseekVLImageProcessor +from ..deepseek_vl.image_processing_deepseek_vl_fast import DeepseekVLImageProcessorFast +from ..deepseek_vl.modeling_deepseek_vl import ( + DeepseekVLForConditionalGeneration, + DeepseekVLModel, + DeepseekVLPreTrainedModel, +) +from ..deepseek_vl.processing_deepseek_vl import DeepseekVLProcessor, DeepseekVLProcessorKwargs +from ..idefics.modeling_idefics import IdeficsBaseModelOutputWithPast, IdeficsCausalLMOutputWithPast +from ..sam.modeling_sam import SamLayerNorm, SamVisionNeck + + +logger = logging.get_logger(__name__) + + +DEEPSEEK_VL_COMMON_CUSTOM_ARGS = r""" + high_res_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size), *optional*): + The tensors corresponding to the input images. Pixel values can be obtained using + [`AutoImageProcessor`]. +""" + + +class DeepseekVLHybridConfig(DeepseekVLConfig): + r""" + This is the configuration class to store the configuration of a [`DeepseekVLHybridModel`]. It is used to instantiate a + DeepseekVLHybrid model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the DeepseekVLHybrid + [deepseek-community/deepseek-vl-7b-chat](https://huggingface.co/deepseek-community/deepseek-vl-7b-chat) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `LlamaConfig`): + The config object or dictionary of the text backbone. + vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `SiglipVisionConfig`): + The config object or dictionary of the vision backbone. + high_res_vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `SamVisionConfig`): + The config object or dictionary of the high resolution vision backbone. + image_token_id (`int`, *optional*, defaults to 100015): + The index representing image tokens in the model's token vocabulary. 
+ + Example: + + ```python + >>> from transformers import DeepseekVLHybridConfig, DeepseekVLHybridModel + + >>> # Initializing a DeepseekVLHybrid deepseek-community/deepseek-vl-7b-chat style configuration + >>> configuration = DeepseekVLHybridConfig() + + >>> # Initializing a model (with random weights) from the deepseek-community/deepseek-vl-7b-chat style configuration + >>> model = DeepseekVLHybridModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "deepseek_vl_hybrid" + sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig, "high_res_vision_config": AutoConfig} + + def __init__( + self, + text_config: Optional[AutoConfig] = None, + vision_config: Optional[AutoConfig] = None, + high_res_vision_config: Optional[AutoConfig] = None, + image_token_id: int = 100015, + **kwargs, + ): + super().__init__( + text_config=text_config, + vision_config=vision_config, + image_token_id=image_token_id, + **kwargs, + ) + + if high_res_vision_config is None: + high_res_vision_config = {} + logger.info("`high_res_vision_config` is `None`. Initializing the `SamVisionConfig` with default values.") + + if isinstance(high_res_vision_config, dict): + high_res_vision_config["model_type"] = high_res_vision_config.get("model_type", "sam_vision_model") + high_res_vision_config = CONFIG_MAPPING[high_res_vision_config["model_type"]](**high_res_vision_config) + + self.high_res_vision_config = high_res_vision_config + + +class DeepseekVLHybridBaseModelOutputWithPast(IdeficsBaseModelOutputWithPast): + pass + + +class DeepseekVLHybridCausalLMOutputWithPast(IdeficsCausalLMOutputWithPast): + pass + + +class DeepseekVLHybridLayerNorm(SamLayerNorm): + pass + + +class DeepseekVLSamVisionNeck(SamVisionNeck): + def __init__(self, config): + super().__init__(config) + + +class DeepseekVLSamVisionProj(nn.Module): + def __init__(self, config, output_size: int = 24): + super().__init__() + self.config = config + self.output_size = output_size + + self.conv1 = nn.Conv2d( + config.output_channels, config.output_channels * 2, kernel_size=3, stride=2, padding=1, bias=False + ) + self.conv2 = nn.Conv2d( + config.output_channels * 2, config.output_channels * 4, kernel_size=3, stride=2, padding=1, bias=False + ) + + def forward(self, features: torch.Tensor) -> torch.Tensor: + # interpolate Sam encodings to match Siglip encodings + features = torch.nn.functional.interpolate( + features, + size=(4 * self.output_size, 4 * self.output_size), + mode="bilinear", + align_corners=False, + ) + features = self.conv1(features) + features = self.conv2(features) + return features + + +class DeepseekVLHybridAligner(nn.Module): + def __init__(self, config: DeepseekVLHybridConfig): + super().__init__() + + in_channels = config.vision_config.hidden_size + high_res_in_channels = config.high_res_vision_config.output_channels * 4 + out_channels = config.text_config.hidden_size + + self.vision_proj = nn.Linear(in_channels, out_channels // 2) + self.high_res_vision_proj = nn.Linear(high_res_in_channels, out_channels // 2) + + self.act = nn.GELU() + self.proj = nn.Linear(out_channels, out_channels) + + def forward( + self, + vision_encodings: torch.Tensor, + high_res_vision_encodings: torch.Tensor, + ) -> torch.Tensor: + vision_encodings = self.vision_proj(vision_encodings) + high_res_vision_encodings = self.high_res_vision_proj(high_res_vision_encodings) + + encodings = torch.concat([high_res_vision_encodings, vision_encodings], dim=-1) + encodings = self.act(encodings) + 
encodings = self.proj(encodings) + + return encodings + + +class DeepseekVLHybridPreTrainedModel(DeepseekVLPreTrainedModel): + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=self.config.text_config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Conv2d): + nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu") + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, DeepseekVLHybridLayerNorm): + module.weight.data.fill_(1.0) + module.bias.data.zero_() + elif isinstance(module, DeepseekVLHybridModel): + module.high_res_vision_alpha.data.zero_() + + +class DeepseekVLHybridModel(DeepseekVLModel): + def __init__(self, config): + self.output_size = config.vision_config.image_size // config.vision_config.patch_size + self.global_attn_index = config.high_res_vision_config.global_attn_indexes[0] + + self.high_res_vision_model = AutoModel.from_config(config.high_res_vision_config) + self.high_res_vision_neck = DeepseekVLSamVisionNeck(config.high_res_vision_config) + self.high_res_vision_proj = DeepseekVLSamVisionProj( + config.high_res_vision_config, output_size=self.output_size + ) + self.high_res_vision_alpha = nn.Parameter(torch.zeros(1)) + + super().__init__(config) + + def get_low_res_image_features(self, pixel_values): + output = self.vision_model(pixel_values) + output = output[0] + return output + + def get_high_res_image_features(self, pixel_values): + output = self.high_res_vision_model( + pixel_values=pixel_values, + output_hidden_states=True, + return_dict=True, + ) + last_hidden_state = output.last_hidden_state + last_hidden_state = self.high_res_vision_proj(last_hidden_state) + + hidden_states = output.hidden_states + global_hidden_state = hidden_states[self.global_attn_index + 1] # +1 for embedding layer + global_hidden_state = self.high_res_vision_neck(global_hidden_state) + global_hidden_state = self.high_res_vision_proj(global_hidden_state) + + output = last_hidden_state + global_hidden_state * self.high_res_vision_alpha + + # batch_size, hidden_size, height, width -> batch_size, seq_len, hidden_size + output = output.permute(0, 2, 3, 1) + output = output.reshape(output.shape[0], -1, output.shape[-1]) + + return output + + def get_image_features(self, pixel_values, high_res_pixel_values): + vision_encodings = self.get_low_res_image_features(pixel_values) + high_res_vision_encodings = self.get_high_res_image_features(high_res_pixel_values) + images_embeds = self.aligner(vision_encodings, high_res_vision_encodings) + return images_embeds + + @can_return_tuple + @auto_docstring(custom_args=DEEPSEEK_VL_COMMON_CUSTOM_ARGS) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + high_res_pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **kwargs, + ): + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError( + "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" + ) + + if pixel_values is not None and high_res_pixel_values is 
None: + raise ValueError("Both pixel_values and high_res_pixel_values should be specified at the same time") + + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings()(input_ids) + + if pixel_values is not None: + if input_ids is None: + image_attention_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + image_attention_mask = image_attention_mask.all(-1) + else: + image_attention_mask = input_ids == self.config.image_token_id + + image_attention_mask = image_attention_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + image_embeds = self.get_image_features(pixel_values, high_res_pixel_values) + image_features = image_embeds.reshape(-1, inputs_embeds.shape[-1]) + image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) + inputs_embeds = inputs_embeds.masked_scatter(image_attention_mask, image_features) + + lm_output = self.language_model( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + use_cache=use_cache, + cache_position=cache_position, + logits_to_keep=logits_to_keep, + **kwargs, + ) + + return DeepseekVLHybridBaseModelOutputWithPast( + last_hidden_state=lm_output.last_hidden_state, + past_key_values=lm_output.past_key_values, + hidden_states=lm_output.hidden_states, + attentions=lm_output.attentions, + image_hidden_states=image_embeds if pixel_values is not None else None, + ) + + +class DeepseekVLHybridForConditionalGeneration(DeepseekVLForConditionalGeneration): + @can_return_tuple + @auto_docstring(custom_args=DEEPSEEK_VL_COMMON_CUSTOM_ARGS) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + high_res_pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **kwargs: Unpack[TransformersKwargs], + ): + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. 
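+
+ Example (an illustrative sketch; the checkpoint id follows the `deepseek-community/deepseek-vl-7b-chat` reference used in `DeepseekVLHybridConfig`'s docstring, and real outputs depend on the checkpoint):
+
+ ```python
+ >>> import requests
+ >>> from PIL import Image
+ >>> from transformers import AutoProcessor, DeepseekVLHybridForConditionalGeneration
+
+ >>> model = DeepseekVLHybridForConditionalGeneration.from_pretrained("deepseek-community/deepseek-vl-7b-chat")
+ >>> processor = AutoProcessor.from_pretrained("deepseek-community/deepseek-vl-7b-chat")
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+ >>> # the processor expands the image placeholder and returns both pixel_values and high_res_pixel_values
+ >>> prompt = f"{processor.image_token}\nDescribe this image."
+ >>> inputs = processor(text=prompt, images=image, return_tensors="pt")
+
+ >>> generated_ids = model.generate(**inputs, max_new_tokens=32)
+ >>> print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])
+ ```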
+ """ + outputs = self.model( + input_ids=input_ids, + pixel_values=pixel_values, + high_res_pixel_values=high_res_pixel_values, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + hidden_states = outputs.last_hidden_state + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) + + loss = None + if labels is not None: + loss = self.loss_function( + logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs + ) + + return DeepseekVLHybridCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + image_hidden_states=outputs.image_hidden_states, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + inputs_embeds=None, + pixel_values=None, + high_res_pixel_values=None, + attention_mask=None, + cache_position=None, + logits_to_keep=None, + **kwargs, + ): + model_inputs = super().prepare_inputs_for_generation( + input_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + logits_to_keep=logits_to_keep, + **kwargs, + ) + + if cache_position[0] == 0: + # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore + # Otherwise we need pixel values to be passed to model + model_inputs["pixel_values"] = pixel_values + model_inputs["high_res_pixel_values"] = high_res_pixel_values + + return model_inputs + + +class DeepseekVLHybridImageProcessor(DeepseekVLImageProcessor): + r""" + Constructs a DEEPSEEK_VL_HYBRID image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the + `do_resize` parameter in the `preprocess` method. + size (`dict`, *optional*, defaults to `{"height": 384, "width": 384}`): + Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess` + method. + high_res_size (`dict`, *optional*, defaults to `{"height": 1024, "width": 1024}`): + Size of the high resolution output image after resizing. Can be overridden by the `high_res_size` parameter in the `preprocess` + method. + min_size (`int`, *optional*, defaults to 14): + The minimum allowed size for the resized image. Ensures that neither the height nor width + falls below this value after resizing. + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): + Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be + overridden by the `resample` parameter in the `preprocess` method. + high_res_resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): + Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be + overridden by the `high_res_resample` parameter in the `preprocess` method. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. 
Can be overridden by the + `do_rescale` parameter in the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Only has an effect if `do_rescale` is set to `True`. Can be + overridden by the `rescale_factor` parameter in the `preprocess` method. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` + method. Can be overridden by the `do_normalize` parameter in the `preprocess` method. + image_mean (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. Can be + overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. + Can be overridden by the `image_std` parameter in the `preprocess` method. + high_res_image_mean (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`): + Mean to use if normalizing the high resolution image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `high_res_image_mean` parameter in the `preprocess` method. + high_res_image_std (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_STD`): + Standard deviation to use if normalizing the high resolution image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `high_res_image_std` parameter in the `preprocess` method. + do_convert_rgb (`bool`, *optional*, defaults to `True`): + Whether to convert the image to RGB. + do_pad (`bool`, *optional*, defaults to `True`): + Whether to pad the image to square or not. 
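+
+ Example (an illustrative sketch assuming the default `size` and `high_res_size` above; a blank RGB image is used only to show the two returned tensors):
+
+ ```python
+ >>> from PIL import Image
+ >>> from transformers import DeepseekVLHybridImageProcessor
+
+ >>> image_processor = DeepseekVLHybridImageProcessor()
+ >>> image = Image.new("RGB", (640, 480))
+
+ >>> outputs = image_processor(images=image, return_tensors="pt")
+ >>> outputs["pixel_values"].shape  # low-resolution (SigLIP) input
+ torch.Size([1, 3, 384, 384])
+ >>> outputs["high_res_pixel_values"].shape  # high-resolution (SAM) input
+ torch.Size([1, 3, 1024, 1024])
+ ```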
+ """ + + model_input_names = ["pixel_values", "high_res_pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: Optional[dict[str, int]] = None, + high_res_size: Optional[dict[str, int]] = None, + min_size: int = 14, + resample: PILImageResampling = PILImageResampling.BICUBIC, + high_res_resample: PILImageResampling = PILImageResampling.BICUBIC, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + high_res_image_mean: Optional[Union[float, list[float]]] = None, + high_res_image_std: Optional[Union[float, list[float]]] = None, + do_convert_rgb: Optional[bool] = None, + do_pad: bool = True, + **kwargs, + ) -> None: + high_res_size = high_res_size if high_res_size is not None else {"height": 1024, "width": 1024} + high_res_size = get_size_dict(high_res_size, default_to_square=True) + + self.high_res_size = high_res_size + self.high_res_image_mean = high_res_image_mean if high_res_image_mean is not None else OPENAI_CLIP_MEAN + self.high_res_image_std = high_res_image_std if high_res_image_std is not None else OPENAI_CLIP_STD + + self.resample = resample + self.high_res_resample = high_res_resample + + super().__init__( + do_resize=do_resize, + size=size, + min_size=min_size, + resample=resample, + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_convert_rgb=do_convert_rgb, + do_pad=do_pad, + **kwargs, + ) + + if high_res_image_mean is None: + self.high_res_background_color = (127, 127, 127) + else: + self.high_res_background_color = tuple(int(x * 255) for x in high_res_image_mean) + + @filter_out_non_signature_kwargs() + def preprocess( + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Optional[dict[str, int]] = None, + high_res_size: Optional[dict[str, int]] = None, + resample: Optional[PILImageResampling] = None, + high_res_resample: Optional[PILImageResampling] = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + high_res_image_mean: Optional[Union[float, list[float]]] = None, + high_res_image_std: Optional[Union[float, list[float]]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + do_convert_rgb: Optional[bool] = None, + do_pad: Optional[bool] = None, + background_color: Optional[tuple[int, int, int]] = None, + ): + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_rescale=False`. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to `self.size`): + Dictionary in the format `{"height": h, "width": w}` specifying the size of the output image after + resizing. + high_res_size (`Dict[str, int]`, *optional*, defaults to `self.high_res_size`): + Dictionary in the format `{"height": h, "width": w}` specifying the size of the high resolution output image after + resizing. 
+ resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`): + `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has + an effect if `do_resize` is set to `True`. + high_res_resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`): + `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BICUBIC`. Only has + an effect if `do_resize` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image values between [0 - 1]. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): + Image mean to use if `do_normalize` is set to `True`. + image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to use if `do_normalize` is set to `True`. + high_res_image_mean (`float` or `List[float]`, *optional*, defaults to `self.high_res_image_mean`): + Image mean to use if `do_normalize` is set to `True`. + high_res_image_std (`float` or `List[float]`, *optional*, defaults to `self.high_res_image_std`): + Image standard deviation to use if `do_normalize` is set to `True`. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): + Whether to convert the image to RGB. + do_pad (`bool`, *optional*, defaults to `self.do_pad`): + Whether to pad the image to square or not. + background_color (`tuple[int, int, int]`): + The background color to use for the padding. 
+ """ + do_resize = do_resize if do_resize is not None else self.do_resize + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + resample = resample if resample is not None else self.resample + high_res_resample = high_res_resample if high_res_resample is not None else self.high_res_resample + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + high_res_image_mean = high_res_image_mean if high_res_image_mean is not None else self.high_res_image_mean + high_res_image_std = high_res_image_std if high_res_image_std is not None else self.high_res_image_std + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + do_pad = do_pad if do_pad is not None else self.do_pad + background_color = background_color if background_color is not None else self.background_color + + size = size if size is not None else self.size + size_dict = get_size_dict(size) + high_res_size = high_res_size if high_res_size is not None else self.high_res_size + high_res_size_dict = get_size_dict(high_res_size) + + images = self.fetch_images(images) + images = make_flat_list_of_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + validate_preprocess_arguments( + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_resize=do_resize, + size=size, + resample=resample, + ) + + if do_convert_rgb: + images = [convert_to_rgb(image) for image in images] + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if do_rescale and is_scaled_image(images[0]): + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. 
+ input_data_format = infer_channel_dimension_format(images[0]) + + all_images = [] + all_high_res_images = [] + for image in images: + # high_res_image: resize (high) -> rescale -> normalize (high) + # low_res_image: resize (high) -> rescale -> resize (low) -> normalize (low) + high_res_image = image + if do_resize: + high_res_image = self.resize( + image=high_res_image, + size=high_res_size_dict, + resample=high_res_resample, + input_data_format=input_data_format, + ) + if do_pad: + # Expand and pad the images to obtain a square image of dimensions `size x size` + high_res_image = self.pad_to_square( + image=high_res_image, + background_color=background_color, + input_data_format=input_data_format, + ) + image = self.resize( + image=high_res_image, + size=size_dict, + resample=resample, + input_data_format=input_data_format, + ) + if do_pad: + image = self.pad_to_square( + image=image, + background_color=background_color, + input_data_format=input_data_format, + ) + + if do_rescale: + image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) + high_res_image = self.rescale( + image=high_res_image, scale=rescale_factor, input_data_format=input_data_format + ) + + if do_normalize: + image = self.normalize( + image=image, mean=image_mean, std=image_std, input_data_format=input_data_format + ) + high_res_image = self.normalize( + image=high_res_image, + mean=high_res_image_mean, + std=high_res_image_std, + input_data_format=input_data_format, + ) + + image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) + high_res_image = to_channel_dimension_format( + high_res_image, data_format, input_channel_dim=input_data_format + ) + + all_images.append(image) + all_high_res_images.append(high_res_image) + + data = {"pixel_values": all_images, "high_res_pixel_values": all_high_res_images} + return BatchFeature(data=data, tensor_type=return_tensors) + + +class DeepseekVLHybridFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): + r""" + min_size (`int`, *optional*, defaults to 14): + The minimum allowed size for the resized image. Ensures that neither the height nor width + falls below this value after resizing. + high_res_size (`dict`, *optional*, defaults to `{"height": 1024, "width": 1024}`): + Size of the high resolution output image after resizing. Can be overridden by the `high_res_size` parameter in the `preprocess` + method. + high_res_resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): + Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be + overridden by the `high_res_resample` parameter in the `preprocess` method. + high_res_image_mean (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`): + Mean to use if normalizing the high resolution image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `high_res_image_mean` parameter in the `preprocess` method. + high_res_image_std (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_STD`): + Standard deviation to use if normalizing the high resolution image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `high_res_image_std` parameter in the `preprocess` method. 
+ """ + + min_size: int + high_res_size: dict + high_res_resample: "PILImageResampling" + high_res_image_mean: list[float] + high_res_image_std: list[float] + + +class DeepseekVLHybridImageProcessorFast(DeepseekVLImageProcessorFast): + high_res_image_mean = OPENAI_CLIP_MEAN + high_res_image_std = OPENAI_CLIP_STD + high_res_size = {"height": 1024, "width": 1024} + high_res_resample = PILImageResampling.BICUBIC + model_input_names = ["pixel_values", "high_res_pixel_values"] + + def __init__(self, **kwargs: Unpack[DeepseekVLHybridFastImageProcessorKwargs]): + if kwargs.get("image_mean") is None: + background_color = (127, 127, 127) + else: + background_color = tuple([int(x * 255) for x in kwargs.get("image_mean")]) + if kwargs.get("high_res_image_mean") is None: + high_res_background_color = (127, 127, 127) + else: + high_res_background_color = tuple(int(x * 255) for x in kwargs.get("high_res_image_mean")) + BaseImageProcessorFast.__init__(self, **kwargs) + self.background_color = tuple(background_color) + self.high_res_background_color = tuple(high_res_background_color) + + def _further_process_kwargs( + self, + size: Optional[SizeDict] = None, + high_res_size: Optional[SizeDict] = None, + default_to_square: Optional[bool] = None, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + high_res_image_mean: Optional[Union[float, list[float]]] = None, + high_res_image_std: Optional[Union[float, list[float]]] = None, + data_format: Optional[ChannelDimension] = None, + **kwargs, + ) -> dict: + """ + Update kwargs that need further processing before being validated + Can be overridden by subclasses to customize the processing of kwargs. + """ + if kwargs is None: + kwargs = {} + if size is not None: + size = SizeDict(**get_size_dict(size=size, default_to_square=default_to_square)) + if high_res_size is not None: + high_res_size = SizeDict(**get_size_dict(size=high_res_size, default_to_square=default_to_square)) + if isinstance(image_mean, list): + image_mean = tuple(image_mean) + if isinstance(image_std, list): + image_std = tuple(image_std) + if isinstance(high_res_image_mean, list): + high_res_image_mean = tuple(high_res_image_mean) + if isinstance(high_res_image_std, list): + high_res_image_std = tuple(high_res_image_std) + if data_format is None: + data_format = ChannelDimension.FIRST + + high_res_resample = kwargs.pop("high_res_resample") + kwargs["high_res_interpolation"] = ( + pil_torch_interpolation_mapping[high_res_resample] + if isinstance(high_res_resample, (int, PILImageResampling)) + else high_res_resample + ) + + low_res_resample = kwargs.pop("resample") + kwargs["interpolation"] = ( + pil_torch_interpolation_mapping[low_res_resample] + if isinstance(low_res_resample, (int, PILImageResampling)) + else low_res_resample + ) + + kwargs["size"] = size + kwargs["high_res_size"] = high_res_size + kwargs["image_mean"] = image_mean + kwargs["image_std"] = image_std + kwargs["high_res_image_mean"] = high_res_image_mean + kwargs["high_res_image_std"] = high_res_image_std + kwargs["data_format"] = data_format + + return kwargs + + def _preprocess( + self, + images: list["torch.Tensor"], + do_resize: bool, + size: SizeDict, + high_res_size: SizeDict, + min_size: int, + interpolation: Optional["F.InterpolationMode"], + high_res_interpolation: Optional["F.InterpolationMode"], + do_rescale: bool, + rescale_factor: float, + do_normalize: bool, + image_mean: Optional[Union[float, list[float]]], + image_std: Optional[Union[float, 
list[float]]], + high_res_image_mean: Optional[Union[float, list[float]]], + high_res_image_std: Optional[Union[float, list[float]]], + disable_grouping: Optional[bool], + return_tensors: Optional[Union[str, TensorType]], + do_pad: bool = True, + **kwargs, + ) -> BatchFeature: + # Group images by size for batched resizing + grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping) + high_res_resized_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + if do_resize: + stacked_high_res_images = self.resize( + image=stacked_images, size=high_res_size, min_size=min_size, interpolation=high_res_interpolation + ) + high_res_resized_images_grouped[shape] = stacked_high_res_images + high_res_resized_images = reorder_images(high_res_resized_images_grouped, grouped_images_index) + + # Group images by size for further processing + # Needed in case do_resize is False, or resize returns images with different sizes + grouped_high_res_images, grouped_high_res_images_index = group_images_by_shape( + high_res_resized_images, disable_grouping=disable_grouping + ) + high_res_padded_images = {} + high_res_processed_images_grouped = {} + for shape, stacked_high_res_images in grouped_high_res_images.items(): + if do_pad: + stacked_high_res_images = self.pad_to_square( + stacked_high_res_images, background_color=self.high_res_background_color + ) + high_res_padded_images[shape] = stacked_high_res_images + # Fused rescale and normalize + stacked_high_res_images = self.rescale_and_normalize( + stacked_high_res_images, + do_rescale, + rescale_factor, + do_normalize, + high_res_image_mean, + high_res_image_std, + ) + high_res_processed_images_grouped[shape] = stacked_high_res_images + high_res_processed_images = reorder_images(high_res_processed_images_grouped, grouped_high_res_images_index) + high_res_processed_images = ( + torch.stack(high_res_processed_images, dim=0) if return_tensors else high_res_processed_images + ) + + resized_images_grouped = {} + for shape, stacked_high_res_padded_images in high_res_padded_images.items(): + if do_resize: + stacked_images = self.resize( + image=stacked_high_res_padded_images, size=size, min_size=min_size, interpolation=interpolation + ) + resized_images_grouped[shape] = stacked_images + resized_images = reorder_images(resized_images_grouped, grouped_high_res_images_index) + + grouped_resized_images, grouped_resized_images_index = group_images_by_shape( + resized_images, disable_grouping=disable_grouping + ) + processed_images_grouped = {} + for shape, stacked_images in grouped_resized_images.items(): + if do_pad: + stacked_images = self.pad_to_square(stacked_images, background_color=self.background_color) + # Fused rescale and normalize + stacked_images = self.rescale_and_normalize( + stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std + ) + processed_images_grouped[shape] = stacked_images + processed_images = reorder_images(processed_images_grouped, grouped_resized_images_index) + processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images + + return BatchFeature( + data={"pixel_values": processed_images, "high_res_pixel_values": high_res_processed_images}, + tensor_type=return_tensors, + ) + + +class DeepseekVLHybridProcessorKwargs(DeepseekVLProcessorKwargs): + pass + + +class DeepseekVLHybridProcessor(DeepseekVLProcessor): + def __call__( + self, + text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, + 
images: Optional[ImageInput] = None, + **kwargs: Unpack[DeepseekVLHybridProcessorKwargs], + ) -> BatchFeature: + """ + Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` + and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode + the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to + DeepseekVLHybridImageProcessor's [`~DeepseekVLHybridImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring + of the above two methods for more information. + + Args: + text (`str`, `List[str]`, `List[List[str]]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. Both channels-first and channels-last formats are supported. + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + """ + output_kwargs = self._merge_kwargs( + DeepseekVLHybridProcessorKwargs, tokenizer_init_kwargs=self.tokenizer.init_kwargs, **kwargs + ) + if text is None and images is None: + raise ValueError("You must specify either text or images.") + + if text is not None: + if isinstance(text, str): + text = [text] + elif not (isinstance(text, (list, tuple)) and all(isinstance(t, str) for t in text)): + raise ValueError("Invalid input text. 
Please provide a string, or a list of strings") + + prompt_strings = [] + one_img_tokens = self.image_token * self.num_image_tokens + for prompt in text: + prompt = prompt.replace(self.image_token, one_img_tokens) + prompt_strings.append(prompt) + + data = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"]) + + # process images if pixel_values are provided + if images is not None: + inputs = self.image_processor(images, **output_kwargs["images_kwargs"]) + data["pixel_values"] = inputs["pixel_values"] + data["high_res_pixel_values"] = inputs["high_res_pixel_values"] + + return BatchFeature(data=data) + + +__all__ = [ + "DeepseekVLHybridConfig", + "DeepseekVLHybridPreTrainedModel", + "DeepseekVLHybridModel", + "DeepseekVLHybridForConditionalGeneration", + "DeepseekVLHybridImageProcessor", + "DeepseekVLHybridImageProcessorFast", + "DeepseekVLHybridProcessor", +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_vl_hybrid/processing_deepseek_vl_hybrid.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_vl_hybrid/processing_deepseek_vl_hybrid.py new file mode 100644 index 0000000000000000000000000000000000000000..538fea5a6b322ddf0b8a0902915e726eab7420f0 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deepseek_vl_hybrid/processing_deepseek_vl_hybrid.py @@ -0,0 +1,158 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_deepseek_vl_hybrid.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional, Union + +from ...image_processing_utils_fast import BatchFeature +from ...image_utils import ImageInput +from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack +from ...tokenization_utils_base import PreTokenizedInput, TextInput + + +class DeepseekVLHybridProcessorKwargs(ProcessingKwargs, total=False): + _defaults = { + "text_kwargs": {"padding": False}, + "common_kwargs": {"return_tensors": "pt"}, + } + + +class DeepseekVLHybridProcessor(ProcessorMixin): + r""" + Constructs a DeepseekVLHybrid processor which wraps a DeepseekVLHybrid Image Processor and a Llama tokenizer into a single processor. + + [`DeepseekVLHybridProcessor`] offers all the functionalities of [`DeepseekVLHybridImageProcessor`] and [`LlamaTokenizerFast`]. See the + [`~DeepseekVLHybridProcessor.__call__`] and [`~DeepseekVLHybridProcessor.decode`] for more information. + + Args: + image_processor ([`DeepseekVLHybridImageProcessor`]): + The image processor is a required input. 
+ tokenizer ([`LlamaTokenizerFast`]): + The tokenizer is a required input. + chat_template (`str`, *optional*): + A Jinja template which will be used to convert lists of messages + in a chat into a tokenizable string. + num_image_tokens (`int`, *optional*, defaults to 576): + The number of special image tokens used as placeholders for visual content in text sequences. + """ + + attributes = ["image_processor", "tokenizer"] + valid_kwargs = ["chat_template", "num_image_tokens"] + image_processor_class = "AutoImageProcessor" + tokenizer_class = "AutoTokenizer" + + def __init__( + self, + image_processor, + tokenizer, + chat_template=None, + num_image_tokens=576, + ): + self.image_token = tokenizer.image_token + self.num_image_tokens = num_image_tokens + + super().__init__(image_processor, tokenizer, chat_template=chat_template) + + def __call__( + self, + text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, + images: Optional[ImageInput] = None, + **kwargs: Unpack[DeepseekVLHybridProcessorKwargs], + ) -> BatchFeature: + """ + Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` + and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode + the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to + DeepseekVLHybridImageProcessor's [`~DeepseekVLHybridImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring + of the above two methods for more information. + + Args: + text (`str`, `List[str]`, `List[List[str]]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. Both channels-first and channels-last formats are supported. + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + """ + output_kwargs = self._merge_kwargs( + DeepseekVLHybridProcessorKwargs, tokenizer_init_kwargs=self.tokenizer.init_kwargs, **kwargs + ) + if text is None and images is None: + raise ValueError("You must specify either text or images.") + + if text is not None: + if isinstance(text, str): + text = [text] + elif not (isinstance(text, (list, tuple)) and all(isinstance(t, str) for t in text)): + raise ValueError("Invalid input text. 
Please provide a string, or a list of strings") + + prompt_strings = [] + one_img_tokens = self.image_token * self.num_image_tokens + for prompt in text: + prompt = prompt.replace(self.image_token, one_img_tokens) + prompt_strings.append(prompt) + + data = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"]) + + # process images if pixel_values are provided + if images is not None: + inputs = self.image_processor(images, **output_kwargs["images_kwargs"]) + data["pixel_values"] = inputs["pixel_values"] + data["high_res_pixel_values"] = inputs["high_res_pixel_values"] + + return BatchFeature(data=data) + + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. + """ + return self.tokenizer.batch_decode(*args, **kwargs) + + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to + the docstring of this method for more information. + """ + return self.tokenizer.decode(*args, **kwargs) + + @property + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) + + +__all__ = ["DeepseekVLHybridProcessor"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deformable_detr/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deformable_detr/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..16a1959c30ff5474ed82a6b0a2e22896514d6a44 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deformable_detr/__init__.py @@ -0,0 +1,32 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
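+ # Lazy module setup: the submodules listed under TYPE_CHECKING are only imported when one of
+ # their attributes is first accessed, via `_LazyModule` and `define_import_structure` below.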
+ + +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_deformable_detr import * + from .feature_extraction_deformable_detr import * + from .image_processing_deformable_detr import * + from .image_processing_deformable_detr_fast import * + from .modeling_deformable_detr import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deformable_detr/configuration_deformable_detr.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deformable_detr/configuration_deformable_detr.py new file mode 100644 index 0000000000000000000000000000000000000000..b85a7399908d4f25ce220aa1edab4b0e74c9bdb3 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deformable_detr/configuration_deformable_detr.py @@ -0,0 +1,290 @@ +# coding=utf-8 +# Copyright 2022 SenseTime and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Deformable DETR model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging +from ...utils.backbone_utils import verify_backbone_config_arguments +from ..auto import CONFIG_MAPPING + + +logger = logging.get_logger(__name__) + + +class DeformableDetrConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`DeformableDetrModel`]. It is used to instantiate + a Deformable DETR model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the Deformable DETR + [SenseTime/deformable-detr](https://huggingface.co/SenseTime/deformable-detr) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + use_timm_backbone (`bool`, *optional*, defaults to `True`): + Whether or not to use the `timm` library for the backbone. If set to `False`, will use the [`AutoBackbone`] + API. + backbone_config (`PretrainedConfig` or `dict`, *optional*): + The configuration of the backbone model. Only used in case `use_timm_backbone` is set to `False` in which + case it will default to `ResNetConfig()`. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + num_queries (`int`, *optional*, defaults to 300): + Number of object queries, i.e. detection slots. This is the maximal number of objects + [`DeformableDetrModel`] can detect in a single image. In case `two_stage` is set to `True`, we use + `two_stage_num_proposals` instead. + d_model (`int`, *optional*, defaults to 256): + Dimension of the layers. 
+ encoder_layers (`int`, *optional*, defaults to 6): + Number of encoder layers. + decoder_layers (`int`, *optional*, defaults to 6): + Number of decoder layers. + encoder_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer encoder. + decoder_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer decoder. + decoder_ffn_dim (`int`, *optional*, defaults to 1024): + Dimension of the "intermediate" (often named feed-forward) layer in decoder. + encoder_ffn_dim (`int`, *optional*, defaults to 1024): + Dimension of the "intermediate" (often named feed-forward) layer in decoder. + activation_function (`str` or `function`, *optional*, defaults to `"relu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + activation_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for activations inside the fully connected layer. + init_std (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + init_xavier_std (`float`, *optional*, defaults to 1): + The scaling factor used for the Xavier initialization gain in the HM Attention map module. + encoder_layerdrop (`float`, *optional*, defaults to 0.0): + The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://huggingface.co/papers/1909.11556) + for more details. + auxiliary_loss (`bool`, *optional*, defaults to `False`): + Whether auxiliary decoding losses (loss at each decoder layer) are to be used. + position_embedding_type (`str`, *optional*, defaults to `"sine"`): + Type of position embeddings to be used on top of the image features. One of `"sine"` or `"learned"`. + backbone (`str`, *optional*, defaults to `"resnet50"`): + Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this + will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone` + is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. + use_pretrained_backbone (`bool`, *optional*, defaults to `True`): + Whether to use pretrained weights for the backbone. + backbone_kwargs (`dict`, *optional*): + Keyword arguments to be passed to AutoBackbone when loading from a checkpoint + e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set. + dilation (`bool`, *optional*, defaults to `False`): + Whether to replace stride with dilation in the last convolutional block (DC5). Only supported when + `use_timm_backbone` = `True`. + class_cost (`float`, *optional*, defaults to 1): + Relative weight of the classification error in the Hungarian matching cost. + bbox_cost (`float`, *optional*, defaults to 5): + Relative weight of the L1 error of the bounding box coordinates in the Hungarian matching cost. + giou_cost (`float`, *optional*, defaults to 2): + Relative weight of the generalized IoU loss of the bounding box in the Hungarian matching cost. 
+ mask_loss_coefficient (`float`, *optional*, defaults to 1): + Relative weight of the Focal loss in the panoptic segmentation loss. + dice_loss_coefficient (`float`, *optional*, defaults to 1): + Relative weight of the DICE/F-1 loss in the panoptic segmentation loss. + bbox_loss_coefficient (`float`, *optional*, defaults to 5): + Relative weight of the L1 bounding box loss in the object detection loss. + giou_loss_coefficient (`float`, *optional*, defaults to 2): + Relative weight of the generalized IoU loss in the object detection loss. + eos_coefficient (`float`, *optional*, defaults to 0.1): + Relative classification weight of the 'no-object' class in the object detection loss. + num_feature_levels (`int`, *optional*, defaults to 4): + The number of input feature levels. + encoder_n_points (`int`, *optional*, defaults to 4): + The number of sampled keys in each feature level for each attention head in the encoder. + decoder_n_points (`int`, *optional*, defaults to 4): + The number of sampled keys in each feature level for each attention head in the decoder. + two_stage (`bool`, *optional*, defaults to `False`): + Whether to apply a two-stage deformable DETR, where the region proposals are also generated by a variant of + Deformable DETR, which are further fed into the decoder for iterative bounding box refinement. + two_stage_num_proposals (`int`, *optional*, defaults to 300): + The number of region proposals to be generated, in case `two_stage` is set to `True`. + with_box_refine (`bool`, *optional*, defaults to `False`): + Whether to apply iterative bounding box refinement, where each decoder layer refines the bounding boxes + based on the predictions from the previous layer. + focal_alpha (`float`, *optional*, defaults to 0.25): + Alpha parameter in the focal loss. + disable_custom_kernels (`bool`, *optional*, defaults to `False`): + Disable the use of custom CUDA and CPU kernels. This option is necessary for the ONNX export, as custom + kernels are not supported by PyTorch ONNX export. 
+ + Examples: + + ```python + >>> from transformers import DeformableDetrConfig, DeformableDetrModel + + >>> # Initializing a Deformable DETR SenseTime/deformable-detr style configuration + >>> configuration = DeformableDetrConfig() + + >>> # Initializing a model (with random weights) from the SenseTime/deformable-detr style configuration + >>> model = DeformableDetrModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "deformable_detr" + attribute_map = { + "hidden_size": "d_model", + "num_attention_heads": "encoder_attention_heads", + } + + def __init__( + self, + use_timm_backbone=True, + backbone_config=None, + num_channels=3, + num_queries=300, + max_position_embeddings=1024, + encoder_layers=6, + encoder_ffn_dim=1024, + encoder_attention_heads=8, + decoder_layers=6, + decoder_ffn_dim=1024, + decoder_attention_heads=8, + encoder_layerdrop=0.0, + is_encoder_decoder=True, + activation_function="relu", + d_model=256, + dropout=0.1, + attention_dropout=0.0, + activation_dropout=0.0, + init_std=0.02, + init_xavier_std=1.0, + return_intermediate=True, + auxiliary_loss=False, + position_embedding_type="sine", + backbone="resnet50", + use_pretrained_backbone=True, + backbone_kwargs=None, + dilation=False, + num_feature_levels=4, + encoder_n_points=4, + decoder_n_points=4, + two_stage=False, + two_stage_num_proposals=300, + with_box_refine=False, + class_cost=1, + bbox_cost=5, + giou_cost=2, + mask_loss_coefficient=1, + dice_loss_coefficient=1, + bbox_loss_coefficient=5, + giou_loss_coefficient=2, + eos_coefficient=0.1, + focal_alpha=0.25, + disable_custom_kernels=False, + **kwargs, + ): + # We default to values which were previously hard-coded in the model. This enables configurability of the config + # while keeping the default behavior the same. + if use_timm_backbone and backbone_kwargs is None: + backbone_kwargs = {} + if dilation: + backbone_kwargs["output_stride"] = 16 + backbone_kwargs["out_indices"] = [2, 3, 4] if num_feature_levels > 1 else [4] + backbone_kwargs["in_chans"] = num_channels + # Backwards compatibility + elif not use_timm_backbone and backbone in (None, "resnet50"): + if backbone_config is None: + logger.info("`backbone_config` is `None`. 
Initializing the config with the default `ResNet` backbone.") + backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage4"]) + elif isinstance(backbone_config, dict): + backbone_model_type = backbone_config.get("model_type") + config_class = CONFIG_MAPPING[backbone_model_type] + backbone_config = config_class.from_dict(backbone_config) + + verify_backbone_config_arguments( + use_timm_backbone=use_timm_backbone, + use_pretrained_backbone=use_pretrained_backbone, + backbone=backbone, + backbone_config=backbone_config, + backbone_kwargs=backbone_kwargs, + ) + + self.use_timm_backbone = use_timm_backbone + self.backbone_config = backbone_config + self.num_channels = num_channels + self.num_queries = num_queries + self.max_position_embeddings = max_position_embeddings + self.d_model = d_model + self.encoder_ffn_dim = encoder_ffn_dim + self.encoder_layers = encoder_layers + self.encoder_attention_heads = encoder_attention_heads + self.decoder_ffn_dim = decoder_ffn_dim + self.decoder_layers = decoder_layers + self.decoder_attention_heads = decoder_attention_heads + self.dropout = dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.activation_function = activation_function + self.init_std = init_std + self.init_xavier_std = init_xavier_std + self.encoder_layerdrop = encoder_layerdrop + self.auxiliary_loss = auxiliary_loss + self.position_embedding_type = position_embedding_type + self.backbone = backbone + self.use_pretrained_backbone = use_pretrained_backbone + self.backbone_kwargs = backbone_kwargs + self.dilation = dilation + # deformable attributes + self.num_feature_levels = num_feature_levels + self.encoder_n_points = encoder_n_points + self.decoder_n_points = decoder_n_points + self.two_stage = two_stage + self.two_stage_num_proposals = two_stage_num_proposals + self.with_box_refine = with_box_refine + if two_stage is True and with_box_refine is False: + raise ValueError("If two_stage is True, with_box_refine must be True.") + # Hungarian matcher + self.class_cost = class_cost + self.bbox_cost = bbox_cost + self.giou_cost = giou_cost + # Loss coefficients + self.mask_loss_coefficient = mask_loss_coefficient + self.dice_loss_coefficient = dice_loss_coefficient + self.bbox_loss_coefficient = bbox_loss_coefficient + self.giou_loss_coefficient = giou_loss_coefficient + self.eos_coefficient = eos_coefficient + self.focal_alpha = focal_alpha + self.disable_custom_kernels = disable_custom_kernels + super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + + @property + def num_attention_heads(self) -> int: + return self.encoder_attention_heads + + @property + def hidden_size(self) -> int: + return self.d_model + + @property + def sub_configs(self): + return ( + {"backbone_config": type(self.backbone_config)} + if getattr(self, "backbone_config", None) is not None + else {} + ) + + +__all__ = ["DeformableDetrConfig"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deformable_detr/feature_extraction_deformable_detr.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deformable_detr/feature_extraction_deformable_detr.py new file mode 100644 index 0000000000000000000000000000000000000000..e349ca3db0ca92b349752484877ad8dd64d153b1 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deformable_detr/feature_extraction_deformable_detr.py @@ -0,0 +1,48 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Feature extractor class for Deformable DETR.""" + +import warnings + +from ...image_transforms import rgb_to_id as _rgb_to_id +from ...utils import logging +from ...utils.import_utils import requires +from .image_processing_deformable_detr import DeformableDetrImageProcessor + + +logger = logging.get_logger(__name__) + + +def rgb_to_id(x): + warnings.warn( + "rgb_to_id has moved and will not be importable from this module from v5. " + "Please import from transformers.image_transforms instead.", + FutureWarning, + ) + return _rgb_to_id(x) + + +@requires(backends=("vision",)) +class DeformableDetrFeatureExtractor(DeformableDetrImageProcessor): + def __init__(self, *args, **kwargs) -> None: + warnings.warn( + "The class DeformableDetrFeatureExtractor is deprecated and will be removed in version 5 of Transformers." + " Please use DeformableDetrImageProcessor instead.", + FutureWarning, + ) + super().__init__(*args, **kwargs) + + +__all__ = ["DeformableDetrFeatureExtractor"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deformable_detr/image_processing_deformable_detr.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deformable_detr/image_processing_deformable_detr.py new file mode 100644 index 0000000000000000000000000000000000000000..c6875eb9b8f80e152c0e310e9bdd051e6b2200e6 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deformable_detr/image_processing_deformable_detr.py @@ -0,0 +1,1634 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
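A brief sketch of how the deprecation shim above behaves for callers (class names are the real ones from this file; the `warnings` bookkeeping is only for illustration):

```python
>>> import warnings
>>> from transformers import DeformableDetrFeatureExtractor, DeformableDetrImageProcessor

>>> with warnings.catch_warnings(record=True) as caught:
...     warnings.simplefilter("always")
...     feature_extractor = DeformableDetrFeatureExtractor()
>>> isinstance(feature_extractor, DeformableDetrImageProcessor)
True
>>> any(issubclass(w.category, FutureWarning) for w in caught)
True
```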
+"""Image processor class for Deformable DETR.""" + +import io +import pathlib +from collections import defaultdict +from collections.abc import Iterable +from typing import Any, Callable, Optional, Union + +import numpy as np + +from ...feature_extraction_utils import BatchFeature +from ...image_processing_utils import BaseImageProcessor, get_size_dict +from ...image_transforms import ( + PaddingMode, + center_to_corners_format, + corners_to_center_format, + id_to_rgb, + pad, + rescale, + resize, + rgb_to_id, + to_channel_dimension_format, +) +from ...image_utils import ( + IMAGENET_DEFAULT_MEAN, + IMAGENET_DEFAULT_STD, + AnnotationFormat, + AnnotationType, + ChannelDimension, + ImageInput, + PILImageResampling, + get_image_size, + infer_channel_dimension_format, + is_scaled_image, + make_flat_list_of_images, + to_numpy_array, + valid_images, + validate_annotations, + validate_kwargs, + validate_preprocess_arguments, +) +from ...utils import ( + TensorType, + is_flax_available, + is_jax_tensor, + is_scipy_available, + is_tf_available, + is_tf_tensor, + is_torch_available, + is_torch_tensor, + is_vision_available, + logging, +) +from ...utils.import_utils import requires + + +if is_torch_available(): + import torch + from torch import nn + + +if is_vision_available(): + import PIL + +if is_scipy_available(): + import scipy.special + import scipy.stats + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) + + +# Copied from transformers.models.detr.image_processing_detr.get_size_with_aspect_ratio +def get_size_with_aspect_ratio(image_size, size, max_size=None) -> tuple[int, int]: + """ + Computes the output image size given the input image size and the desired output size. + + Args: + image_size (`tuple[int, int]`): + The input image size. + size (`int`): + The desired output size. + max_size (`int`, *optional*): + The maximum allowed output size. + """ + height, width = image_size + raw_size = None + if max_size is not None: + min_original_size = float(min((height, width))) + max_original_size = float(max((height, width))) + if max_original_size / min_original_size * size > max_size: + raw_size = max_size * min_original_size / max_original_size + size = int(round(raw_size)) + + if (height <= width and height == size) or (width <= height and width == size): + oh, ow = height, width + elif width < height: + ow = size + if max_size is not None and raw_size is not None: + oh = int(raw_size * height / width) + else: + oh = int(size * height / width) + else: + oh = size + if max_size is not None and raw_size is not None: + ow = int(raw_size * width / height) + else: + ow = int(size * width / height) + + return (oh, ow) + + +# Copied from transformers.models.detr.image_processing_detr.get_resize_output_image_size +def get_resize_output_image_size( + input_image: np.ndarray, + size: Union[int, tuple[int, int], list[int]], + max_size: Optional[int] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, +) -> tuple[int, int]: + """ + Computes the output image size given the input image size and the desired output size. If the desired output size + is a tuple or list, the output image size is returned as is. If the desired output size is an integer, the output + image size is computed by keeping the aspect ratio of the input image size. + + Args: + input_image (`np.ndarray`): + The image to resize. 
+ size (`int` or `tuple[int, int]` or `list[int]`): + The desired output size. + max_size (`int`, *optional*): + The maximum allowed output size. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred from the input image. + """ + image_size = get_image_size(input_image, input_data_format) + if isinstance(size, (list, tuple)): + return size + + return get_size_with_aspect_ratio(image_size, size, max_size) + + +# Copied from transformers.models.detr.image_processing_detr.get_image_size_for_max_height_width +def get_image_size_for_max_height_width( + input_image: np.ndarray, + max_height: int, + max_width: int, + input_data_format: Optional[Union[str, ChannelDimension]] = None, +) -> tuple[int, int]: + """ + Computes the output image size given the input image and the maximum allowed height and width. Keep aspect ratio. + Important, even if image_height < max_height and image_width < max_width, the image will be resized + to at least one of the edges be equal to max_height or max_width. + + For example: + - input_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50) + - input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400) + + Args: + input_image (`np.ndarray`): + The image to resize. + max_height (`int`): + The maximum allowed height. + max_width (`int`): + The maximum allowed width. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred from the input image. + """ + image_size = get_image_size(input_image, input_data_format) + height, width = image_size + height_scale = max_height / height + width_scale = max_width / width + min_scale = min(height_scale, width_scale) + new_height = int(height * min_scale) + new_width = int(width * min_scale) + return new_height, new_width + + +# Copied from transformers.models.detr.image_processing_detr.get_numpy_to_framework_fn +def get_numpy_to_framework_fn(arr) -> Callable: + """ + Returns a function that converts a numpy array to the framework of the input array. + + Args: + arr (`np.ndarray`): The array to convert. + """ + if isinstance(arr, np.ndarray): + return np.array + if is_tf_available() and is_tf_tensor(arr): + import tensorflow as tf + + return tf.convert_to_tensor + if is_torch_available() and is_torch_tensor(arr): + import torch + + return torch.tensor + if is_flax_available() and is_jax_tensor(arr): + import jax.numpy as jnp + + return jnp.array + raise ValueError(f"Cannot convert arrays of type {type(arr)}") + + +# Copied from transformers.models.detr.image_processing_detr.safe_squeeze +def safe_squeeze(arr: np.ndarray, axis: Optional[int] = None) -> np.ndarray: + """ + Squeezes an array, but only if the axis specified has dim 1. 
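A standalone sketch of the shortest-edge/longest-edge rule implemented by `get_size_with_aspect_ratio` above (re-implemented here for illustration, not the library function itself; 800/1333 are this processor's default edges):

```python
import math

def shortest_edge_resize(height, width, shortest_edge=800, longest_edge=1333):
    # Scale so the shortest edge reaches `shortest_edge`, unless that would push
    # the longest edge past `longest_edge`; in that case cap the longest edge.
    scale = shortest_edge / min(height, width)
    if max(height, width) * scale > longest_edge:
        scale = longest_edge / max(height, width)
    return math.floor(height * scale), math.floor(width * scale)

print(shortest_edge_resize(400, 500))    # (800, 1000): shortest edge reaches 800
print(shortest_edge_resize(400, 1600))   # longest edge stays within 1333
```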
+ """ + if axis is None: + return arr.squeeze() + + try: + return arr.squeeze(axis=axis) + except ValueError: + return arr + + +# Copied from transformers.models.detr.image_processing_detr.normalize_annotation +def normalize_annotation(annotation: dict, image_size: tuple[int, int]) -> dict: + image_height, image_width = image_size + norm_annotation = {} + for key, value in annotation.items(): + if key == "boxes": + boxes = value + boxes = corners_to_center_format(boxes) + boxes /= np.asarray([image_width, image_height, image_width, image_height], dtype=np.float32) + norm_annotation[key] = boxes + else: + norm_annotation[key] = value + return norm_annotation + + +# Copied from transformers.models.detr.image_processing_detr.max_across_indices +def max_across_indices(values: Iterable[Any]) -> list[Any]: + """ + Return the maximum value across all indices of an iterable of values. + """ + return [max(values_i) for values_i in zip(*values)] + + +# Copied from transformers.models.detr.image_processing_detr.get_max_height_width +def get_max_height_width( + images: list[np.ndarray], input_data_format: Optional[Union[str, ChannelDimension]] = None +) -> list[int]: + """ + Get the maximum height and width across all images in a batch. + """ + if input_data_format is None: + input_data_format = infer_channel_dimension_format(images[0]) + + if input_data_format == ChannelDimension.FIRST: + _, max_height, max_width = max_across_indices([img.shape for img in images]) + elif input_data_format == ChannelDimension.LAST: + max_height, max_width, _ = max_across_indices([img.shape for img in images]) + else: + raise ValueError(f"Invalid channel dimension format: {input_data_format}") + return (max_height, max_width) + + +# Copied from transformers.models.detr.image_processing_detr.make_pixel_mask +def make_pixel_mask( + image: np.ndarray, output_size: tuple[int, int], input_data_format: Optional[Union[str, ChannelDimension]] = None +) -> np.ndarray: + """ + Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding. + + Args: + image (`np.ndarray`): + Image to make the pixel mask for. + output_size (`tuple[int, int]`): + Output size of the mask. + """ + input_height, input_width = get_image_size(image, channel_dim=input_data_format) + mask = np.zeros(output_size, dtype=np.int64) + mask[:input_height, :input_width] = 1 + return mask + + +# Copied from transformers.models.detr.image_processing_detr.convert_coco_poly_to_mask +def convert_coco_poly_to_mask(segmentations, height: int, width: int) -> np.ndarray: + """ + Convert a COCO polygon annotation to a mask. + + Args: + segmentations (`list[list[float]]`): + List of polygons, each polygon represented by a list of x-y coordinates. + height (`int`): + Height of the mask. + width (`int`): + Width of the mask. 
+ """ + try: + from pycocotools import mask as coco_mask + except ImportError: + raise ImportError("Pycocotools is not installed in your environment.") + + masks = [] + for polygons in segmentations: + rles = coco_mask.frPyObjects(polygons, height, width) + mask = coco_mask.decode(rles) + if len(mask.shape) < 3: + mask = mask[..., None] + mask = np.asarray(mask, dtype=np.uint8) + mask = np.any(mask, axis=2) + masks.append(mask) + if masks: + masks = np.stack(masks, axis=0) + else: + masks = np.zeros((0, height, width), dtype=np.uint8) + + return masks + + +# Copied from transformers.models.detr.image_processing_detr.prepare_coco_detection_annotation with DETR->DeformableDetr +def prepare_coco_detection_annotation( + image, + target, + return_segmentation_masks: bool = False, + input_data_format: Optional[Union[ChannelDimension, str]] = None, +): + """ + Convert the target in COCO format into the format expected by DeformableDetr. + """ + image_height, image_width = get_image_size(image, channel_dim=input_data_format) + + image_id = target["image_id"] + image_id = np.asarray([image_id], dtype=np.int64) + + # Get all COCO annotations for the given image. + annotations = target["annotations"] + annotations = [obj for obj in annotations if "iscrowd" not in obj or obj["iscrowd"] == 0] + + classes = [obj["category_id"] for obj in annotations] + classes = np.asarray(classes, dtype=np.int64) + + # for conversion to coco api + area = np.asarray([obj["area"] for obj in annotations], dtype=np.float32) + iscrowd = np.asarray([obj.get("iscrowd", 0) for obj in annotations], dtype=np.int64) + + boxes = [obj["bbox"] for obj in annotations] + # guard against no boxes via resizing + boxes = np.asarray(boxes, dtype=np.float32).reshape(-1, 4) + boxes[:, 2:] += boxes[:, :2] + boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=image_width) + boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=image_height) + + keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) + + new_target = {} + new_target["image_id"] = image_id + new_target["class_labels"] = classes[keep] + new_target["boxes"] = boxes[keep] + new_target["area"] = area[keep] + new_target["iscrowd"] = iscrowd[keep] + new_target["orig_size"] = np.asarray([int(image_height), int(image_width)], dtype=np.int64) + + if annotations and "keypoints" in annotations[0]: + keypoints = [obj["keypoints"] for obj in annotations] + # Converting the filtered keypoints list to a numpy array + keypoints = np.asarray(keypoints, dtype=np.float32) + # Apply the keep mask here to filter the relevant annotations + keypoints = keypoints[keep] + num_keypoints = keypoints.shape[0] + keypoints = keypoints.reshape((-1, 3)) if num_keypoints else keypoints + new_target["keypoints"] = keypoints + + if return_segmentation_masks: + segmentation_masks = [obj["segmentation"] for obj in annotations] + masks = convert_coco_poly_to_mask(segmentation_masks, image_height, image_width) + new_target["masks"] = masks[keep] + + return new_target + + +# Copied from transformers.models.detr.image_processing_detr.masks_to_boxes +def masks_to_boxes(masks: np.ndarray) -> np.ndarray: + """ + Compute the bounding boxes around the provided panoptic segmentation masks. 
+ + Args: + masks: masks in format `[number_masks, height, width]` where N is the number of masks + + Returns: + boxes: bounding boxes in format `[number_masks, 4]` in xyxy format + """ + if masks.size == 0: + return np.zeros((0, 4)) + + h, w = masks.shape[-2:] + y = np.arange(0, h, dtype=np.float32) + x = np.arange(0, w, dtype=np.float32) + # see https://github.com/pytorch/pytorch/issues/50276 + y, x = np.meshgrid(y, x, indexing="ij") + + x_mask = masks * np.expand_dims(x, axis=0) + x_max = x_mask.reshape(x_mask.shape[0], -1).max(-1) + x = np.ma.array(x_mask, mask=~(np.array(masks, dtype=bool))) + x_min = x.filled(fill_value=1e8) + x_min = x_min.reshape(x_min.shape[0], -1).min(-1) + + y_mask = masks * np.expand_dims(y, axis=0) + y_max = y_mask.reshape(x_mask.shape[0], -1).max(-1) + y = np.ma.array(y_mask, mask=~(np.array(masks, dtype=bool))) + y_min = y.filled(fill_value=1e8) + y_min = y_min.reshape(y_min.shape[0], -1).min(-1) + + return np.stack([x_min, y_min, x_max, y_max], 1) + + +# Copied from transformers.models.detr.image_processing_detr.prepare_coco_panoptic_annotation with DETR->DeformableDetr +def prepare_coco_panoptic_annotation( + image: np.ndarray, + target: dict, + masks_path: Union[str, pathlib.Path], + return_masks: bool = True, + input_data_format: Union[ChannelDimension, str] = None, +) -> dict: + """ + Prepare a coco panoptic annotation for DeformableDetr. + """ + image_height, image_width = get_image_size(image, channel_dim=input_data_format) + annotation_path = pathlib.Path(masks_path) / target["file_name"] + + new_target = {} + new_target["image_id"] = np.asarray([target["image_id"] if "image_id" in target else target["id"]], dtype=np.int64) + new_target["size"] = np.asarray([image_height, image_width], dtype=np.int64) + new_target["orig_size"] = np.asarray([image_height, image_width], dtype=np.int64) + + if "segments_info" in target: + masks = np.asarray(PIL.Image.open(annotation_path), dtype=np.uint32) + masks = rgb_to_id(masks) + + ids = np.array([segment_info["id"] for segment_info in target["segments_info"]]) + masks = masks == ids[:, None, None] + masks = masks.astype(np.uint8) + if return_masks: + new_target["masks"] = masks + new_target["boxes"] = masks_to_boxes(masks) + new_target["class_labels"] = np.array( + [segment_info["category_id"] for segment_info in target["segments_info"]], dtype=np.int64 + ) + new_target["iscrowd"] = np.asarray( + [segment_info["iscrowd"] for segment_info in target["segments_info"]], dtype=np.int64 + ) + new_target["area"] = np.asarray( + [segment_info["area"] for segment_info in target["segments_info"]], dtype=np.float32 + ) + + return new_target + + +# Copied from transformers.models.detr.image_processing_detr.get_segmentation_image +def get_segmentation_image( + masks: np.ndarray, input_size: tuple, target_size: tuple, stuff_equiv_classes, deduplicate=False +): + h, w = input_size + final_h, final_w = target_size + + m_id = scipy.special.softmax(masks.transpose(0, 1), -1) + + if m_id.shape[-1] == 0: + # We didn't detect any mask :( + m_id = np.zeros((h, w), dtype=np.int64) + else: + m_id = m_id.argmax(-1).reshape(h, w) + + if deduplicate: + # Merge the masks corresponding to the same stuff class + for equiv in stuff_equiv_classes.values(): + for eq_id in equiv: + m_id[m_id == eq_id] = equiv[0] + + seg_img = id_to_rgb(m_id) + seg_img = resize(seg_img, (final_w, final_h), resample=PILImageResampling.NEAREST) + return seg_img + + +# Copied from transformers.models.detr.image_processing_detr.get_mask_area +def get_mask_area(seg_img: 
np.ndarray, target_size: tuple[int, int], n_classes: int) -> np.ndarray: + final_h, final_w = target_size + np_seg_img = seg_img.astype(np.uint8) + np_seg_img = np_seg_img.reshape(final_h, final_w, 3) + m_id = rgb_to_id(np_seg_img) + area = [(m_id == i).sum() for i in range(n_classes)] + return area + + +# Copied from transformers.models.detr.image_processing_detr.score_labels_from_class_probabilities +def score_labels_from_class_probabilities(logits: np.ndarray) -> tuple[np.ndarray, np.ndarray]: + probs = scipy.special.softmax(logits, axis=-1) + labels = probs.argmax(-1, keepdims=True) + scores = np.take_along_axis(probs, labels, axis=-1) + scores, labels = scores.squeeze(-1), labels.squeeze(-1) + return scores, labels + + +# Copied from transformers.models.detr.image_processing_detr.post_process_panoptic_sample +def post_process_panoptic_sample( + out_logits: np.ndarray, + masks: np.ndarray, + boxes: np.ndarray, + processed_size: tuple[int, int], + target_size: tuple[int, int], + is_thing_map: dict, + threshold=0.85, +) -> dict: + """ + Converts the output of [`DetrForSegmentation`] into panoptic segmentation predictions for a single sample. + + Args: + out_logits (`torch.Tensor`): + The logits for this sample. + masks (`torch.Tensor`): + The predicted segmentation masks for this sample. + boxes (`torch.Tensor`): + The predicted bounding boxes for this sample. The boxes are in the normalized format `(center_x, center_y, + width, height)` and values between `[0, 1]`, relative to the size the image (disregarding padding). + processed_size (`tuple[int, int]`): + The processed size of the image `(height, width)`, as returned by the preprocessing step i.e. the size + after data augmentation but before batching. + target_size (`tuple[int, int]`): + The target size of the image, `(height, width)` corresponding to the requested final size of the + prediction. + is_thing_map (`Dict`): + A dictionary mapping class indices to a boolean value indicating whether the class is a thing or not. + threshold (`float`, *optional*, defaults to 0.85): + The threshold used to binarize the segmentation masks. + """ + # we filter empty queries and detection below threshold + scores, labels = score_labels_from_class_probabilities(out_logits) + keep = (labels != out_logits.shape[-1] - 1) & (scores > threshold) + + cur_scores = scores[keep] + cur_classes = labels[keep] + cur_boxes = center_to_corners_format(boxes[keep]) + + if len(cur_boxes) != len(cur_classes): + raise ValueError("Not as many boxes as there are classes") + + cur_masks = masks[keep] + cur_masks = resize(cur_masks[:, None], processed_size, resample=PILImageResampling.BILINEAR) + cur_masks = safe_squeeze(cur_masks, 1) + b, h, w = cur_masks.shape + + # It may be that we have several predicted masks for the same stuff class. 
+ # In the following, we track the list of masks ids for each stuff class (they are merged later on) + cur_masks = cur_masks.reshape(b, -1) + stuff_equiv_classes = defaultdict(list) + for k, label in enumerate(cur_classes): + if not is_thing_map[label]: + stuff_equiv_classes[label].append(k) + + seg_img = get_segmentation_image(cur_masks, processed_size, target_size, stuff_equiv_classes, deduplicate=True) + area = get_mask_area(cur_masks, processed_size, n_classes=len(cur_scores)) + + # We filter out any mask that is too small + if cur_classes.size() > 0: + # We know filter empty masks as long as we find some + filtered_small = np.array([a <= 4 for a in area], dtype=bool) + while filtered_small.any(): + cur_masks = cur_masks[~filtered_small] + cur_scores = cur_scores[~filtered_small] + cur_classes = cur_classes[~filtered_small] + seg_img = get_segmentation_image(cur_masks, (h, w), target_size, stuff_equiv_classes, deduplicate=True) + area = get_mask_area(seg_img, target_size, n_classes=len(cur_scores)) + filtered_small = np.array([a <= 4 for a in area], dtype=bool) + else: + cur_classes = np.ones((1, 1), dtype=np.int64) + + segments_info = [ + {"id": i, "isthing": is_thing_map[cat], "category_id": int(cat), "area": a} + for i, (cat, a) in enumerate(zip(cur_classes, area)) + ] + del cur_classes + + with io.BytesIO() as out: + PIL.Image.fromarray(seg_img).save(out, format="PNG") + predictions = {"png_string": out.getvalue(), "segments_info": segments_info} + + return predictions + + +# Copied from transformers.models.detr.image_processing_detr.resize_annotation +def resize_annotation( + annotation: dict[str, Any], + orig_size: tuple[int, int], + target_size: tuple[int, int], + threshold: float = 0.5, + resample: PILImageResampling = PILImageResampling.NEAREST, +): + """ + Resizes an annotation to a target size. + + Args: + annotation (`dict[str, Any]`): + The annotation dictionary. + orig_size (`tuple[int, int]`): + The original size of the input image. + target_size (`tuple[int, int]`): + The target size of the image, as returned by the preprocessing `resize` step. + threshold (`float`, *optional*, defaults to 0.5): + The threshold used to binarize the segmentation masks. + resample (`PILImageResampling`, defaults to `PILImageResampling.NEAREST`): + The resampling filter to use when resizing the masks. + """ + ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(target_size, orig_size)) + ratio_height, ratio_width = ratios + + new_annotation = {} + new_annotation["size"] = target_size + + for key, value in annotation.items(): + if key == "boxes": + boxes = value + scaled_boxes = boxes * np.asarray([ratio_width, ratio_height, ratio_width, ratio_height], dtype=np.float32) + new_annotation["boxes"] = scaled_boxes + elif key == "area": + area = value + scaled_area = area * (ratio_width * ratio_height) + new_annotation["area"] = scaled_area + elif key == "masks": + masks = value[:, None] + masks = np.array([resize(mask, target_size, resample=resample) for mask in masks]) + masks = masks.astype(np.float32) + masks = masks[:, 0] > threshold + new_annotation["masks"] = masks + elif key == "size": + new_annotation["size"] = target_size + else: + new_annotation[key] = value + + return new_annotation + + +# Copied from transformers.models.detr.image_processing_detr.binary_mask_to_rle +def binary_mask_to_rle(mask): + """ + Converts given binary mask of shape `(height, width)` to the run-length encoding (RLE) format. 
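A small sketch of how `resize_annotation` above rescales boxes and areas when the image goes from `orig_size` to `target_size` (illustrative values):

```python
import numpy as np

orig_size, target_size = (400, 600), (800, 1200)        # (height, width)
ratio_height = target_size[0] / orig_size[0]             # 2.0
ratio_width = target_size[1] / orig_size[1]               # 2.0

boxes = np.array([[10.0, 20.0, 110.0, 220.0]])
area = np.array([20000.0])

scaled_boxes = boxes * np.array([ratio_width, ratio_height, ratio_width, ratio_height])
scaled_area = area * (ratio_width * ratio_height)

print(scaled_boxes)  # [[ 20.  40. 220. 440.]]
print(scaled_area)   # [80000.]
```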
+ + Args: + mask (`torch.Tensor` or `numpy.array`): + A binary mask tensor of shape `(height, width)` where 0 denotes background and 1 denotes the target + segment_id or class_id. + Returns: + `List`: Run-length encoded list of the binary mask. Refer to COCO API for more information about the RLE + format. + """ + if is_torch_tensor(mask): + mask = mask.numpy() + + pixels = mask.flatten() + pixels = np.concatenate([[0], pixels, [0]]) + runs = np.where(pixels[1:] != pixels[:-1])[0] + 1 + runs[1::2] -= runs[::2] + return list(runs) + + +# Copied from transformers.models.detr.image_processing_detr.convert_segmentation_to_rle +def convert_segmentation_to_rle(segmentation): + """ + Converts given segmentation map of shape `(height, width)` to the run-length encoding (RLE) format. + + Args: + segmentation (`torch.Tensor` or `numpy.array`): + A segmentation map of shape `(height, width)` where each value denotes a segment or class id. + Returns: + `list[List]`: A list of lists, where each list is the run-length encoding of a segment / class id. + """ + segment_ids = torch.unique(segmentation) + + run_length_encodings = [] + for idx in segment_ids: + mask = torch.where(segmentation == idx, 1, 0) + rle = binary_mask_to_rle(mask) + run_length_encodings.append(rle) + + return run_length_encodings + + +# Copied from transformers.models.detr.image_processing_detr.remove_low_and_no_objects +def remove_low_and_no_objects(masks, scores, labels, object_mask_threshold, num_labels): + """ + Binarize the given masks using `object_mask_threshold`, it returns the associated values of `masks`, `scores` and + `labels`. + + Args: + masks (`torch.Tensor`): + A tensor of shape `(num_queries, height, width)`. + scores (`torch.Tensor`): + A tensor of shape `(num_queries)`. + labels (`torch.Tensor`): + A tensor of shape `(num_queries)`. + object_mask_threshold (`float`): + A number between 0 and 1 used to binarize the masks. + Raises: + `ValueError`: Raised when the first dimension doesn't match in all input tensors. + Returns: + `tuple[`torch.Tensor`, `torch.Tensor`, `torch.Tensor`]`: The `masks`, `scores` and `labels` without the region + < `object_mask_threshold`. 
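The run-length encoding produced by `binary_mask_to_rle` above alternates a (1-indexed) start position in the flattened mask with a run length; a tiny worked example mirroring the implementation:

```python
import numpy as np

mask = np.array([[0, 1, 1],
                 [1, 0, 0]])

pixels = np.concatenate([[0], mask.flatten(), [0]])
runs = np.where(pixels[1:] != pixels[:-1])[0] + 1
runs[1::2] -= runs[::2]

print(runs.tolist())  # [2, 3]: a run of three 1s starting at flattened position 2
```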
+ """ + if not (masks.shape[0] == scores.shape[0] == labels.shape[0]): + raise ValueError("mask, scores and labels must have the same shape!") + + to_keep = labels.ne(num_labels) & (scores > object_mask_threshold) + + return masks[to_keep], scores[to_keep], labels[to_keep] + + +# Copied from transformers.models.detr.image_processing_detr.check_segment_validity +def check_segment_validity(mask_labels, mask_probs, k, mask_threshold=0.5, overlap_mask_area_threshold=0.8): + # Get the mask associated with the k class + mask_k = mask_labels == k + mask_k_area = mask_k.sum() + + # Compute the area of all the stuff in query k + original_area = (mask_probs[k] >= mask_threshold).sum() + mask_exists = mask_k_area > 0 and original_area > 0 + + # Eliminate disconnected tiny segments + if mask_exists: + area_ratio = mask_k_area / original_area + if not area_ratio.item() > overlap_mask_area_threshold: + mask_exists = False + + return mask_exists, mask_k + + +# Copied from transformers.models.detr.image_processing_detr.compute_segments +def compute_segments( + mask_probs, + pred_scores, + pred_labels, + mask_threshold: float = 0.5, + overlap_mask_area_threshold: float = 0.8, + label_ids_to_fuse: Optional[set[int]] = None, + target_size: Optional[tuple[int, int]] = None, +): + height = mask_probs.shape[1] if target_size is None else target_size[0] + width = mask_probs.shape[2] if target_size is None else target_size[1] + + segmentation = torch.zeros((height, width), dtype=torch.int32, device=mask_probs.device) + segments: list[dict] = [] + + if target_size is not None: + mask_probs = nn.functional.interpolate( + mask_probs.unsqueeze(0), size=target_size, mode="bilinear", align_corners=False + )[0] + + current_segment_id = 0 + + # Weigh each mask by its prediction score + mask_probs *= pred_scores.view(-1, 1, 1) + mask_labels = mask_probs.argmax(0) # [height, width] + + # Keep track of instances of each class + stuff_memory_list: dict[str, int] = {} + for k in range(pred_labels.shape[0]): + pred_class = pred_labels[k].item() + should_fuse = pred_class in label_ids_to_fuse + + # Check if mask exists and large enough to be a segment + mask_exists, mask_k = check_segment_validity( + mask_labels, mask_probs, k, mask_threshold, overlap_mask_area_threshold + ) + + if mask_exists: + if pred_class in stuff_memory_list: + current_segment_id = stuff_memory_list[pred_class] + else: + current_segment_id += 1 + + # Add current object segment to final segmentation map + segmentation[mask_k] = current_segment_id + segment_score = round(pred_scores[k].item(), 6) + segments.append( + { + "id": current_segment_id, + "label_id": pred_class, + "was_fused": should_fuse, + "score": segment_score, + } + ) + if should_fuse: + stuff_memory_list[pred_class] = current_segment_id + + return segmentation, segments + + +@requires(backends=("torch", "vision")) +class DeformableDetrImageProcessor(BaseImageProcessor): + r""" + Constructs a Deformable DETR image processor. + + Args: + format (`str`, *optional*, defaults to `"coco_detection"`): + Data format of the annotations. One of "coco_detection" or "coco_panoptic". + do_resize (`bool`, *optional*, defaults to `True`): + Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be + overridden by the `do_resize` parameter in the `preprocess` method. + size (`dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`): + Size of the image's `(height, width)` dimensions after resizing. 
Can be overridden by the `size` parameter + in the `preprocess` method. Available options are: + - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. + Do NOT keep the aspect ratio. + - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting + the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge + less or equal to `longest_edge`. + - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the + aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to + `max_width`. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): + Resampling filter to use if resizing the image. + do_rescale (`bool`, *optional*, defaults to `True`): + Controls whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the + `do_rescale` parameter in the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the + `preprocess` method. + do_normalize: + Controls whether to normalize the image. Can be overridden by the `do_normalize` parameter in the + `preprocess` method. + image_mean (`float` or `list[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`): + Mean values to use when normalizing the image. Can be a single value or a list of values, one for each + channel. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `list[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`): + Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one + for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method. + do_convert_annotations (`bool`, *optional*, defaults to `True`): + Controls whether to convert the annotations to the format expected by the DETR model. Converts the + bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. + Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. + do_pad (`bool`, *optional*, defaults to `True`): + Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess` + method. If `True`, padding will be applied to the bottom and right of the image with zeros. + If `pad_size` is provided, the image will be padded to the specified dimensions. + Otherwise, the image will be padded to the maximum height and width of the batch. + pad_size (`dict[str, int]`, *optional*): + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size + provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest + height and width in the batch. 
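For orientation, a hedged end-to-end sketch of the processor defaults described above, using a random dummy image (the output height/width follow the shortest-edge-800 rule and may differ if the defaults change):

```python
>>> import numpy as np
>>> from transformers import DeformableDetrImageProcessor

>>> image_processor = DeformableDetrImageProcessor()   # resize, rescale, normalize, pad
>>> image = np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8)
>>> inputs = image_processor(images=image, return_tensors="pt")
>>> sorted(inputs.keys())
['pixel_mask', 'pixel_values']
>>> inputs["pixel_values"].shape
torch.Size([1, 3, 800, 1066])
```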
+ """ + + model_input_names = ["pixel_values", "pixel_mask"] + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.__init__ + def __init__( + self, + format: Union[str, AnnotationFormat] = AnnotationFormat.COCO_DETECTION, + do_resize: bool = True, + size: Optional[dict[str, int]] = None, + resample: PILImageResampling = PILImageResampling.BILINEAR, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + do_convert_annotations: Optional[bool] = None, + do_pad: bool = True, + pad_size: Optional[dict[str, int]] = None, + **kwargs, + ) -> None: + if "pad_and_return_pixel_mask" in kwargs: + do_pad = kwargs.pop("pad_and_return_pixel_mask") + + if "max_size" in kwargs: + logger.warning_once( + "The `max_size` parameter is deprecated and will be removed in v4.26. " + "Please specify in `size['longest_edge'] instead`.", + ) + max_size = kwargs.pop("max_size") + else: + max_size = None if size is None else 1333 + + size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333} + size = get_size_dict(size, max_size=max_size, default_to_square=False) + + # Backwards compatibility + if do_convert_annotations is None: + do_convert_annotations = do_normalize + + super().__init__(**kwargs) + self.format = format + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.do_convert_annotations = do_convert_annotations + self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN + self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD + self.do_pad = do_pad + self.pad_size = pad_size + self._valid_processor_keys = [ + "images", + "annotations", + "return_segmentation_masks", + "masks_path", + "do_resize", + "size", + "resample", + "do_rescale", + "rescale_factor", + "do_normalize", + "do_convert_annotations", + "image_mean", + "image_std", + "do_pad", + "pad_size", + "format", + "return_tensors", + "data_format", + "input_data_format", + ] + + @classmethod + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.from_dict with Detr->DeformableDetr + def from_dict(cls, image_processor_dict: dict[str, Any], **kwargs): + """ + Overrides the `from_dict` method from the base class to make sure parameters are updated if image processor is + created using from_dict and kwargs e.g. `DeformableDetrImageProcessor.from_pretrained(checkpoint, size=600, + max_size=800)` + """ + image_processor_dict = image_processor_dict.copy() + if "max_size" in kwargs: + image_processor_dict["max_size"] = kwargs.pop("max_size") + if "pad_and_return_pixel_mask" in kwargs: + image_processor_dict["pad_and_return_pixel_mask"] = kwargs.pop("pad_and_return_pixel_mask") + return super().from_dict(image_processor_dict, **kwargs) + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare_annotation with DETR->DeformableDetr + def prepare_annotation( + self, + image: np.ndarray, + target: dict, + format: Optional[AnnotationFormat] = None, + return_segmentation_masks: Optional[bool] = None, + masks_path: Optional[Union[str, pathlib.Path]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> dict: + """ + Prepare an annotation for feeding into DeformableDetr model. 
+ """ + format = format if format is not None else self.format + + if format == AnnotationFormat.COCO_DETECTION: + return_segmentation_masks = False if return_segmentation_masks is None else return_segmentation_masks + target = prepare_coco_detection_annotation( + image, target, return_segmentation_masks, input_data_format=input_data_format + ) + elif format == AnnotationFormat.COCO_PANOPTIC: + return_segmentation_masks = True if return_segmentation_masks is None else return_segmentation_masks + target = prepare_coco_panoptic_annotation( + image, + target, + masks_path=masks_path, + return_masks=return_segmentation_masks, + input_data_format=input_data_format, + ) + else: + raise ValueError(f"Format {format} is not supported.") + return target + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.resize + def resize( + self, + image: np.ndarray, + size: dict[str, int], + resample: PILImageResampling = PILImageResampling.BILINEAR, + data_format: Optional[ChannelDimension] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize the image to the given size. Size can be `min_size` (scalar) or `(height, width)` tuple. If size is an + int, smaller edge of the image will be matched to this number. + + Args: + image (`np.ndarray`): + Image to resize. + size (`dict[str, int]`): + Size of the image's `(height, width)` dimensions after resizing. Available options are: + - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. + Do NOT keep the aspect ratio. + - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting + the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge + less or equal to `longest_edge`. + - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the + aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to + `max_width`. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): + Resampling filter to use if resizing the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. + """ + if "max_size" in kwargs: + logger.warning_once( + "The `max_size` parameter is deprecated and will be removed in v4.26. " + "Please specify in `size['longest_edge'] instead`.", + ) + max_size = kwargs.pop("max_size") + else: + max_size = None + size = get_size_dict(size, max_size=max_size, default_to_square=False) + if "shortest_edge" in size and "longest_edge" in size: + new_size = get_resize_output_image_size( + image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format + ) + elif "max_height" in size and "max_width" in size: + new_size = get_image_size_for_max_height_width( + image, size["max_height"], size["max_width"], input_data_format=input_data_format + ) + elif "height" in size and "width" in size: + new_size = (size["height"], size["width"]) + else: + raise ValueError( + "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got" + f" {size.keys()}." 
+ ) + image = resize( + image, + size=new_size, + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + **kwargs, + ) + return image + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.resize_annotation + def resize_annotation( + self, + annotation, + orig_size, + size, + resample: PILImageResampling = PILImageResampling.NEAREST, + ) -> dict: + """ + Resize the annotation to match the resized image. If size is an int, smaller edge of the mask will be matched + to this number. + """ + return resize_annotation(annotation, orig_size=orig_size, target_size=size, resample=resample) + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.rescale + def rescale( + self, + image: np.ndarray, + rescale_factor: float, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> np.ndarray: + """ + Rescale the image by the given factor. image = image * rescale_factor. + + Args: + image (`np.ndarray`): + Image to rescale. + rescale_factor (`float`): + The value to use for rescaling. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + input_data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the input image. If unset, is inferred from the input image. Can be + one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + """ + return rescale(image, rescale_factor, data_format=data_format, input_data_format=input_data_format) + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.normalize_annotation + def normalize_annotation(self, annotation: dict, image_size: tuple[int, int]) -> dict: + """ + Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to + `[center_x, center_y, width, height]` format and from absolute to relative pixel values. + """ + return normalize_annotation(annotation, image_size=image_size) + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._update_annotation_for_padded_image + def _update_annotation_for_padded_image( + self, + annotation: dict, + input_image_size: tuple[int, int], + output_image_size: tuple[int, int], + padding, + update_bboxes, + ) -> dict: + """ + Update the annotation for a padded image. 
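The box update performed by `_update_annotation_for_padded_image` (implemented just below) shrinks relative `(center_x, center_y, width, height)` boxes by the ratio of the old size to the padded size; a quick numeric sketch (values invented):

```python
import numpy as np

input_h, input_w = 400, 600
output_h, output_w = 512, 640                    # size after bottom/right padding
boxes = np.array([[0.5, 0.5, 0.2, 0.4]])         # relative to the unpadded image

boxes = boxes * np.array([input_w / output_w, input_h / output_h,
                          input_w / output_w, input_h / output_h])
print(boxes)  # [[0.46875  0.390625 0.1875   0.3125 ]]
```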
+ """ + new_annotation = {} + new_annotation["size"] = output_image_size + + for key, value in annotation.items(): + if key == "masks": + masks = value + masks = pad( + masks, + padding, + mode=PaddingMode.CONSTANT, + constant_values=0, + input_data_format=ChannelDimension.FIRST, + ) + masks = safe_squeeze(masks, 1) + new_annotation["masks"] = masks + elif key == "boxes" and update_bboxes: + boxes = value + boxes *= np.asarray( + [ + input_image_size[1] / output_image_size[1], + input_image_size[0] / output_image_size[0], + input_image_size[1] / output_image_size[1], + input_image_size[0] / output_image_size[0], + ] + ) + new_annotation["boxes"] = boxes + elif key == "size": + new_annotation["size"] = output_image_size + else: + new_annotation[key] = value + return new_annotation + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image + def _pad_image( + self, + image: np.ndarray, + output_size: tuple[int, int], + annotation: Optional[dict[str, Any]] = None, + constant_values: Union[float, Iterable[float]] = 0, + data_format: Optional[ChannelDimension] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + update_bboxes: bool = True, + ) -> np.ndarray: + """ + Pad an image with zeros to the given size. + """ + input_height, input_width = get_image_size(image, channel_dim=input_data_format) + output_height, output_width = output_size + + pad_bottom = output_height - input_height + pad_right = output_width - input_width + padding = ((0, pad_bottom), (0, pad_right)) + padded_image = pad( + image, + padding, + mode=PaddingMode.CONSTANT, + constant_values=constant_values, + data_format=data_format, + input_data_format=input_data_format, + ) + if annotation is not None: + annotation = self._update_annotation_for_padded_image( + annotation, (input_height, input_width), (output_height, output_width), padding, update_bboxes + ) + return padded_image, annotation + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad + def pad( + self, + images: list[np.ndarray], + annotations: Optional[Union[AnnotationType, list[AnnotationType]]] = None, + constant_values: Union[float, Iterable[float]] = 0, + return_pixel_mask: bool = True, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Optional[ChannelDimension] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + update_bboxes: bool = True, + pad_size: Optional[dict[str, int]] = None, + ) -> BatchFeature: + """ + Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width + in the batch and optionally returns their corresponding pixel mask. + + Args: + images (list[`np.ndarray`]): + Images to pad. + annotations (`AnnotationType` or `list[AnnotationType]`, *optional*): + Annotations to transform according to the padding that is applied to the images. + constant_values (`float` or `Iterable[float]`, *optional*): + The value to use for the padding if `mode` is `"constant"`. + return_pixel_mask (`bool`, *optional*, defaults to `True`): + Whether to return a pixel mask. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. 
+ - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. + update_bboxes (`bool`, *optional*, defaults to `True`): + Whether to update the bounding boxes in the annotations to match the padded images. If the + bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)` + format, the bounding boxes will not be updated. + pad_size (`dict[str, int]`, *optional*): + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size + provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest + height and width in the batch. + """ + pad_size = pad_size if pad_size is not None else self.pad_size + if pad_size is not None: + padded_size = (pad_size["height"], pad_size["width"]) + else: + padded_size = get_max_height_width(images, input_data_format=input_data_format) + + annotation_list = annotations if annotations is not None else [None] * len(images) + padded_images = [] + padded_annotations = [] + for image, annotation in zip(images, annotation_list): + padded_image, padded_annotation = self._pad_image( + image, + padded_size, + annotation, + constant_values=constant_values, + data_format=data_format, + input_data_format=input_data_format, + update_bboxes=update_bboxes, + ) + padded_images.append(padded_image) + padded_annotations.append(padded_annotation) + + data = {"pixel_values": padded_images} + + if return_pixel_mask: + masks = [ + make_pixel_mask(image=image, output_size=padded_size, input_data_format=input_data_format) + for image in images + ] + data["pixel_mask"] = masks + + encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors) + + if annotations is not None: + encoded_inputs["labels"] = [ + BatchFeature(annotation, tensor_type=return_tensors) for annotation in padded_annotations + ] + + return encoded_inputs + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.preprocess + def preprocess( + self, + images: ImageInput, + annotations: Optional[Union[AnnotationType, list[AnnotationType]]] = None, + return_segmentation_masks: Optional[bool] = None, + masks_path: Optional[Union[str, pathlib.Path]] = None, + do_resize: Optional[bool] = None, + size: Optional[dict[str, int]] = None, + resample=None, # PILImageResampling + do_rescale: Optional[bool] = None, + rescale_factor: Optional[Union[int, float]] = None, + do_normalize: Optional[bool] = None, + do_convert_annotations: Optional[bool] = None, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + do_pad: Optional[bool] = None, + format: Optional[Union[str, AnnotationFormat]] = None, + return_tensors: Optional[Union[TensorType, str]] = None, + data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + pad_size: Optional[dict[str, int]] = None, + **kwargs, + ) -> BatchFeature: + """ + Preprocess an image or a batch of images so that it can be used by the model. + + Args: + images (`ImageInput`): + Image or batch of images to preprocess. Expects a single or batch of images with pixel values ranging + from 0 to 255. 
If passing in images with pixel values between 0 and 1, set `do_rescale=False`. + annotations (`AnnotationType` or `list[AnnotationType]`, *optional*): + List of annotations associated with the image or batch of images. If annotation is for object + detection, the annotations should be a dictionary with the following keys: + - "image_id" (`int`): The image id. + - "annotations" (`list[Dict]`): List of annotations for an image. Each annotation should be a + dictionary. An image can have no annotations, in which case the list should be empty. + If annotation is for segmentation, the annotations should be a dictionary with the following keys: + - "image_id" (`int`): The image id. + - "segments_info" (`list[Dict]`): List of segments for an image. Each segment should be a dictionary. + An image can have no segments, in which case the list should be empty. + - "file_name" (`str`): The file name of the image. + return_segmentation_masks (`bool`, *optional*, defaults to self.return_segmentation_masks): + Whether to return segmentation masks. + masks_path (`str` or `pathlib.Path`, *optional*): + Path to the directory containing the segmentation masks. + do_resize (`bool`, *optional*, defaults to self.do_resize): + Whether to resize the image. + size (`dict[str, int]`, *optional*, defaults to self.size): + Size of the image's `(height, width)` dimensions after resizing. Available options are: + - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. + Do NOT keep the aspect ratio. + - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting + the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge + less or equal to `longest_edge`. + - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the + aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to + `max_width`. + resample (`PILImageResampling`, *optional*, defaults to self.resample): + Resampling filter to use when resizing the image. + do_rescale (`bool`, *optional*, defaults to self.do_rescale): + Whether to rescale the image. + rescale_factor (`float`, *optional*, defaults to self.rescale_factor): + Rescale factor to use when rescaling the image. + do_normalize (`bool`, *optional*, defaults to self.do_normalize): + Whether to normalize the image. + do_convert_annotations (`bool`, *optional*, defaults to self.do_convert_annotations): + Whether to convert the annotations to the format expected by the model. Converts the bounding + boxes from the format `(top_left_x, top_left_y, width, height)` to `(center_x, center_y, width, height)` + and in relative coordinates. + image_mean (`float` or `list[float]`, *optional*, defaults to self.image_mean): + Mean to use when normalizing the image. + image_std (`float` or `list[float]`, *optional*, defaults to self.image_std): + Standard deviation to use when normalizing the image. + do_pad (`bool`, *optional*, defaults to self.do_pad): + Whether to pad the image. If `True`, padding will be applied to the bottom and right of + the image with zeros. If `pad_size` is provided, the image will be padded to the specified + dimensions. Otherwise, the image will be padded to the maximum height and width of the batch. + format (`str` or `AnnotationFormat`, *optional*, defaults to self.format): + Format of the annotations. 
+ return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors): + Type of tensors to return. If `None`, will return the list of images. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + pad_size (`dict[str, int]`, *optional*): + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size + provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest + height and width in the batch. + """ + if "pad_and_return_pixel_mask" in kwargs: + logger.warning_once( + "The `pad_and_return_pixel_mask` argument is deprecated and will be removed in a future version, " + "use `do_pad` instead." + ) + do_pad = kwargs.pop("pad_and_return_pixel_mask") + + if "max_size" in kwargs: + logger.warning_once( + "The `max_size` argument is deprecated and will be removed in a future version, use" + " `size['longest_edge']` instead." + ) + size = kwargs.pop("max_size") + + do_resize = self.do_resize if do_resize is None else do_resize + size = self.size if size is None else size + size = get_size_dict(size=size, default_to_square=False) + resample = self.resample if resample is None else resample + do_rescale = self.do_rescale if do_rescale is None else do_rescale + rescale_factor = self.rescale_factor if rescale_factor is None else rescale_factor + do_normalize = self.do_normalize if do_normalize is None else do_normalize + image_mean = self.image_mean if image_mean is None else image_mean + image_std = self.image_std if image_std is None else image_std + do_convert_annotations = ( + self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations + ) + do_pad = self.do_pad if do_pad is None else do_pad + pad_size = self.pad_size if pad_size is None else pad_size + format = self.format if format is None else format + + images = make_flat_list_of_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_processor_keys) + + # Here, the pad() method pads to the maximum of (width, height). It does not need to be validated. 
+ validate_preprocess_arguments( + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_resize=do_resize, + size=size, + resample=resample, + ) + + if annotations is not None and isinstance(annotations, dict): + annotations = [annotations] + + if annotations is not None and len(images) != len(annotations): + raise ValueError( + f"The number of images ({len(images)}) and annotations ({len(annotations)}) do not match." + ) + + format = AnnotationFormat(format) + if annotations is not None: + validate_annotations(format, SUPPORTED_ANNOTATION_FORMATS, annotations) + + if ( + masks_path is not None + and format == AnnotationFormat.COCO_PANOPTIC + and not isinstance(masks_path, (pathlib.Path, str)) + ): + raise ValueError( + "The path to the directory containing the mask PNG files should be provided as a" + f" `pathlib.Path` or string object, but is {type(masks_path)} instead." + ) + + # All transformations expect numpy arrays + images = [to_numpy_array(image) for image in images] + + if do_rescale and is_scaled_image(images[0]): + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. + input_data_format = infer_channel_dimension_format(images[0]) + + # prepare (COCO annotations as a list of Dict -> DETR target as a single Dict per image) + if annotations is not None: + prepared_images = [] + prepared_annotations = [] + for image, target in zip(images, annotations): + target = self.prepare_annotation( + image, + target, + format, + return_segmentation_masks=return_segmentation_masks, + masks_path=masks_path, + input_data_format=input_data_format, + ) + prepared_images.append(image) + prepared_annotations.append(target) + images = prepared_images + annotations = prepared_annotations + del prepared_images, prepared_annotations + + # transformations + if do_resize: + if annotations is not None: + resized_images, resized_annotations = [], [] + for image, target in zip(images, annotations): + orig_size = get_image_size(image, input_data_format) + resized_image = self.resize( + image, size=size, resample=resample, input_data_format=input_data_format + ) + resized_annotation = self.resize_annotation( + target, orig_size, get_image_size(resized_image, input_data_format) + ) + resized_images.append(resized_image) + resized_annotations.append(resized_annotation) + images = resized_images + annotations = resized_annotations + del resized_images, resized_annotations + else: + images = [ + self.resize(image, size=size, resample=resample, input_data_format=input_data_format) + for image in images + ] + + if do_rescale: + images = [self.rescale(image, rescale_factor, input_data_format=input_data_format) for image in images] + + if do_normalize: + images = [ + self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images + ] + + if do_convert_annotations and annotations is not None: + annotations = [ + self.normalize_annotation(annotation, get_image_size(image, input_data_format)) + for annotation, image in zip(annotations, images) + ] + + if do_pad: + # Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...} + encoded_inputs = self.pad( + images, + annotations=annotations, + return_pixel_mask=True, + 
data_format=data_format, + input_data_format=input_data_format, + update_bboxes=do_convert_annotations, + return_tensors=return_tensors, + pad_size=pad_size, + ) + else: + images = [ + to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) + for image in images + ] + encoded_inputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors) + if annotations is not None: + encoded_inputs["labels"] = [ + BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations + ] + + return encoded_inputs + + # POSTPROCESSING METHODS - TODO: add support for other frameworks + def post_process(self, outputs, target_sizes): + """ + Converts the raw output of [`DeformableDetrForObjectDetection`] into final bounding boxes in (top_left_x, + top_left_y, bottom_right_x, bottom_right_y) format. Only supports PyTorch. + + Args: + outputs ([`DeformableDetrObjectDetectionOutput`]): + Raw outputs of the model. + target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): + Tensor containing the size (height, width) of each image of the batch. For evaluation, this must be the + original image size (before any data augmentation). For visualization, this should be the image size + after data augment, but before padding. + Returns: + `list[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image + in the batch as predicted by the model. + """ + logger.warning_once( + "`post_process` is deprecated and will be removed in v5 of Transformers, please use" + " `post_process_object_detection` instead, with `threshold=0.` for equivalent results.", + ) + + out_logits, out_bbox = outputs.logits, outputs.pred_boxes + + if len(out_logits) != len(target_sizes): + raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits") + if target_sizes.shape[1] != 2: + raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch") + + prob = out_logits.sigmoid() + topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 100, dim=1) + scores = topk_values + topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor") + labels = topk_indexes % out_logits.shape[2] + boxes = center_to_corners_format(out_bbox) + boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4)) + + # and from relative [0, 1] to absolute [0, height] coordinates + img_h, img_w = target_sizes.unbind(1) + scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1) + boxes = boxes * scale_fct[:, None, :] + + results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)] + + return results + + def post_process_object_detection( + self, outputs, threshold: float = 0.5, target_sizes: Union[TensorType, list[tuple]] = None, top_k: int = 100 + ): + """ + Converts the raw output of [`DeformableDetrForObjectDetection`] into final bounding boxes in (top_left_x, + top_left_y, bottom_right_x, bottom_right_y) format. Only supports PyTorch. + + Args: + outputs ([`DetrObjectDetectionOutput`]): + Raw outputs of the model. + threshold (`float`, *optional*): + Score threshold to keep object detection predictions. + target_sizes (`torch.Tensor` or `list[tuple[int, int]]`, *optional*): + Tensor of shape `(batch_size, 2)` or list of tuples (`tuple[int, int]`) containing the target size + (height, width) of each image in the batch. If left to None, predictions will not be resized. 
+ top_k (`int`, *optional*, defaults to 100): + Keep only top k bounding boxes before filtering by thresholding. + + Returns: + `list[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image + in the batch as predicted by the model. + """ + out_logits, out_bbox = outputs.logits, outputs.pred_boxes + + if target_sizes is not None: + if len(out_logits) != len(target_sizes): + raise ValueError( + "Make sure that you pass in as many target sizes as the batch dimension of the logits" + ) + + prob = out_logits.sigmoid() + prob = prob.view(out_logits.shape[0], -1) + k_value = min(top_k, prob.size(1)) + topk_values, topk_indexes = torch.topk(prob, k_value, dim=1) + scores = topk_values + topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor") + labels = topk_indexes % out_logits.shape[2] + boxes = center_to_corners_format(out_bbox) + boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4)) + + # and from relative [0, 1] to absolute [0, height] coordinates + if target_sizes is not None: + if isinstance(target_sizes, list): + img_h = torch.Tensor([i[0] for i in target_sizes]) + img_w = torch.Tensor([i[1] for i in target_sizes]) + else: + img_h, img_w = target_sizes.unbind(1) + scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device) + boxes = boxes * scale_fct[:, None, :] + + results = [] + for s, l, b in zip(scores, labels, boxes): + score = s[s > threshold] + label = l[s > threshold] + box = b[s > threshold] + results.append({"scores": score, "labels": label, "boxes": box}) + + return results + + +__all__ = ["DeformableDetrImageProcessor"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deformable_detr/image_processing_deformable_detr_fast.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deformable_detr/image_processing_deformable_detr_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..8458d02d58a52635f38361e6da05c22294c0a0f6 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deformable_detr/image_processing_deformable_detr_fast.py @@ -0,0 +1,795 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/deformable_detr/modular_deformable_detr.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_deformable_detr.py file directly. One of our CI enforces this. 
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +import pathlib +from typing import Any, Optional, Union + +import torch +from torchvision.io import read_image +from torchvision.transforms.v2 import functional as F + +from ...image_processing_utils import BatchFeature, get_size_dict +from ...image_processing_utils_fast import ( + BaseImageProcessorFast, + DefaultFastImageProcessorKwargs, + SizeDict, + get_image_size_for_max_height_width, + get_max_height_width, + safe_squeeze, +) +from ...image_transforms import center_to_corners_format, corners_to_center_format +from ...image_utils import ( + IMAGENET_DEFAULT_MEAN, + IMAGENET_DEFAULT_STD, + AnnotationFormat, + AnnotationType, + ChannelDimension, + ImageInput, + PILImageResampling, + get_image_size, + validate_annotations, +) +from ...processing_utils import Unpack +from ...utils import TensorType, auto_docstring, logging +from ...utils.import_utils import requires +from .image_processing_deformable_detr import get_size_with_aspect_ratio + + +logger = logging.get_logger(__name__) + + +class DeformableDetrFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): + r""" + format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): + Data format of the annotations. One of "coco_detection" or "coco_panoptic". + do_convert_annotations (`bool`, *optional*, defaults to `True`): + Controls whether to convert the annotations to the format expected by the DEFORMABLE_DETR model. Converts the + bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. + Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. + return_segmentation_masks (`bool`, *optional*, defaults to `False`): + Whether to return segmentation masks. + """ + + format: Optional[Union[str, AnnotationFormat]] + do_convert_annotations: Optional[bool] + return_segmentation_masks: Optional[bool] + + +SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) + + +# inspired by https://github.com/facebookresearch/deformable_detr/blob/master/datasets/coco.py#L33 +def convert_coco_poly_to_mask(segmentations, height: int, width: int, device: torch.device) -> torch.Tensor: + """ + Convert a COCO polygon annotation to a mask. + + Args: + segmentations (`list[list[float]]`): + List of polygons, each polygon represented by a list of x-y coordinates. + height (`int`): + Height of the mask. + width (`int`): + Width of the mask. + """ + try: + from pycocotools import mask as coco_mask + except ImportError: + raise ImportError("Pycocotools is not installed in your environment.") + + masks = [] + for polygons in segmentations: + rles = coco_mask.frPyObjects(polygons, height, width) + mask = coco_mask.decode(rles) + if len(mask.shape) < 3: + mask = mask[..., None] + mask = torch.as_tensor(mask, dtype=torch.uint8, device=device) + mask = torch.any(mask, axis=2) + masks.append(mask) + if masks: + masks = torch.stack(masks, axis=0) + else: + masks = torch.zeros((0, height, width), dtype=torch.uint8, device=device) + + return masks + + +# inspired by https://github.com/facebookresearch/deformable_detr/blob/master/datasets/coco.py#L50 +def prepare_coco_detection_annotation( + image, + target, + return_segmentation_masks: bool = False, + input_data_format: Optional[Union[ChannelDimension, str]] = None, +): + """ + Convert the target in COCO format into the format expected by DEFORMABLE_DETR. 
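+
+    Only non-crowd annotations (`iscrowd` missing or equal to 0) are kept. The returned target is a dictionary
+    with `image_id`, `class_labels`, `boxes` (corner format, clipped to the image, with degenerate boxes dropped),
+    `area`, `iscrowd`, `orig_size` and, depending on the inputs, `keypoints` and `masks`.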
+ """ + image_height, image_width = image.size()[-2:] + + image_id = target["image_id"] + image_id = torch.as_tensor([image_id], dtype=torch.int64, device=image.device) + + # Get all COCO annotations for the given image. + annotations = target["annotations"] + classes = [] + area = [] + boxes = [] + keypoints = [] + for obj in annotations: + if "iscrowd" not in obj or obj["iscrowd"] == 0: + classes.append(obj["category_id"]) + area.append(obj["area"]) + boxes.append(obj["bbox"]) + if "keypoints" in obj: + keypoints.append(obj["keypoints"]) + + classes = torch.as_tensor(classes, dtype=torch.int64, device=image.device) + area = torch.as_tensor(area, dtype=torch.float32, device=image.device) + iscrowd = torch.zeros_like(classes, dtype=torch.int64, device=image.device) + # guard against no boxes via resizing + boxes = torch.as_tensor(boxes, dtype=torch.float32, device=image.device).reshape(-1, 4) + boxes[:, 2:] += boxes[:, :2] + boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=image_width) + boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=image_height) + + keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) + + new_target = { + "image_id": image_id, + "class_labels": classes[keep], + "boxes": boxes[keep], + "area": area[keep], + "iscrowd": iscrowd[keep], + "orig_size": torch.as_tensor([int(image_height), int(image_width)], dtype=torch.int64, device=image.device), + } + + if keypoints: + keypoints = torch.as_tensor(keypoints, dtype=torch.float32, device=image.device) + # Apply the keep mask here to filter the relevant annotations + keypoints = keypoints[keep] + num_keypoints = keypoints.shape[0] + keypoints = keypoints.reshape((-1, 3)) if num_keypoints else keypoints + new_target["keypoints"] = keypoints + + if return_segmentation_masks: + segmentation_masks = [obj["segmentation"] for obj in annotations] + masks = convert_coco_poly_to_mask(segmentation_masks, image_height, image_width, device=image.device) + new_target["masks"] = masks[keep] + + return new_target + + +def masks_to_boxes(masks: torch.Tensor) -> torch.Tensor: + """ + Compute the bounding boxes around the provided panoptic segmentation masks. + + Args: + masks: masks in format `[number_masks, height, width]` where N is the number of masks + + Returns: + boxes: bounding boxes in format `[number_masks, 4]` in xyxy format + """ + if masks.numel() == 0: + return torch.zeros((0, 4), device=masks.device) + + h, w = masks.shape[-2:] + y = torch.arange(0, h, dtype=torch.float32, device=masks.device) + x = torch.arange(0, w, dtype=torch.float32, device=masks.device) + # see https://github.com/pytorch/pytorch/issues/50276 + y, x = torch.meshgrid(y, x, indexing="ij") + + x_mask = masks * torch.unsqueeze(x, 0) + x_max = x_mask.view(x_mask.shape[0], -1).max(-1)[0] + x_min = ( + torch.where(masks, x.unsqueeze(0), torch.tensor(1e8, device=masks.device)).view(masks.shape[0], -1).min(-1)[0] + ) + + y_mask = masks * torch.unsqueeze(y, 0) + y_max = y_mask.view(y_mask.shape[0], -1).max(-1)[0] + y_min = ( + torch.where(masks, y.unsqueeze(0), torch.tensor(1e8, device=masks.device)).view(masks.shape[0], -1).min(-1)[0] + ) + + return torch.stack([x_min, y_min, x_max, y_max], 1) + + +# 2 functions below adapted from https://github.com/cocodataset/panopticapi/blob/master/panopticapi/utils.py +# Copyright (c) 2018, Alexander Kirillov +# All rights reserved. +def rgb_to_id(color): + """ + Converts RGB color to unique ID. 
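+
+    The ID is computed as `R + 256 * G + 256**2 * B`; for example, the color `(10, 20, 30)` maps to
+    `10 + 5120 + 1966080 = 1971210`.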
+ """ + if isinstance(color, torch.Tensor) and len(color.shape) == 3: + if color.dtype == torch.uint8: + color = color.to(torch.int32) + return color[:, :, 0] + 256 * color[:, :, 1] + 256 * 256 * color[:, :, 2] + return int(color[0] + 256 * color[1] + 256 * 256 * color[2]) + + +def prepare_coco_panoptic_annotation( + image: torch.Tensor, + target: dict, + masks_path: Union[str, pathlib.Path], + return_masks: bool = True, + input_data_format: Union[ChannelDimension, str] = None, +) -> dict: + """ + Prepare a coco panoptic annotation for DEFORMABLE_DETR. + """ + image_height, image_width = get_image_size(image, channel_dim=input_data_format) + annotation_path = pathlib.Path(masks_path) / target["file_name"] + + new_target = {} + new_target["image_id"] = torch.as_tensor( + [target["image_id"] if "image_id" in target else target["id"]], dtype=torch.int64, device=image.device + ) + new_target["size"] = torch.as_tensor([image_height, image_width], dtype=torch.int64, device=image.device) + new_target["orig_size"] = torch.as_tensor([image_height, image_width], dtype=torch.int64, device=image.device) + + if "segments_info" in target: + masks = read_image(annotation_path).permute(1, 2, 0).to(dtype=torch.int32, device=image.device) + masks = rgb_to_id(masks) + + ids = torch.as_tensor([segment_info["id"] for segment_info in target["segments_info"]], device=image.device) + masks = masks == ids[:, None, None] + masks = masks.to(torch.bool) + if return_masks: + new_target["masks"] = masks + new_target["boxes"] = masks_to_boxes(masks) + new_target["class_labels"] = torch.as_tensor( + [segment_info["category_id"] for segment_info in target["segments_info"]], + dtype=torch.int64, + device=image.device, + ) + new_target["iscrowd"] = torch.as_tensor( + [segment_info["iscrowd"] for segment_info in target["segments_info"]], + dtype=torch.int64, + device=image.device, + ) + new_target["area"] = torch.as_tensor( + [segment_info["area"] for segment_info in target["segments_info"]], + dtype=torch.float32, + device=image.device, + ) + + return new_target + + +@auto_docstring +@requires(backends=("torchvision", "torch")) +class DeformableDetrImageProcessorFast(BaseImageProcessorFast): + resample = PILImageResampling.BILINEAR + image_mean = IMAGENET_DEFAULT_MEAN + image_std = IMAGENET_DEFAULT_STD + format = AnnotationFormat.COCO_DETECTION + do_resize = True + do_rescale = True + do_normalize = True + do_pad = True + size = {"shortest_edge": 800, "longest_edge": 1333} + default_to_square = False + model_input_names = ["pixel_values", "pixel_mask"] + valid_kwargs = DeformableDetrFastImageProcessorKwargs + + def __init__(self, **kwargs: Unpack[DeformableDetrFastImageProcessorKwargs]) -> None: + if "pad_and_return_pixel_mask" in kwargs: + kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask") + + size = kwargs.pop("size", None) + if "max_size" in kwargs: + logger.warning_once( + "The `max_size` parameter is deprecated and will be removed in v4.26. 
" + "Please specify in `size['longest_edge'] instead`.", + ) + max_size = kwargs.pop("max_size") + else: + max_size = None if size is None else 1333 + + size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333} + self.size = get_size_dict(size, max_size=max_size, default_to_square=False) + + # Backwards compatibility + do_convert_annotations = kwargs.get("do_convert_annotations") + do_normalize = kwargs.get("do_normalize") + if do_convert_annotations is None and getattr(self, "do_convert_annotations", None) is None: + self.do_convert_annotations = do_normalize if do_normalize is not None else self.do_normalize + + super().__init__(**kwargs) + + @classmethod + def from_dict(cls, image_processor_dict: dict[str, Any], **kwargs): + """ + Overrides the `from_dict` method from the base class to make sure parameters are updated if image processor is + created using from_dict and kwargs e.g. `DeformableDetrImageProcessorFast.from_pretrained(checkpoint, size=600, + max_size=800)` + """ + image_processor_dict = image_processor_dict.copy() + if "max_size" in kwargs: + image_processor_dict["max_size"] = kwargs.pop("max_size") + if "pad_and_return_pixel_mask" in kwargs: + image_processor_dict["pad_and_return_pixel_mask"] = kwargs.pop("pad_and_return_pixel_mask") + return super().from_dict(image_processor_dict, **kwargs) + + def prepare_annotation( + self, + image: torch.Tensor, + target: dict, + format: Optional[AnnotationFormat] = None, + return_segmentation_masks: Optional[bool] = None, + masks_path: Optional[Union[str, pathlib.Path]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> dict: + """ + Prepare an annotation for feeding into DEFORMABLE_DETR model. + """ + format = format if format is not None else self.format + + if format == AnnotationFormat.COCO_DETECTION: + return_segmentation_masks = False if return_segmentation_masks is None else return_segmentation_masks + target = prepare_coco_detection_annotation( + image, target, return_segmentation_masks, input_data_format=input_data_format + ) + elif format == AnnotationFormat.COCO_PANOPTIC: + return_segmentation_masks = True if return_segmentation_masks is None else return_segmentation_masks + target = prepare_coco_panoptic_annotation( + image, + target, + masks_path=masks_path, + return_masks=return_segmentation_masks, + input_data_format=input_data_format, + ) + else: + raise ValueError(f"Format {format} is not supported.") + return target + + def resize( + self, + image: torch.Tensor, + size: SizeDict, + interpolation: Optional["F.InterpolationMode"] = None, + **kwargs, + ) -> torch.Tensor: + """ + Resize the image to the given size. Size can be `min_size` (scalar) or `(height, width)` tuple. If size is an + int, smaller edge of the image will be matched to this number. + + Args: + image (`torch.Tensor`): + Image to resize. + size (`SizeDict`): + Size of the image's `(height, width)` dimensions after resizing. Available options are: + - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. + Do NOT keep the aspect ratio. + - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting + the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge + less or equal to `longest_edge`. 
+ - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the + aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to + `max_width`. + interpolation (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`): + Resampling filter to use if resizing the image. + """ + interpolation = interpolation if interpolation is not None else F.InterpolationMode.BILINEAR + if size.shortest_edge and size.longest_edge: + # Resize the image so that the shortest edge or the longest edge is of the given size + # while maintaining the aspect ratio of the original image. + new_size = get_size_with_aspect_ratio( + image.size()[-2:], + size["shortest_edge"], + size["longest_edge"], + ) + elif size.max_height and size.max_width: + new_size = get_image_size_for_max_height_width(image.size()[-2:], size["max_height"], size["max_width"]) + elif size.height and size.width: + new_size = (size["height"], size["width"]) + else: + raise ValueError( + "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got" + f" {size.keys()}." + ) + + image = F.resize( + image, + size=new_size, + interpolation=interpolation, + **kwargs, + ) + return image + + def resize_annotation( + self, + annotation: dict[str, Any], + orig_size: tuple[int, int], + target_size: tuple[int, int], + threshold: float = 0.5, + interpolation: Optional["F.InterpolationMode"] = None, + ): + """ + Resizes an annotation to a target size. + + Args: + annotation (`dict[str, Any]`): + The annotation dictionary. + orig_size (`tuple[int, int]`): + The original size of the input image. + target_size (`tuple[int, int]`): + The target size of the image, as returned by the preprocessing `resize` step. + threshold (`float`, *optional*, defaults to 0.5): + The threshold used to binarize the segmentation masks. + resample (`InterpolationMode`, defaults to `F.InterpolationMode.NEAREST_EXACT`): + The resampling filter to use when resizing the masks. 
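+
+        Boxes are scaled by the width/height ratios between `target_size` and `orig_size`, areas by the product
+        of those ratios, and masks are resized and re-binarized with `threshold`; all other keys are copied
+        unchanged.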
+ """ + interpolation = interpolation if interpolation is not None else F.InterpolationMode.NEAREST_EXACT + ratio_height, ratio_width = [target / orig for target, orig in zip(target_size, orig_size)] + + new_annotation = {} + new_annotation["size"] = target_size + + for key, value in annotation.items(): + if key == "boxes": + boxes = value + scaled_boxes = boxes * torch.as_tensor( + [ratio_width, ratio_height, ratio_width, ratio_height], dtype=torch.float32, device=boxes.device + ) + new_annotation["boxes"] = scaled_boxes + elif key == "area": + area = value + scaled_area = area * (ratio_width * ratio_height) + new_annotation["area"] = scaled_area + elif key == "masks": + masks = value[:, None] + masks = [F.resize(mask, target_size, interpolation=interpolation) for mask in masks] + masks = torch.stack(masks).to(torch.float32) + masks = masks[:, 0] > threshold + new_annotation["masks"] = masks + elif key == "size": + new_annotation["size"] = target_size + else: + new_annotation[key] = value + + return new_annotation + + def normalize_annotation(self, annotation: dict, image_size: tuple[int, int]) -> dict: + image_height, image_width = image_size + norm_annotation = {} + for key, value in annotation.items(): + if key == "boxes": + boxes = value + boxes = corners_to_center_format(boxes) + boxes /= torch.as_tensor( + [image_width, image_height, image_width, image_height], dtype=torch.float32, device=boxes.device + ) + norm_annotation[key] = boxes + else: + norm_annotation[key] = value + return norm_annotation + + def _update_annotation_for_padded_image( + self, + annotation: dict, + input_image_size: tuple[int, int], + output_image_size: tuple[int, int], + padding, + update_bboxes, + ) -> dict: + """ + Update the annotation for a padded image. + """ + new_annotation = {} + new_annotation["size"] = output_image_size + ratio_height, ratio_width = (input / output for output, input in zip(output_image_size, input_image_size)) + + for key, value in annotation.items(): + if key == "masks": + masks = value + masks = F.pad( + masks, + padding, + fill=0, + ) + masks = safe_squeeze(masks, 1) + new_annotation["masks"] = masks + elif key == "boxes" and update_bboxes: + boxes = value + boxes *= torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height], device=boxes.device) + new_annotation["boxes"] = boxes + elif key == "size": + new_annotation["size"] = output_image_size + else: + new_annotation[key] = value + return new_annotation + + def pad( + self, + image: torch.Tensor, + padded_size: tuple[int, int], + annotation: Optional[dict[str, Any]] = None, + update_bboxes: bool = True, + fill: int = 0, + ): + original_size = image.size()[-2:] + padding_bottom = padded_size[0] - original_size[0] + padding_right = padded_size[1] - original_size[1] + if padding_bottom < 0 or padding_right < 0: + raise ValueError( + f"Padding dimensions are negative. Please make sure that the padded size is larger than the " + f"original size. Got padded size: {padded_size}, original size: {original_size}." + ) + if original_size != padded_size: + padding = [0, 0, padding_right, padding_bottom] + image = F.pad(image, padding, fill=fill) + if annotation is not None: + annotation = self._update_annotation_for_padded_image( + annotation, original_size, padded_size, padding, update_bboxes + ) + + # Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding. 
+ pixel_mask = torch.zeros(padded_size, dtype=torch.int64, device=image.device) + pixel_mask[: original_size[0], : original_size[1]] = 1 + + return image, pixel_mask, annotation + + @auto_docstring + def preprocess( + self, + images: ImageInput, + annotations: Optional[Union[AnnotationType, list[AnnotationType]]] = None, + masks_path: Optional[Union[str, pathlib.Path]] = None, + **kwargs: Unpack[DeformableDetrFastImageProcessorKwargs], + ) -> BatchFeature: + r""" + annotations (`AnnotationType` or `list[AnnotationType]`, *optional*): + List of annotations associated with the image or batch of images. If annotation is for object + detection, the annotations should be a dictionary with the following keys: + - "image_id" (`int`): The image id. + - "annotations" (`list[Dict]`): List of annotations for an image. Each annotation should be a + dictionary. An image can have no annotations, in which case the list should be empty. + If annotation is for segmentation, the annotations should be a dictionary with the following keys: + - "image_id" (`int`): The image id. + - "segments_info" (`list[Dict]`): List of segments for an image. Each segment should be a dictionary. + An image can have no segments, in which case the list should be empty. + - "file_name" (`str`): The file name of the image. + masks_path (`str` or `pathlib.Path`, *optional*): + Path to the directory containing the segmentation masks. + """ + if "pad_and_return_pixel_mask" in kwargs: + kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask") + logger.warning_once( + "The `pad_and_return_pixel_mask` argument is deprecated and will be removed in a future version, " + "use `do_pad` instead." + ) + + if "max_size" in kwargs: + logger.warning_once( + "The `max_size` argument is deprecated and will be removed in a future version, use" + " `size['longest_edge']` instead." + ) + kwargs["size"] = kwargs.pop("max_size") + + return super().preprocess(images, annotations, masks_path, **kwargs) + + def _preprocess( + self, + images: list["torch.Tensor"], + annotations: Optional[Union[AnnotationType, list[AnnotationType]]], + masks_path: Optional[Union[str, pathlib.Path]], + return_segmentation_masks: bool, + do_resize: bool, + size: SizeDict, + interpolation: Optional["F.InterpolationMode"], + do_rescale: bool, + rescale_factor: float, + do_normalize: bool, + do_convert_annotations: bool, + image_mean: Optional[Union[float, list[float]]], + image_std: Optional[Union[float, list[float]]], + do_pad: bool, + pad_size: Optional[SizeDict], + format: Optional[Union[str, AnnotationFormat]], + return_tensors: Optional[Union[str, TensorType]], + **kwargs, + ) -> BatchFeature: + """ + Preprocess an image or a batch of images so that it can be used by the model. + """ + if annotations is not None and isinstance(annotations, dict): + annotations = [annotations] + + if annotations is not None and len(images) != len(annotations): + raise ValueError( + f"The number of images ({len(images)}) and annotations ({len(annotations)}) do not match." + ) + + format = AnnotationFormat(format) + if annotations is not None: + validate_annotations(format, SUPPORTED_ANNOTATION_FORMATS, annotations) + + if ( + masks_path is not None + and format == AnnotationFormat.COCO_PANOPTIC + and not isinstance(masks_path, (pathlib.Path, str)) + ): + raise ValueError( + "The path to the directory containing the mask PNG files should be provided as a" + f" `pathlib.Path` or string object, but is {type(masks_path)} instead." 
+ ) + + data = {} + + processed_images = [] + processed_annotations = [] + pixel_masks = [] # Initialize pixel_masks here + for image, annotation in zip(images, annotations if annotations is not None else [None] * len(images)): + # prepare (COCO annotations as a list of Dict -> DEFORMABLE_DETR target as a single Dict per image) + if annotations is not None: + annotation = self.prepare_annotation( + image, + annotation, + format, + return_segmentation_masks=return_segmentation_masks, + masks_path=masks_path, + input_data_format=ChannelDimension.FIRST, + ) + + if do_resize: + resized_image = self.resize(image, size=size, interpolation=interpolation) + if annotations is not None: + annotation = self.resize_annotation( + annotation, + orig_size=image.size()[-2:], + target_size=resized_image.size()[-2:], + ) + image = resized_image + # Fused rescale and normalize + image = self.rescale_and_normalize(image, do_rescale, rescale_factor, do_normalize, image_mean, image_std) + if do_convert_annotations and annotations is not None: + annotation = self.normalize_annotation(annotation, get_image_size(image, ChannelDimension.FIRST)) + + processed_images.append(image) + processed_annotations.append(annotation) + images = processed_images + annotations = processed_annotations if annotations is not None else None + + if do_pad: + # depends on all resized image shapes so we need another loop + if pad_size is not None: + padded_size = (pad_size.height, pad_size.width) + else: + padded_size = get_max_height_width(images) + + padded_images = [] + padded_annotations = [] + for image, annotation in zip(images, annotations if annotations is not None else [None] * len(images)): + # Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...} + if padded_size == image.size()[-2:]: + padded_images.append(image) + pixel_masks.append(torch.ones(padded_size, dtype=torch.int64, device=image.device)) + padded_annotations.append(annotation) + continue + image, pixel_mask, annotation = self.pad( + image, padded_size, annotation=annotation, update_bboxes=do_convert_annotations + ) + padded_images.append(image) + padded_annotations.append(annotation) + pixel_masks.append(pixel_mask) + images = padded_images + annotations = padded_annotations if annotations is not None else None + data.update({"pixel_mask": torch.stack(pixel_masks, dim=0)}) + + data.update({"pixel_values": torch.stack(images, dim=0)}) + encoded_inputs = BatchFeature(data, tensor_type=return_tensors) + if annotations is not None: + encoded_inputs["labels"] = [ + BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations + ] + return encoded_inputs + + def post_process(self, outputs, target_sizes): + """ + Converts the raw output of [`DeformableDetrForObjectDetection`] into final bounding boxes in (top_left_x, + top_left_y, bottom_right_x, bottom_right_y) format. Only supports PyTorch. + + Args: + outputs ([`DeformableDetrObjectDetectionOutput`]): + Raw outputs of the model. + target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): + Tensor containing the size (height, width) of each image of the batch. For evaluation, this must be the + original image size (before any data augmentation). For visualization, this should be the image size + after data augment, but before padding. + Returns: + `list[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image + in the batch as predicted by the model. 
+ """ + logger.warning_once( + "`post_process` is deprecated and will be removed in v5 of Transformers, please use" + " `post_process_object_detection` instead, with `threshold=0.` for equivalent results.", + ) + + out_logits, out_bbox = outputs.logits, outputs.pred_boxes + + if len(out_logits) != len(target_sizes): + raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits") + if target_sizes.shape[1] != 2: + raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch") + + prob = out_logits.sigmoid() + topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 100, dim=1) + scores = topk_values + topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor") + labels = topk_indexes % out_logits.shape[2] + boxes = center_to_corners_format(out_bbox) + boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4)) + + # and from relative [0, 1] to absolute [0, height] coordinates + img_h, img_w = target_sizes.unbind(1) + scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1) + boxes = boxes * scale_fct[:, None, :] + + results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)] + + return results + + def post_process_object_detection( + self, outputs, threshold: float = 0.5, target_sizes: Union[TensorType, list[tuple]] = None, top_k: int = 100 + ): + """ + Converts the raw output of [`DeformableDetrForObjectDetection`] into final bounding boxes in (top_left_x, + top_left_y, bottom_right_x, bottom_right_y) format. Only supports PyTorch. + + Args: + outputs ([`DetrObjectDetectionOutput`]): + Raw outputs of the model. + threshold (`float`, *optional*): + Score threshold to keep object detection predictions. + target_sizes (`torch.Tensor` or `list[tuple[int, int]]`, *optional*): + Tensor of shape `(batch_size, 2)` or list of tuples (`tuple[int, int]`) containing the target size + (height, width) of each image in the batch. If left to None, predictions will not be resized. + top_k (`int`, *optional*, defaults to 100): + Keep only top k bounding boxes before filtering by thresholding. + + Returns: + `list[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image + in the batch as predicted by the model. 
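+
+        Example (an illustrative sketch; assumes the publicly available `SenseTime/deformable-detr` checkpoint):
+
+        ```python
+        >>> import torch
+        >>> import requests
+        >>> from PIL import Image
+        >>> from transformers import DeformableDetrForObjectDetection, DeformableDetrImageProcessorFast
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> image_processor = DeformableDetrImageProcessorFast.from_pretrained("SenseTime/deformable-detr")
+        >>> model = DeformableDetrForObjectDetection.from_pretrained("SenseTime/deformable-detr")
+
+        >>> inputs = image_processor(images=image, return_tensors="pt")
+        >>> with torch.no_grad():
+        ...     outputs = model(**inputs)
+
+        >>> # (height, width) of the original image, used to rescale the normalized boxes
+        >>> target_sizes = torch.tensor([image.size[::-1]])
+        >>> results = image_processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)[0]
+        >>> sorted(results.keys())
+        ['boxes', 'labels', 'scores']
+        ```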
+ """ + out_logits, out_bbox = outputs.logits, outputs.pred_boxes + + if target_sizes is not None: + if len(out_logits) != len(target_sizes): + raise ValueError( + "Make sure that you pass in as many target sizes as the batch dimension of the logits" + ) + + prob = out_logits.sigmoid() + prob = prob.view(out_logits.shape[0], -1) + k_value = min(top_k, prob.size(1)) + topk_values, topk_indexes = torch.topk(prob, k_value, dim=1) + scores = topk_values + topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor") + labels = topk_indexes % out_logits.shape[2] + boxes = center_to_corners_format(out_bbox) + boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4)) + + # and from relative [0, 1] to absolute [0, height] coordinates + if target_sizes is not None: + if isinstance(target_sizes, list): + img_h = torch.Tensor([i[0] for i in target_sizes]) + img_w = torch.Tensor([i[1] for i in target_sizes]) + else: + img_h, img_w = target_sizes.unbind(1) + scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device) + boxes = boxes * scale_fct[:, None, :] + + results = [] + for s, l, b in zip(scores, labels, boxes): + score = s[s > threshold] + label = l[s > threshold] + box = b[s > threshold] + results.append({"scores": score, "labels": label, "boxes": box}) + + return results + + +__all__ = ["DeformableDetrImageProcessorFast"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deformable_detr/modeling_deformable_detr.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deformable_detr/modeling_deformable_detr.py new file mode 100644 index 0000000000000000000000000000000000000000..34f5bce7a5c4f6bc12d8fdf670a0bc381f7f6850 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deformable_detr/modeling_deformable_detr.py @@ -0,0 +1,1897 @@ +# coding=utf-8 +# Copyright 2022 SenseTime and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""PyTorch Deformable DETR model.""" + +import copy +import math +import warnings +from dataclasses import dataclass +from typing import Any, Optional, Union + +import torch +import torch.nn.functional as F +from torch import Tensor, nn + +from ...activations import ACT2FN +from ...integrations import use_kernel_forward_from_hub +from ...modeling_attn_mask_utils import _prepare_4d_attention_mask +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import BaseModelOutput +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import meshgrid +from ...utils import ( + ModelOutput, + auto_docstring, + is_timm_available, + logging, + requires_backends, +) +from ...utils.backbone_utils import load_backbone +from .configuration_deformable_detr import DeformableDetrConfig + + +logger = logging.get_logger(__name__) + + +if is_timm_available(): + from timm import create_model + + +logger = logging.get_logger(__name__) + + +@use_kernel_forward_from_hub("MultiScaleDeformableAttention") +class MultiScaleDeformableAttention(nn.Module): + def forward( + self, + value: Tensor, + value_spatial_shapes: Tensor, + value_spatial_shapes_list: list[tuple], + level_start_index: Tensor, + sampling_locations: Tensor, + attention_weights: Tensor, + im2col_step: int, + ): + batch_size, _, num_heads, hidden_dim = value.shape + _, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape + value_list = value.split([height * width for height, width in value_spatial_shapes_list], dim=1) + sampling_grids = 2 * sampling_locations - 1 + sampling_value_list = [] + for level_id, (height, width) in enumerate(value_spatial_shapes_list): + # batch_size, height*width, num_heads, hidden_dim + # -> batch_size, height*width, num_heads*hidden_dim + # -> batch_size, num_heads*hidden_dim, height*width + # -> batch_size*num_heads, hidden_dim, height, width + value_l_ = ( + value_list[level_id] + .flatten(2) + .transpose(1, 2) + .reshape(batch_size * num_heads, hidden_dim, height, width) + ) + # batch_size, num_queries, num_heads, num_points, 2 + # -> batch_size, num_heads, num_queries, num_points, 2 + # -> batch_size*num_heads, num_queries, num_points, 2 + sampling_grid_l_ = sampling_grids[:, :, :, level_id].transpose(1, 2).flatten(0, 1) + # batch_size*num_heads, hidden_dim, num_queries, num_points + sampling_value_l_ = nn.functional.grid_sample( + value_l_, + sampling_grid_l_, + mode="bilinear", + padding_mode="zeros", + align_corners=False, + ) + sampling_value_list.append(sampling_value_l_) + # (batch_size, num_queries, num_heads, num_levels, num_points) + # -> (batch_size, num_heads, num_queries, num_levels, num_points) + # -> (batch_size, num_heads, 1, num_queries, num_levels*num_points) + attention_weights = attention_weights.transpose(1, 2).reshape( + batch_size * num_heads, 1, num_queries, num_levels * num_points + ) + output = ( + (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights) + .sum(-1) + .view(batch_size, num_heads * hidden_dim, num_queries) + ) + return output.transpose(1, 2).contiguous() + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for outputs of the DeformableDetrDecoder. This class adds two attributes to + BaseModelOutputWithCrossAttentions, namely: + - a stacked tensor of intermediate decoder hidden states (i.e. the output of each decoder layer) + - a stacked tensor of intermediate reference points. 
+ """ +) +class DeformableDetrDecoderOutput(ModelOutput): + r""" + intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`): + Stacked intermediate hidden states (output of each layer of the decoder). + intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, hidden_size)`): + Stacked intermediate reference points (reference points of each layer of the decoder). + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax, + used to compute the weighted average in the cross-attention heads. + """ + + last_hidden_state: Optional[torch.FloatTensor] = None + intermediate_hidden_states: Optional[torch.FloatTensor] = None + intermediate_reference_points: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + cross_attentions: Optional[tuple[torch.FloatTensor]] = None + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for outputs of the Deformable DETR encoder-decoder model. + """ +) +class DeformableDetrModelOutput(ModelOutput): + r""" + init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): + Initial reference points sent through the Transformer decoder. + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the decoder of the model. + intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`): + Stacked intermediate hidden states (output of each layer of the decoder). + intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`): + Stacked intermediate reference points (reference points of each layer of the decoder). + enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): + Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are + picked as region proposals in the first stage. Output of bounding box binary classification (i.e. + foreground and background). + enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): + Logits of predicted bounding boxes coordinates in the first stage. 
+ """ + + init_reference_points: Optional[torch.FloatTensor] = None + last_hidden_state: Optional[torch.FloatTensor] = None + intermediate_hidden_states: Optional[torch.FloatTensor] = None + intermediate_reference_points: Optional[torch.FloatTensor] = None + decoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[tuple[torch.FloatTensor]] = None + cross_attentions: Optional[tuple[torch.FloatTensor]] = None + encoder_last_hidden_state: Optional[torch.FloatTensor] = None + encoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None + encoder_attentions: Optional[tuple[torch.FloatTensor]] = None + enc_outputs_class: Optional[torch.FloatTensor] = None + enc_outputs_coord_logits: Optional[torch.FloatTensor] = None + + +@dataclass +@auto_docstring( + custom_intro=""" + Output type of [`DeformableDetrForObjectDetection`]. + """ +) +class DeformableDetrObjectDetectionOutput(ModelOutput): + r""" + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)): + Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a + bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized + scale-invariant IoU loss. + loss_dict (`Dict`, *optional*): + A dictionary containing the individual losses. Useful for logging. + logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`): + Classification logits (including no-object) for all queries. + pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): + Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These + values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding + possible padding). You can use [`~DeformableDetrProcessor.post_process_object_detection`] to retrieve the + unnormalized bounding boxes. + auxiliary_outputs (`list[Dict]`, *optional*): + Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`) + and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and + `pred_boxes`) for each decoder layer. + init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): + Initial reference points sent through the Transformer decoder. + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the decoder of the model. + intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`): + Stacked intermediate hidden states (output of each layer of the decoder). + intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`): + Stacked intermediate reference points (reference points of each layer of the decoder). + enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): + Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are + picked as region proposals in the first stage. Output of bounding box binary classification (i.e. + foreground and background). 
+ enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`): + Logits of predicted bounding boxes coordinates in the first stage. + """ + + loss: Optional[torch.FloatTensor] = None + loss_dict: Optional[dict] = None + logits: Optional[torch.FloatTensor] = None + pred_boxes: Optional[torch.FloatTensor] = None + auxiliary_outputs: Optional[list[dict]] = None + init_reference_points: Optional[torch.FloatTensor] = None + last_hidden_state: Optional[torch.FloatTensor] = None + intermediate_hidden_states: Optional[torch.FloatTensor] = None + intermediate_reference_points: Optional[torch.FloatTensor] = None + decoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[tuple[torch.FloatTensor]] = None + cross_attentions: Optional[tuple[torch.FloatTensor]] = None + encoder_last_hidden_state: Optional[torch.FloatTensor] = None + encoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None + encoder_attentions: Optional[tuple[torch.FloatTensor]] = None + enc_outputs_class: Any = None + enc_outputs_coord_logits: Optional[torch.FloatTensor] = None + + +def _get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + + +def inverse_sigmoid(x, eps=1e-5): + x = x.clamp(min=0, max=1) + x1 = x.clamp(min=eps) + x2 = (1 - x).clamp(min=eps) + return torch.log(x1 / x2) + + +# Copied from transformers.models.detr.modeling_detr.DetrFrozenBatchNorm2d with Detr->DeformableDetr +class DeformableDetrFrozenBatchNorm2d(nn.Module): + """ + BatchNorm2d where the batch statistics and the affine parameters are fixed. + + Copy-paste from torchvision.misc.ops with added eps before rqsrt, without which any other models than + torchvision.models.resnet[18,34,50,101] produce nans. + """ + + def __init__(self, n): + super().__init__() + self.register_buffer("weight", torch.ones(n)) + self.register_buffer("bias", torch.zeros(n)) + self.register_buffer("running_mean", torch.zeros(n)) + self.register_buffer("running_var", torch.ones(n)) + + def _load_from_state_dict( + self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ): + num_batches_tracked_key = prefix + "num_batches_tracked" + if num_batches_tracked_key in state_dict: + del state_dict[num_batches_tracked_key] + + super()._load_from_state_dict( + state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ) + + def forward(self, x): + # move reshapes to the beginning + # to make it user-friendly + weight = self.weight.reshape(1, -1, 1, 1) + bias = self.bias.reshape(1, -1, 1, 1) + running_var = self.running_var.reshape(1, -1, 1, 1) + running_mean = self.running_mean.reshape(1, -1, 1, 1) + epsilon = 1e-5 + scale = weight * (running_var + epsilon).rsqrt() + bias = bias - running_mean * scale + return x * scale + bias + + +# Copied from transformers.models.detr.modeling_detr.replace_batch_norm with Detr->DeformableDetr +def replace_batch_norm(model): + r""" + Recursively replace all `torch.nn.BatchNorm2d` with `DeformableDetrFrozenBatchNorm2d`. 
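+    The replacement modules copy the original weights, biases and running statistics, but never update them.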
+ + Args: + model (torch.nn.Module): + input model + """ + for name, module in model.named_children(): + if isinstance(module, nn.BatchNorm2d): + new_module = DeformableDetrFrozenBatchNorm2d(module.num_features) + + if module.weight.device != torch.device("meta"): + new_module.weight.data.copy_(module.weight) + new_module.bias.data.copy_(module.bias) + new_module.running_mean.data.copy_(module.running_mean) + new_module.running_var.data.copy_(module.running_var) + + model._modules[name] = new_module + + if len(list(module.children())) > 0: + replace_batch_norm(module) + + +class DeformableDetrConvEncoder(nn.Module): + """ + Convolutional backbone, using either the AutoBackbone API or one from the timm library. + + nn.BatchNorm2d layers are replaced by DeformableDetrFrozenBatchNorm2d as defined above. + + """ + + def __init__(self, config): + super().__init__() + + self.config = config + + # For backwards compatibility we have to use the timm library directly instead of the AutoBackbone API + if config.use_timm_backbone: + # We default to values which were previously hard-coded. This enables configurability from the config + # using backbone arguments, while keeping the default behavior the same. + requires_backends(self, ["timm"]) + kwargs = getattr(config, "backbone_kwargs", {}) + kwargs = {} if kwargs is None else kwargs.copy() + out_indices = kwargs.pop("out_indices", (2, 3, 4) if config.num_feature_levels > 1 else (4,)) + num_channels = kwargs.pop("in_chans", config.num_channels) + if config.dilation: + kwargs["output_stride"] = kwargs.get("output_stride", 16) + backbone = create_model( + config.backbone, + pretrained=config.use_pretrained_backbone, + features_only=True, + out_indices=out_indices, + in_chans=num_channels, + **kwargs, + ) + else: + backbone = load_backbone(config) + + # replace batch norm by frozen batch norm + with torch.no_grad(): + replace_batch_norm(backbone) + self.model = backbone + self.intermediate_channel_sizes = ( + self.model.feature_info.channels() if config.use_timm_backbone else self.model.channels + ) + + backbone_model_type = None + if config.backbone is not None: + backbone_model_type = config.backbone + elif config.backbone_config is not None: + backbone_model_type = config.backbone_config.model_type + else: + raise ValueError("Either `backbone` or `backbone_config` should be provided in the config") + + if "resnet" in backbone_model_type: + for name, parameter in self.model.named_parameters(): + if config.use_timm_backbone: + if "layer2" not in name and "layer3" not in name and "layer4" not in name: + parameter.requires_grad_(False) + else: + if "stage.1" not in name and "stage.2" not in name and "stage.3" not in name: + parameter.requires_grad_(False) + + # Copied from transformers.models.detr.modeling_detr.DetrConvEncoder.forward with Detr->DeformableDetr + def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor): + # send pixel_values through the model to get list of feature maps + features = self.model(pixel_values) if self.config.use_timm_backbone else self.model(pixel_values).feature_maps + + out = [] + for feature_map in features: + # downsample pixel_mask to match shape of corresponding feature_map + mask = nn.functional.interpolate(pixel_mask[None].float(), size=feature_map.shape[-2:]).to(torch.bool)[0] + out.append((feature_map, mask)) + return out + + +# Copied from transformers.models.detr.modeling_detr.DetrConvModel with Detr->DeformableDetr +class DeformableDetrConvModel(nn.Module): + """ + This module adds 2D position 
embeddings to all intermediate feature maps of the convolutional encoder. + """ + + def __init__(self, conv_encoder, position_embedding): + super().__init__() + self.conv_encoder = conv_encoder + self.position_embedding = position_embedding + + def forward(self, pixel_values, pixel_mask): + # send pixel_values and pixel_mask through backbone to get list of (feature_map, pixel_mask) tuples + out = self.conv_encoder(pixel_values, pixel_mask) + pos = [] + for feature_map, mask in out: + # position encoding + pos.append(self.position_embedding(feature_map, mask).to(feature_map.dtype)) + + return out, pos + + +class DeformableDetrSinePositionEmbedding(nn.Module): + """ + This is a more standard version of the position embedding, very similar to the one used by the Attention is all you + need paper, generalized to work on images. + """ + + def __init__(self, embedding_dim=64, temperature=10000, normalize=False, scale=None): + super().__init__() + self.embedding_dim = embedding_dim + self.temperature = temperature + self.normalize = normalize + if scale is not None and normalize is False: + raise ValueError("normalize should be True if scale is passed") + if scale is None: + scale = 2 * math.pi + self.scale = scale + + def forward(self, pixel_values, pixel_mask): + if pixel_mask is None: + raise ValueError("No pixel mask provided") + y_embed = pixel_mask.cumsum(1, dtype=pixel_values.dtype) + x_embed = pixel_mask.cumsum(2, dtype=pixel_values.dtype) + if self.normalize: + eps = 1e-6 + y_embed = (y_embed - 0.5) / (y_embed[:, -1:, :] + eps) * self.scale + x_embed = (x_embed - 0.5) / (x_embed[:, :, -1:] + eps) * self.scale + + dim_t = torch.arange(self.embedding_dim, dtype=pixel_values.dtype, device=pixel_values.device) + dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / self.embedding_dim) + + pos_x = x_embed[:, :, :, None] / dim_t + pos_y = y_embed[:, :, :, None] / dim_t + pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) + return pos + + +# Copied from transformers.models.detr.modeling_detr.DetrLearnedPositionEmbedding +class DeformableDetrLearnedPositionEmbedding(nn.Module): + """ + This module learns positional embeddings up to a fixed maximum size. 
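+    Row and column embeddings each have 50 entries, so feature maps of up to 50x50 positions are supported.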
+ """ + + def __init__(self, embedding_dim=256): + super().__init__() + self.row_embeddings = nn.Embedding(50, embedding_dim) + self.column_embeddings = nn.Embedding(50, embedding_dim) + + def forward(self, pixel_values, pixel_mask=None): + height, width = pixel_values.shape[-2:] + width_values = torch.arange(width, device=pixel_values.device) + height_values = torch.arange(height, device=pixel_values.device) + x_emb = self.column_embeddings(width_values) + y_emb = self.row_embeddings(height_values) + pos = torch.cat([x_emb.unsqueeze(0).repeat(height, 1, 1), y_emb.unsqueeze(1).repeat(1, width, 1)], dim=-1) + pos = pos.permute(2, 0, 1) + pos = pos.unsqueeze(0) + pos = pos.repeat(pixel_values.shape[0], 1, 1, 1) + return pos + + +# Copied from transformers.models.detr.modeling_detr.build_position_encoding with Detr->DeformableDetr +def build_position_encoding(config): + n_steps = config.d_model // 2 + if config.position_embedding_type == "sine": + # TODO find a better way of exposing other arguments + position_embedding = DeformableDetrSinePositionEmbedding(n_steps, normalize=True) + elif config.position_embedding_type == "learned": + position_embedding = DeformableDetrLearnedPositionEmbedding(n_steps) + else: + raise ValueError(f"Not supported {config.position_embedding_type}") + + return position_embedding + + +class DeformableDetrMultiscaleDeformableAttention(nn.Module): + """ + Multiscale deformable attention as proposed in Deformable DETR. + """ + + def __init__(self, config: DeformableDetrConfig, num_heads: int, n_points: int): + super().__init__() + + self.attn = MultiScaleDeformableAttention() + + if config.d_model % num_heads != 0: + raise ValueError( + f"embed_dim (d_model) must be divisible by num_heads, but got {config.d_model} and {num_heads}" + ) + dim_per_head = config.d_model // num_heads + # check if dim_per_head is power of 2 + if not ((dim_per_head & (dim_per_head - 1) == 0) and dim_per_head != 0): + warnings.warn( + "You'd better set embed_dim (d_model) in DeformableDetrMultiscaleDeformableAttention to make the" + " dimension of each attention head a power of 2 which is more efficient in the authors' CUDA" + " implementation." 
+ ) + + self.im2col_step = 64 + + self.d_model = config.d_model + self.n_levels = config.num_feature_levels + self.n_heads = num_heads + self.n_points = n_points + + self.sampling_offsets = nn.Linear(config.d_model, num_heads * self.n_levels * n_points * 2) + self.attention_weights = nn.Linear(config.d_model, num_heads * self.n_levels * n_points) + self.value_proj = nn.Linear(config.d_model, config.d_model) + self.output_proj = nn.Linear(config.d_model, config.d_model) + + self.disable_custom_kernels = config.disable_custom_kernels + + def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]): + return tensor if position_embeddings is None else tensor + position_embeddings + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states=None, + encoder_attention_mask=None, + position_embeddings: Optional[torch.Tensor] = None, + reference_points=None, + spatial_shapes=None, + spatial_shapes_list=None, + level_start_index=None, + output_attentions: bool = False, + ): + # add position embeddings to the hidden states before projecting to queries and keys + if position_embeddings is not None: + hidden_states = self.with_pos_embed(hidden_states, position_embeddings) + + batch_size, num_queries, _ = hidden_states.shape + batch_size, sequence_length, _ = encoder_hidden_states.shape + total_elements = sum(height * width for height, width in spatial_shapes_list) + if total_elements != sequence_length: + raise ValueError( + "Make sure to align the spatial shapes with the sequence length of the encoder hidden states" + ) + + value = self.value_proj(encoder_hidden_states) + if attention_mask is not None: + # we invert the attention_mask + value = value.masked_fill(~attention_mask[..., None], float(0)) + value = value.view(batch_size, sequence_length, self.n_heads, self.d_model // self.n_heads) + sampling_offsets = self.sampling_offsets(hidden_states).view( + batch_size, num_queries, self.n_heads, self.n_levels, self.n_points, 2 + ) + attention_weights = self.attention_weights(hidden_states).view( + batch_size, num_queries, self.n_heads, self.n_levels * self.n_points + ) + attention_weights = F.softmax(attention_weights, -1).view( + batch_size, num_queries, self.n_heads, self.n_levels, self.n_points + ) + # batch_size, num_queries, n_heads, n_levels, n_points, 2 + num_coordinates = reference_points.shape[-1] + if num_coordinates == 2: + offset_normalizer = torch.stack([spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) + sampling_locations = ( + reference_points[:, :, None, :, None, :] + + sampling_offsets / offset_normalizer[None, None, None, :, None, :] + ) + elif num_coordinates == 4: + sampling_locations = ( + reference_points[:, :, None, :, None, :2] + + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 + ) + else: + raise ValueError(f"Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}") + + output = self.attn( + value, + spatial_shapes, + spatial_shapes_list, + level_start_index, + sampling_locations, + attention_weights, + self.im2col_step, + ) + + output = self.output_proj(output) + + return output, attention_weights + + +class DeformableDetrMultiheadAttention(nn.Module): + """ + Multi-headed attention from 'Attention Is All You Need' paper. + + Here, we add position embeddings to the queries and keys (as explained in the Deformable DETR paper). 
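+
+ Note that the position embeddings are only added to the queries and keys; the values are projected from the
+ original hidden states without position information (see `hidden_states_original` in `forward`).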
+ """ + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + bias: bool = True, + ): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + if self.head_dim * num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {num_heads})." + ) + self.scaling = self.head_dim**-0.5 + + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + + def _shape(self, tensor: torch.Tensor, seq_len: int, batch_size: int): + return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]): + return tensor if position_embeddings is None else tensor + position_embeddings + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_embeddings: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + batch_size, target_len, embed_dim = hidden_states.size() + # add position embeddings to the hidden states before projecting to queries and keys + if position_embeddings is not None: + hidden_states_original = hidden_states + hidden_states = self.with_pos_embed(hidden_states, position_embeddings) + + # get queries, keys and values + query_states = self.q_proj(hidden_states) * self.scaling + key_states = self._shape(self.k_proj(hidden_states), -1, batch_size) + value_states = self._shape(self.v_proj(hidden_states_original), -1, batch_size) + + proj_shape = (batch_size * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, target_len, batch_size).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*proj_shape) + + source_len = key_states.size(1) + + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (batch_size * self.num_heads, target_len, source_len): + raise ValueError( + f"Attention weights should be of size {(batch_size * self.num_heads, target_len, source_len)}, but is" + f" {attn_weights.size()}" + ) + + # expand attention_mask + if attention_mask is not None: + # [batch_size, seq_len] -> [batch_size, 1, target_seq_len, source_seq_len] + attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype) + + if attention_mask is not None: + if attention_mask.size() != (batch_size, 1, target_len, source_len): + raise ValueError( + f"Attention mask should be of size {(batch_size, 1, target_len, source_len)}, but is" + f" {attention_mask.size()}" + ) + if attention_mask.dtype == torch.bool: + attention_mask = torch.zeros_like(attention_mask, dtype=attn_weights.dtype).masked_fill_( + attention_mask, -torch.inf + ) + attn_weights = attn_weights.view(batch_size, self.num_heads, target_len, source_len) + attention_mask + attn_weights = attn_weights.view(batch_size * self.num_heads, target_len, source_len) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if output_attentions: + # this operation is a bit awkward, but it's required to + # make sure that attn_weights keeps 
its gradient. + # In order to do so, attn_weights have to reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(batch_size, self.num_heads, target_len, source_len) + attn_weights = attn_weights_reshaped.view(batch_size * self.num_heads, target_len, source_len) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != ( + batch_size * self.num_heads, + target_len, + self.head_dim, + ): + raise ValueError( + f"`attn_output` should be of size {(batch_size, self.num_heads, target_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.view(batch_size, self.num_heads, target_len, self.head_dim) + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(batch_size, target_len, embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped + + +class DeformableDetrEncoderLayer(GradientCheckpointingLayer): + def __init__(self, config: DeformableDetrConfig): + super().__init__() + self.embed_dim = config.d_model + self.self_attn = DeformableDetrMultiscaleDeformableAttention( + config, + num_heads=config.encoder_attention_heads, + n_points=config.encoder_n_points, + ) + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim) + self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + position_embeddings: Optional[torch.Tensor] = None, + reference_points=None, + spatial_shapes=None, + spatial_shapes_list=None, + level_start_index=None, + output_attentions: bool = False, + ): + """ + Args: + hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Input to the layer. + attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): + Attention mask. + position_embeddings (`torch.FloatTensor`, *optional*): + Position embeddings, to be added to `hidden_states`. + reference_points (`torch.FloatTensor`, *optional*): + Reference points. + spatial_shapes (`torch.LongTensor`, *optional*): + Spatial shapes of the backbone feature maps. + level_start_index (`torch.LongTensor`, *optional*): + Level start index. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + # Apply Multi-scale Deformable Attention Module on the multi-scale feature maps. 
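+ # In the encoder, the queries and the attended values come from the same flattened multi-scale feature map:
+ # `hidden_states` is passed both as the query input and as `encoder_hidden_states`. Each query samples only
+ # `n_points` locations per feature level (via the predicted sampling offsets) instead of attending over the
+ # whole sequence, which keeps the attention cost roughly linear in the number of feature map pixels.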
+ hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + encoder_hidden_states=hidden_states, + encoder_attention_mask=attention_mask, + position_embeddings=position_embeddings, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + spatial_shapes_list=spatial_shapes_list, + level_start_index=level_start_index, + output_attentions=output_attentions, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + if self.training: + if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class DeformableDetrDecoderLayer(GradientCheckpointingLayer): + def __init__(self, config: DeformableDetrConfig): + super().__init__() + self.embed_dim = config.d_model + + # self-attention + self.self_attn = DeformableDetrMultiheadAttention( + embed_dim=self.embed_dim, + num_heads=config.decoder_attention_heads, + dropout=config.attention_dropout, + ) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + # cross-attention + self.encoder_attn = DeformableDetrMultiscaleDeformableAttention( + config, + num_heads=config.decoder_attention_heads, + n_points=config.decoder_n_points, + ) + self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) + # feedforward neural networks + self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) + self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: Optional[torch.Tensor] = None, + reference_points=None, + spatial_shapes=None, + spatial_shapes_list=None, + level_start_index=None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ): + """ + Args: + hidden_states (`torch.FloatTensor`): + Input to the layer of shape `(seq_len, batch, embed_dim)`. + position_embeddings (`torch.FloatTensor`, *optional*): + Position embeddings that are added to the queries and keys in the self-attention layer. + reference_points (`torch.FloatTensor`, *optional*): + Reference points. + spatial_shapes (`torch.LongTensor`, *optional*): + Spatial shapes. + level_start_index (`torch.LongTensor`, *optional*): + Level start index. 
+ encoder_hidden_states (`torch.FloatTensor`): + cross attention input to the layer of shape `(seq_len, batch, embed_dim)` + encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size + `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative + values. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + # Self Attention + hidden_states, self_attn_weights = self.self_attn( + hidden_states=hidden_states, + position_embeddings=position_embeddings, + output_attentions=output_attentions, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + second_residual = hidden_states + + # Cross-Attention + cross_attn_weights = None + hidden_states, cross_attn_weights = self.encoder_attn( + hidden_states=hidden_states, + attention_mask=encoder_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + position_embeddings=position_embeddings, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + spatial_shapes_list=spatial_shapes_list, + level_start_index=level_start_index, + output_attentions=output_attentions, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = second_residual + hidden_states + + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + # Fully Connected + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights, cross_attn_weights) + + return outputs + + +@auto_docstring +class DeformableDetrPreTrainedModel(PreTrainedModel): + config: DeformableDetrConfig + base_model_prefix = "model" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + _no_split_modules = [ + r"DeformableDetrConvEncoder", + r"DeformableDetrEncoderLayer", + r"DeformableDetrDecoderLayer", + ] + + def _init_weights(self, module): + std = self.config.init_std + + if isinstance(module, DeformableDetrLearnedPositionEmbedding): + nn.init.uniform_(module.row_embeddings.weight) + nn.init.uniform_(module.column_embeddings.weight) + elif isinstance(module, DeformableDetrMultiscaleDeformableAttention): + nn.init.constant_(module.sampling_offsets.weight.data, 0.0) + default_dtype = torch.get_default_dtype() + thetas = torch.arange(module.n_heads, dtype=torch.int64).to(default_dtype) * ( + 2.0 * math.pi / module.n_heads + ) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = ( + (grid_init / grid_init.abs().max(-1, keepdim=True)[0]) + .view(module.n_heads, 1, 1, 2) + .repeat(1, module.n_levels, module.n_points, 1) + ) + for i in range(module.n_points): + grid_init[:, :, i, :] *= i + 1 + with torch.no_grad(): + module.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) + 
nn.init.constant_(module.attention_weights.weight.data, 0.0) + nn.init.constant_(module.attention_weights.bias.data, 0.0) + nn.init.xavier_uniform_(module.value_proj.weight.data) + nn.init.constant_(module.value_proj.bias.data, 0.0) + nn.init.xavier_uniform_(module.output_proj.weight.data) + nn.init.constant_(module.output_proj.bias.data, 0.0) + elif isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + if hasattr(module, "reference_points") and not self.config.two_stage: + nn.init.xavier_uniform_(module.reference_points.weight.data, gain=1.0) + nn.init.constant_(module.reference_points.bias.data, 0.0) + if hasattr(module, "level_embed"): + nn.init.normal_(module.level_embed) + + +class DeformableDetrEncoder(DeformableDetrPreTrainedModel): + """ + Transformer encoder consisting of *config.encoder_layers* deformable attention layers. Each layer is a + [`DeformableDetrEncoderLayer`]. + + The encoder updates the flattened multi-scale feature maps through multiple deformable attention layers. + + Args: + config: DeformableDetrConfig + """ + + def __init__(self, config: DeformableDetrConfig): + super().__init__(config) + self.gradient_checkpointing = False + + self.dropout = config.dropout + self.layers = nn.ModuleList([DeformableDetrEncoderLayer(config) for _ in range(config.encoder_layers)]) + + # Initialize weights and apply final processing + self.post_init() + + @staticmethod + def get_reference_points(spatial_shapes, valid_ratios, device): + """ + Get reference points for each feature map. Used in decoder. + + Args: + spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`): + Spatial shapes of each feature map. + valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`): + Valid ratios of each feature map. + device (`torch.device`): + Device on which to create the tensors. + Returns: + `torch.FloatTensor` of shape `(batch_size, num_queries, num_feature_levels, 2)` + """ + reference_points_list = [] + for level, (height, width) in enumerate(spatial_shapes): + ref_y, ref_x = meshgrid( + torch.linspace(0.5, height - 0.5, height, dtype=valid_ratios.dtype, device=device), + torch.linspace(0.5, width - 0.5, width, dtype=valid_ratios.dtype, device=device), + indexing="ij", + ) + # TODO: valid_ratios could be useless here. 
check https://github.com/fundamentalvision/Deformable-DETR/issues/36 + ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, level, 1] * height) + ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, level, 0] * width) + ref = torch.stack((ref_x, ref_y), -1) + reference_points_list.append(ref) + reference_points = torch.cat(reference_points_list, 1) + reference_points = reference_points[:, :, None] * valid_ratios[:, None] + return reference_points + + def forward( + self, + inputs_embeds=None, + attention_mask=None, + position_embeddings=None, + spatial_shapes=None, + spatial_shapes_list=None, + level_start_index=None, + valid_ratios=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Flattened feature map (output of the backbone + projection layer) that is passed to the encoder. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding pixel features. Mask values selected in `[0, 1]`: + - 1 for pixel features that are real (i.e. **not masked**), + - 0 for pixel features that are padding (i.e. **masked**). + [What are attention masks?](../glossary#attention-mask) + position_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Position embeddings that are added to the queries and keys in each self-attention layer. + spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`): + Spatial shapes of each feature map. + level_start_index (`torch.LongTensor` of shape `(num_feature_levels)`): + Starting index of each feature map. + valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`): + Ratio of valid area in each feature level. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. 
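+ spatial_shapes_list (`list[tuple[int, int]]`, *optional*):
+ Same information as `spatial_shapes`, but as a plain Python list of `(height, width)` tuples (converted
+ to a tuple for `get_reference_points` and forwarded to each encoder layer).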
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + hidden_states = inputs_embeds + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + spatial_shapes_tuple = tuple(spatial_shapes_list) + reference_points = self.get_reference_points(spatial_shapes_tuple, valid_ratios, device=inputs_embeds.device) + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + for i, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + position_embeddings=position_embeddings, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + spatial_shapes_list=spatial_shapes_list, + level_start_index=level_start_index, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=encoder_states, + attentions=all_attentions, + ) + + +class DeformableDetrDecoder(DeformableDetrPreTrainedModel): + """ + Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`DeformableDetrDecoderLayer`]. + + The decoder updates the query embeddings through multiple self-attention and cross-attention layers. + + Some tweaks for Deformable DETR: + + - `position_embeddings`, `reference_points`, `spatial_shapes` and `valid_ratios` are added to the forward pass. + - it also returns a stack of intermediate outputs and reference points from all decoding layers. + + Args: + config: DeformableDetrConfig + """ + + def __init__(self, config: DeformableDetrConfig): + super().__init__(config) + + self.dropout = config.dropout + self.layers = nn.ModuleList([DeformableDetrDecoderLayer(config) for _ in range(config.decoder_layers)]) + self.gradient_checkpointing = False + + # hack implementation for iterative bounding box refinement and two-stage Deformable DETR + self.bbox_embed = None + self.class_embed = None + + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + position_embeddings=None, + reference_points=None, + spatial_shapes=None, + spatial_shapes_list=None, + level_start_index=None, + valid_ratios=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`): + The query embeddings that are passed into the decoder. + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + of the decoder. 
+ encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing cross-attention on padding pixel_values of the encoder. Mask values selected + in `[0, 1]`: + - 1 for pixels that are real (i.e. **not masked**), + - 0 for pixels that are padding (i.e. **masked**). + position_embeddings (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*): + Position embeddings that are added to the queries and keys in each self-attention layer. + reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)` is `as_two_stage` else `(batch_size, num_queries, 2)` or , *optional*): + Reference point in range `[0, 1]`, top-left (0,0), bottom-right (1, 1), including padding area. + spatial_shapes (`torch.FloatTensor` of shape `(num_feature_levels, 2)`): + Spatial shapes of the feature maps. + level_start_index (`torch.LongTensor` of shape `(num_feature_levels)`, *optional*): + Indexes for the start of each feature level. In range `[0, sequence_length]`. + valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`, *optional*): + Ratio of valid area in each feature level. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if inputs_embeds is not None: + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + intermediate = () + intermediate_reference_points = () + + for idx, decoder_layer in enumerate(self.layers): + num_coordinates = reference_points.shape[-1] + if num_coordinates == 4: + reference_points_input = ( + reference_points[:, :, None] * torch.cat([valid_ratios, valid_ratios], -1)[:, None] + ) + elif reference_points.shape[-1] == 2: + reference_points_input = reference_points[:, :, None] * valid_ratios[:, None] + else: + raise ValueError("Reference points' last dimension must be of size 2") + + if output_hidden_states: + all_hidden_states += (hidden_states,) + + layer_outputs = decoder_layer( + hidden_states, + position_embeddings, + reference_points_input, + spatial_shapes, + spatial_shapes_list, + level_start_index, + encoder_hidden_states, # as a positional argument for gradient checkpointing + encoder_attention_mask, + output_attentions, + ) + + hidden_states = layer_outputs[0] + + # hack implementation for iterative bounding box refinement + if self.bbox_embed is not None: + tmp = self.bbox_embed[idx](hidden_states) + num_coordinates = reference_points.shape[-1] + if num_coordinates == 4: + new_reference_points = tmp + inverse_sigmoid(reference_points) + new_reference_points = new_reference_points.sigmoid() + elif num_coordinates == 
2: + new_reference_points = tmp + new_reference_points[..., :2] = tmp[..., :2] + inverse_sigmoid(reference_points) + new_reference_points = new_reference_points.sigmoid() + else: + raise ValueError( + f"Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}" + ) + reference_points = new_reference_points.detach() + + intermediate += (hidden_states,) + intermediate_reference_points += (reference_points,) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if encoder_hidden_states is not None: + all_cross_attentions += (layer_outputs[2],) + + # Keep batch_size as first dimension + intermediate = torch.stack(intermediate, dim=1) + intermediate_reference_points = torch.stack(intermediate_reference_points, dim=1) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + intermediate, + intermediate_reference_points, + all_hidden_states, + all_self_attns, + all_cross_attentions, + ] + if v is not None + ) + return DeformableDetrDecoderOutput( + last_hidden_state=hidden_states, + intermediate_hidden_states=intermediate, + intermediate_reference_points=intermediate_reference_points, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attentions, + ) + + +@auto_docstring( + custom_intro=""" + The bare Deformable DETR Model (consisting of a backbone and encoder-decoder Transformer) outputting raw + hidden-states without any specific head on top. + """ +) +class DeformableDetrModel(DeformableDetrPreTrainedModel): + def __init__(self, config: DeformableDetrConfig): + super().__init__(config) + + # Create backbone + positional encoding + backbone = DeformableDetrConvEncoder(config) + position_embeddings = build_position_encoding(config) + self.backbone = DeformableDetrConvModel(backbone, position_embeddings) + + # Create input projection layers + if config.num_feature_levels > 1: + num_backbone_outs = len(backbone.intermediate_channel_sizes) + input_proj_list = [] + for _ in range(num_backbone_outs): + in_channels = backbone.intermediate_channel_sizes[_] + input_proj_list.append( + nn.Sequential( + nn.Conv2d(in_channels, config.d_model, kernel_size=1), + nn.GroupNorm(32, config.d_model), + ) + ) + for _ in range(config.num_feature_levels - num_backbone_outs): + input_proj_list.append( + nn.Sequential( + nn.Conv2d( + in_channels, + config.d_model, + kernel_size=3, + stride=2, + padding=1, + ), + nn.GroupNorm(32, config.d_model), + ) + ) + in_channels = config.d_model + self.input_proj = nn.ModuleList(input_proj_list) + else: + self.input_proj = nn.ModuleList( + [ + nn.Sequential( + nn.Conv2d( + backbone.intermediate_channel_sizes[-1], + config.d_model, + kernel_size=1, + ), + nn.GroupNorm(32, config.d_model), + ) + ] + ) + + if not config.two_stage: + self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model * 2) + + self.encoder = DeformableDetrEncoder(config) + self.decoder = DeformableDetrDecoder(config) + + self.level_embed = nn.Parameter(torch.Tensor(config.num_feature_levels, config.d_model)) + + if config.two_stage: + self.enc_output = nn.Linear(config.d_model, config.d_model) + self.enc_output_norm = nn.LayerNorm(config.d_model) + self.pos_trans = nn.Linear(config.d_model * 2, config.d_model * 2) + self.pos_trans_norm = nn.LayerNorm(config.d_model * 2) + else: + self.reference_points = nn.Linear(config.d_model, 2) + + self.post_init() + + def 
get_encoder(self): + return self.encoder + + def freeze_backbone(self): + for name, param in self.backbone.conv_encoder.model.named_parameters(): + param.requires_grad_(False) + + def unfreeze_backbone(self): + for name, param in self.backbone.conv_encoder.model.named_parameters(): + param.requires_grad_(True) + + def get_valid_ratio(self, mask, dtype=torch.float32): + """Get the valid ratio of all feature maps.""" + + _, height, width = mask.shape + valid_height = torch.sum(mask[:, :, 0], 1) + valid_width = torch.sum(mask[:, 0, :], 1) + valid_ratio_height = valid_height.to(dtype) / height + valid_ratio_width = valid_width.to(dtype) / width + valid_ratio = torch.stack([valid_ratio_width, valid_ratio_height], -1) + return valid_ratio + + def get_proposal_pos_embed(self, proposals): + """Get the position embedding of the proposals.""" + + num_pos_feats = self.config.d_model // 2 + temperature = 10000 + scale = 2 * math.pi + + dim_t = torch.arange(num_pos_feats, dtype=proposals.dtype, device=proposals.device) + dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats) + # batch_size, num_queries, 4 + proposals = proposals.sigmoid() * scale + # batch_size, num_queries, 4, 128 + pos = proposals[:, :, :, None] / dim_t + # batch_size, num_queries, 4, 64, 2 -> batch_size, num_queries, 512 + pos = torch.stack((pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()), dim=4).flatten(2) + return pos + + def gen_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes): + """Generate the encoder output proposals from encoded enc_output. + + Args: + enc_output (Tensor[batch_size, sequence_length, hidden_size]): Output of the encoder. + padding_mask (Tensor[batch_size, sequence_length]): Padding mask for `enc_output`. + spatial_shapes (list[tuple[int, int]]): Spatial shapes of the feature maps. + + Returns: + `tuple(torch.FloatTensor)`: A tuple of feature map and bbox prediction. + - object_query (Tensor[batch_size, sequence_length, hidden_size]): Object query features. Later used to + directly predict a bounding box. (without the need of a decoder) + - output_proposals (Tensor[batch_size, sequence_length, 4]): Normalized proposals, after an inverse + sigmoid. 
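+
+ The proposals are returned in logit space, i.e. after applying the inverse sigmoid `log(p / (1 - p))` to the
+ normalized box coordinates; positions that lie on padding, or whose normalized coordinates fall outside
+ `(0.01, 0.99)`, are filled with `inf` to mark them as invalid.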
+ """ + batch_size = enc_output.shape[0] + proposals = [] + _cur = 0 + for level, (height, width) in enumerate(spatial_shapes): + mask_flatten_ = padding_mask[:, _cur : (_cur + height * width)].view(batch_size, height, width, 1) + valid_height = torch.sum(~mask_flatten_[:, :, 0, 0], 1) + valid_width = torch.sum(~mask_flatten_[:, 0, :, 0], 1) + + grid_y, grid_x = meshgrid( + torch.linspace( + 0, + height - 1, + height, + dtype=enc_output.dtype, + device=enc_output.device, + ), + torch.linspace( + 0, + width - 1, + width, + dtype=enc_output.dtype, + device=enc_output.device, + ), + indexing="ij", + ) + grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1) + + scale = torch.cat([valid_width.unsqueeze(-1), valid_height.unsqueeze(-1)], 1).view(batch_size, 1, 1, 2) + grid = (grid.unsqueeze(0).expand(batch_size, -1, -1, -1) + 0.5) / scale + width_height = torch.ones_like(grid) * 0.05 * (2.0**level) + proposal = torch.cat((grid, width_height), -1).view(batch_size, -1, 4) + proposals.append(proposal) + _cur += height * width + output_proposals = torch.cat(proposals, 1) + output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True) + output_proposals = torch.log(output_proposals / (1 - output_proposals)) # inverse sigmoid + output_proposals = output_proposals.masked_fill(padding_mask.unsqueeze(-1), float("inf")) + output_proposals = output_proposals.masked_fill(~output_proposals_valid, float("inf")) + + # assign each pixel as an object query + object_query = enc_output + object_query = object_query.masked_fill(padding_mask.unsqueeze(-1), float(0)) + object_query = object_query.masked_fill(~output_proposals_valid, float(0)) + object_query = self.enc_output_norm(self.enc_output(object_query)) + return object_query, output_proposals + + @auto_docstring + def forward( + self, + pixel_values: torch.FloatTensor, + pixel_mask: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.FloatTensor] = None, + encoder_outputs: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.FloatTensor], DeformableDetrModelOutput]: + r""" + decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*): + Not used by default. Can be used to mask object queries. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you + can choose to directly pass a flattened representation of an image. + decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*): + Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an + embedded representation. 
+ + Examples: + + ```python + >>> from transformers import AutoImageProcessor, DeformableDetrModel + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("SenseTime/deformable-detr") + >>> model = DeformableDetrModel.from_pretrained("SenseTime/deformable-detr") + + >>> inputs = image_processor(images=image, return_tensors="pt") + + >>> outputs = model(**inputs) + + >>> last_hidden_states = outputs.last_hidden_state + >>> list(last_hidden_states.shape) + [1, 300, 256] + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + batch_size, num_channels, height, width = pixel_values.shape + device = pixel_values.device + + if pixel_mask is None: + pixel_mask = torch.ones(((batch_size, height, width)), dtype=torch.long, device=device) + + # Extract multi-scale feature maps of same resolution `config.d_model` (cf Figure 4 in paper) + # First, sent pixel_values + pixel_mask through Backbone to obtain the features + # which is a list of tuples + features, position_embeddings_list = self.backbone(pixel_values, pixel_mask) + + # Then, apply 1x1 convolution to reduce the channel dimension to d_model (256 by default) + sources = [] + masks = [] + for level, (source, mask) in enumerate(features): + sources.append(self.input_proj[level](source)) + masks.append(mask) + if mask is None: + raise ValueError("No attention mask was provided") + + # Lowest resolution feature maps are obtained via 3x3 stride 2 convolutions on the final stage + if self.config.num_feature_levels > len(sources): + _len_sources = len(sources) + for level in range(_len_sources, self.config.num_feature_levels): + if level == _len_sources: + source = self.input_proj[level](features[-1][0]) + else: + source = self.input_proj[level](sources[-1]) + mask = nn.functional.interpolate(pixel_mask[None].to(pixel_values.dtype), size=source.shape[-2:]).to( + torch.bool + )[0] + pos_l = self.backbone.position_embedding(source, mask).to(source.dtype) + sources.append(source) + masks.append(mask) + position_embeddings_list.append(pos_l) + + # Create queries + query_embeds = None + if not self.config.two_stage: + query_embeds = self.query_position_embeddings.weight + + # Prepare encoder inputs (by flattening) + source_flatten = [] + mask_flatten = [] + lvl_pos_embed_flatten = [] + spatial_shapes_list = [] + for level, (source, mask, pos_embed) in enumerate(zip(sources, masks, position_embeddings_list)): + batch_size, num_channels, height, width = source.shape + spatial_shape = (height, width) + spatial_shapes_list.append(spatial_shape) + source = source.flatten(2).transpose(1, 2) + mask = mask.flatten(1) + pos_embed = pos_embed.flatten(2).transpose(1, 2) + lvl_pos_embed = pos_embed + self.level_embed[level].view(1, 1, -1) + lvl_pos_embed_flatten.append(lvl_pos_embed) + source_flatten.append(source) + mask_flatten.append(mask) + source_flatten = torch.cat(source_flatten, 1) + mask_flatten = torch.cat(mask_flatten, 1) + lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1) + spatial_shapes = torch.as_tensor(spatial_shapes_list, dtype=torch.long, device=source_flatten.device) + 
level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1])) + valid_ratios = torch.stack([self.get_valid_ratio(m, dtype=source_flatten.dtype) for m in masks], 1) + + # Fourth, sent source_flatten + mask_flatten + lvl_pos_embed_flatten (backbone + proj layer output) through encoder + # Also provide spatial_shapes, level_start_index and valid_ratios + if encoder_outputs is None: + encoder_outputs = self.encoder( + inputs_embeds=source_flatten, + attention_mask=mask_flatten, + position_embeddings=lvl_pos_embed_flatten, + spatial_shapes=spatial_shapes, + spatial_shapes_list=spatial_shapes_list, + level_start_index=level_start_index, + valid_ratios=valid_ratios, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + # Fifth, prepare decoder inputs + batch_size, _, num_channels = encoder_outputs[0].shape + enc_outputs_class = None + enc_outputs_coord_logits = None + if self.config.two_stage: + object_query_embedding, output_proposals = self.gen_encoder_output_proposals( + encoder_outputs[0], ~mask_flatten, spatial_shapes_list + ) + + # hack implementation for two-stage Deformable DETR + # apply a detection head to each pixel (A.4 in paper) + # linear projection for bounding box binary classification (i.e. foreground and background) + enc_outputs_class = self.decoder.class_embed[-1](object_query_embedding) + # 3-layer FFN to predict bounding boxes coordinates (bbox regression branch) + delta_bbox = self.decoder.bbox_embed[-1](object_query_embedding) + enc_outputs_coord_logits = delta_bbox + output_proposals + + # only keep top scoring `config.two_stage_num_proposals` proposals + topk = self.config.two_stage_num_proposals + topk_proposals = torch.topk(enc_outputs_class[..., 0], topk, dim=1)[1] + topk_coords_logits = torch.gather( + enc_outputs_coord_logits, + 1, + topk_proposals.unsqueeze(-1).repeat(1, 1, 4), + ) + + topk_coords_logits = topk_coords_logits.detach() + reference_points = topk_coords_logits.sigmoid() + init_reference_points = reference_points + pos_trans_out = self.pos_trans_norm(self.pos_trans(self.get_proposal_pos_embed(topk_coords_logits))) + query_embed, target = torch.split(pos_trans_out, num_channels, dim=2) + else: + query_embed, target = torch.split(query_embeds, num_channels, dim=1) + query_embed = query_embed.unsqueeze(0).expand(batch_size, -1, -1) + target = target.unsqueeze(0).expand(batch_size, -1, -1) + reference_points = self.reference_points(query_embed).sigmoid() + init_reference_points = reference_points + + decoder_outputs = self.decoder( + inputs_embeds=target, + position_embeddings=query_embed, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=mask_flatten, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + spatial_shapes_list=spatial_shapes_list, + level_start_index=level_start_index, + valid_ratios=valid_ratios, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + enc_outputs = tuple(value for value in [enc_outputs_class, 
enc_outputs_coord_logits] if value is not None) + tuple_outputs = (init_reference_points,) + decoder_outputs + encoder_outputs + enc_outputs + + return tuple_outputs + + return DeformableDetrModelOutput( + init_reference_points=init_reference_points, + last_hidden_state=decoder_outputs.last_hidden_state, + intermediate_hidden_states=decoder_outputs.intermediate_hidden_states, + intermediate_reference_points=decoder_outputs.intermediate_reference_points, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + enc_outputs_class=enc_outputs_class, + enc_outputs_coord_logits=enc_outputs_coord_logits, + ) + + +# Copied from transformers.models.detr.modeling_detr.DetrMLPPredictionHead +class DeformableDetrMLPPredictionHead(nn.Module): + """ + Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates, + height and width of a bounding box w.r.t. an image. + + Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py + + """ + + def __init__(self, input_dim, hidden_dim, output_dim, num_layers): + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x + + +@auto_docstring( + custom_intro=""" + Deformable DETR Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on + top, for tasks such as COCO detection. 
+ """ +) +class DeformableDetrForObjectDetection(DeformableDetrPreTrainedModel): + # When using clones, all layers > 0 will be clones, but layer 0 *is* required + _tied_weights_keys = [r"bbox_embed\.[1-9]\d*", r"class_embed\.[1-9]\d*"] + # We can't initialize the model on meta device as some weights are modified during the initialization + _no_split_modules = None + + def __init__(self, config: DeformableDetrConfig): + super().__init__(config) + + # Deformable DETR encoder-decoder model + self.model = DeformableDetrModel(config) + # Detection heads on top + self.class_embed = nn.Linear(config.d_model, config.num_labels) + self.bbox_embed = DeformableDetrMLPPredictionHead( + input_dim=config.d_model, + hidden_dim=config.d_model, + output_dim=4, + num_layers=3, + ) + + # if two-stage, the last class_embed and bbox_embed is for region proposal generation + num_pred = (config.decoder_layers + 1) if config.two_stage else config.decoder_layers + if config.with_box_refine: + self.class_embed = _get_clones(self.class_embed, num_pred) + self.bbox_embed = _get_clones(self.bbox_embed, num_pred) + # hack implementation for iterative bounding box refinement + self.model.decoder.bbox_embed = self.bbox_embed + else: + self.class_embed = nn.ModuleList([self.class_embed for _ in range(num_pred)]) + self.bbox_embed = nn.ModuleList([self.bbox_embed for _ in range(num_pred)]) + self.model.decoder.bbox_embed = None + if config.two_stage: + # hack implementation for two-stage + self.model.decoder.class_embed = self.class_embed + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + pixel_values: torch.FloatTensor, + pixel_mask: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.FloatTensor] = None, + encoder_outputs: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[list[dict]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.FloatTensor], DeformableDetrObjectDetectionOutput]: + r""" + decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*): + Not used by default. Can be used to mask object queries. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you + can choose to directly pass a flattened representation of an image. + decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*): + Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an + embedded representation. + labels (`list[Dict]` of len `(batch_size,)`, *optional*): + Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the + following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch + respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes + in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`. 
+ + Examples: + + ```python + >>> from transformers import AutoImageProcessor, DeformableDetrForObjectDetection + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("SenseTime/deformable-detr") + >>> model = DeformableDetrForObjectDetection.from_pretrained("SenseTime/deformable-detr") + + >>> inputs = image_processor(images=image, return_tensors="pt") + >>> outputs = model(**inputs) + + >>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax) + >>> target_sizes = torch.tensor([image.size[::-1]]) + >>> results = image_processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)[ + ... 0 + ... ] + >>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]): + ... box = [round(i, 2) for i in box.tolist()] + ... print( + ... f"Detected {model.config.id2label[label.item()]} with confidence " + ... f"{round(score.item(), 3)} at location {box}" + ... ) + Detected cat with confidence 0.8 at location [16.5, 52.84, 318.25, 470.78] + Detected cat with confidence 0.789 at location [342.19, 24.3, 640.02, 372.25] + Detected remote with confidence 0.633 at location [40.79, 72.78, 176.76, 117.25] + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # First, sent images through DETR base model to obtain encoder + decoder outputs + outputs = self.model( + pixel_values, + pixel_mask=pixel_mask, + decoder_attention_mask=decoder_attention_mask, + encoder_outputs=encoder_outputs, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs.intermediate_hidden_states if return_dict else outputs[2] + init_reference = outputs.init_reference_points if return_dict else outputs[0] + inter_references = outputs.intermediate_reference_points if return_dict else outputs[3] + + # class logits + predicted bounding boxes + outputs_classes = [] + outputs_coords = [] + + for level in range(hidden_states.shape[1]): + if level == 0: + reference = init_reference + else: + reference = inter_references[:, level - 1] + reference = inverse_sigmoid(reference) + outputs_class = self.class_embed[level](hidden_states[:, level]) + delta_bbox = self.bbox_embed[level](hidden_states[:, level]) + if reference.shape[-1] == 4: + outputs_coord_logits = delta_bbox + reference + elif reference.shape[-1] == 2: + delta_bbox[..., :2] += reference + outputs_coord_logits = delta_bbox + else: + raise ValueError(f"reference.shape[-1] should be 4 or 2, but got {reference.shape[-1]}") + outputs_coord = outputs_coord_logits.sigmoid() + outputs_classes.append(outputs_class) + outputs_coords.append(outputs_coord) + outputs_class = torch.stack(outputs_classes) + outputs_coord = torch.stack(outputs_coords) + + logits = outputs_class[-1] + pred_boxes = outputs_coord[-1] + + loss, loss_dict, auxiliary_outputs = None, None, None + if labels is not None: + loss, loss_dict, auxiliary_outputs = self.loss_function( + logits, + labels, + self.device, + pred_boxes, + self.config, + outputs_class, + outputs_coord, + ) + if not return_dict: + if auxiliary_outputs is not None: + output = (logits, pred_boxes) + auxiliary_outputs + outputs + else: + output = (logits, pred_boxes) + 
outputs + tuple_outputs = ((loss, loss_dict) + output) if loss is not None else output + + return tuple_outputs + + dict_outputs = DeformableDetrObjectDetectionOutput( + loss=loss, + loss_dict=loss_dict, + logits=logits, + pred_boxes=pred_boxes, + auxiliary_outputs=auxiliary_outputs, + last_hidden_state=outputs.last_hidden_state, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + intermediate_hidden_states=outputs.intermediate_hidden_states, + intermediate_reference_points=outputs.intermediate_reference_points, + init_reference_points=outputs.init_reference_points, + enc_outputs_class=outputs.enc_outputs_class, + enc_outputs_coord_logits=outputs.enc_outputs_coord_logits, + ) + + return dict_outputs + + +__all__ = [ + "DeformableDetrForObjectDetection", + "DeformableDetrModel", + "DeformableDetrPreTrainedModel", +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deformable_detr/modular_deformable_detr.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deformable_detr/modular_deformable_detr.py new file mode 100644 index 0000000000000000000000000000000000000000..2e38df7845a2b656475b9b20b12c9198cc1a9ca6 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deformable_detr/modular_deformable_detr.py @@ -0,0 +1,141 @@ +from typing import Union + +import torch + +from transformers.models.detr.image_processing_detr_fast import DetrImageProcessorFast + +from ...image_transforms import center_to_corners_format +from ...utils import ( + TensorType, + logging, +) + + +logger = logging.get_logger(__name__) + + +class DeformableDetrImageProcessorFast(DetrImageProcessorFast): + def post_process(self, outputs, target_sizes): + """ + Converts the raw output of [`DeformableDetrForObjectDetection`] into final bounding boxes in (top_left_x, + top_left_y, bottom_right_x, bottom_right_y) format. Only supports PyTorch. + + Args: + outputs ([`DeformableDetrObjectDetectionOutput`]): + Raw outputs of the model. + target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): + Tensor containing the size (height, width) of each image of the batch. For evaluation, this must be the + original image size (before any data augmentation). For visualization, this should be the image size + after data augment, but before padding. + Returns: + `list[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image + in the batch as predicted by the model. 
+ """ + logger.warning_once( + "`post_process` is deprecated and will be removed in v5 of Transformers, please use" + " `post_process_object_detection` instead, with `threshold=0.` for equivalent results.", + ) + + out_logits, out_bbox = outputs.logits, outputs.pred_boxes + + if len(out_logits) != len(target_sizes): + raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits") + if target_sizes.shape[1] != 2: + raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch") + + prob = out_logits.sigmoid() + topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 100, dim=1) + scores = topk_values + topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor") + labels = topk_indexes % out_logits.shape[2] + boxes = center_to_corners_format(out_bbox) + boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4)) + + # and from relative [0, 1] to absolute [0, height] coordinates + img_h, img_w = target_sizes.unbind(1) + scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1) + boxes = boxes * scale_fct[:, None, :] + + results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)] + + return results + + def post_process_object_detection( + self, outputs, threshold: float = 0.5, target_sizes: Union[TensorType, list[tuple]] = None, top_k: int = 100 + ): + """ + Converts the raw output of [`DeformableDetrForObjectDetection`] into final bounding boxes in (top_left_x, + top_left_y, bottom_right_x, bottom_right_y) format. Only supports PyTorch. + + Args: + outputs ([`DetrObjectDetectionOutput`]): + Raw outputs of the model. + threshold (`float`, *optional*): + Score threshold to keep object detection predictions. + target_sizes (`torch.Tensor` or `list[tuple[int, int]]`, *optional*): + Tensor of shape `(batch_size, 2)` or list of tuples (`tuple[int, int]`) containing the target size + (height, width) of each image in the batch. If left to None, predictions will not be resized. + top_k (`int`, *optional*, defaults to 100): + Keep only top k bounding boxes before filtering by thresholding. + + Returns: + `list[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image + in the batch as predicted by the model. 
+ """ + out_logits, out_bbox = outputs.logits, outputs.pred_boxes + + if target_sizes is not None: + if len(out_logits) != len(target_sizes): + raise ValueError( + "Make sure that you pass in as many target sizes as the batch dimension of the logits" + ) + + prob = out_logits.sigmoid() + prob = prob.view(out_logits.shape[0], -1) + k_value = min(top_k, prob.size(1)) + topk_values, topk_indexes = torch.topk(prob, k_value, dim=1) + scores = topk_values + topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor") + labels = topk_indexes % out_logits.shape[2] + boxes = center_to_corners_format(out_bbox) + boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4)) + + # and from relative [0, 1] to absolute [0, height] coordinates + if target_sizes is not None: + if isinstance(target_sizes, list): + img_h = torch.Tensor([i[0] for i in target_sizes]) + img_w = torch.Tensor([i[1] for i in target_sizes]) + else: + img_h, img_w = target_sizes.unbind(1) + scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device) + boxes = boxes * scale_fct[:, None, :] + + results = [] + for s, l, b in zip(scores, labels, boxes): + score = s[s > threshold] + label = l[s > threshold] + box = b[s > threshold] + results.append({"scores": score, "labels": label, "boxes": box}) + + return results + + def post_process_segmentation(self): + raise NotImplementedError("Segmentation post-processing is not implemented for Deformable DETR yet.") + + def post_process_instance(self): + raise NotImplementedError("Instance post-processing is not implemented for Deformable DETR yet.") + + def post_process_panoptic(self): + raise NotImplementedError("Panoptic post-processing is not implemented for Deformable DETR yet.") + + def post_process_instance_segmentation(self): + raise NotImplementedError("Segmentation post-processing is not implemented for Deformable DETR yet.") + + def post_process_semantic_segmentation(self): + raise NotImplementedError("Semantic segmentation post-processing is not implemented for Deformable DETR yet.") + + def post_process_panoptic_segmentation(self): + raise NotImplementedError("Panoptic segmentation post-processing is not implemented for Deformable DETR yet.") + + +__all__ = ["DeformableDetrImageProcessorFast"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deprecated/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deprecated/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..87f3f1ab36f8a41559665006819a54d03d9af66d Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deprecated/__pycache__/__init__.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..db3d0a634051c713aee33b48f2962647ac48d0eb --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/__init__.py @@ -0,0 +1,29 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ....utils import _LazyModule +from ....utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_efficientformer import * + from .image_processing_efficientformer import * + from .modeling_efficientformer import * + from .modeling_tf_efficientformer import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/configuration_efficientformer.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/configuration_efficientformer.py new file mode 100644 index 0000000000000000000000000000000000000000..9aaee649ba047e5967f132eb0b3527dce5855d11 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/configuration_efficientformer.py @@ -0,0 +1,170 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""EfficientFormer model configuration""" + +from ....configuration_utils import PretrainedConfig +from ....utils import logging + + +logger = logging.get_logger(__name__) + + +class EfficientFormerConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of an [`EfficientFormerModel`]. It is used to + instantiate an EfficientFormer model according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar configuration to that of the EfficientFormer + [snap-research/efficientformer-l1](https://huggingface.co/snap-research/efficientformer-l1) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + depths (`List(int)`, *optional*, defaults to `[3, 2, 6, 4]`) + Depth of each stage. + hidden_sizes (`List(int)`, *optional*, defaults to `[48, 96, 224, 448]`) + Dimensionality of each stage. + downsamples (`List(bool)`, *optional*, defaults to `[True, True, True, True]`) + Whether or not to downsample inputs between two stages. + dim (`int`, *optional*, defaults to 448): + Number of channels in Meta3D layers + key_dim (`int`, *optional*, defaults to 32): + The size of the key in meta3D block. 
+ attention_ratio (`int`, *optional*, defaults to 4): + Ratio of the dimension of the query and value to the dimension of the key in MSHA block + resolution (`int`, *optional*, defaults to 7) + Size of each patch + num_hidden_layers (`int`, *optional*, defaults to 5): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the 3D MetaBlock. + mlp_expansion_ratio (`int`, *optional*, defaults to 4): + Ratio of size of the hidden dimensionality of an MLP to the dimensionality of its input. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings and encoder. + patch_size (`int`, *optional*, defaults to 16): + The size (resolution) of each patch. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + pool_size (`int`, *optional*, defaults to 3): + Kernel size of pooling layers. + downsample_patch_size (`int`, *optional*, defaults to 3): + The size of patches in downsampling layers. + downsample_stride (`int`, *optional*, defaults to 2): + The stride of convolution kernels in downsampling layers. + downsample_pad (`int`, *optional*, defaults to 1): + Padding in downsampling layers. + drop_path_rate (`int`, *optional*, defaults to 0): + Rate at which to increase dropout probability in DropPath. + num_meta3d_blocks (`int`, *optional*, defaults to 1): + The number of 3D MetaBlocks in the last stage. + distillation (`bool`, *optional*, defaults to `True`): + Whether to add a distillation head. + use_layer_scale (`bool`, *optional*, defaults to `True`): + Whether to scale outputs from token mixers. + layer_scale_init_value (`float`, *optional*, defaults to 1e-5): + Factor by which outputs from token mixers are scaled. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + image_size (`int`, *optional*, defaults to `224`): + The size (resolution) of each image. 
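+        batch_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the batch normalization layers.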
+ + Example: + + ```python + >>> from transformers import EfficientFormerConfig, EfficientFormerModel + + >>> # Initializing a EfficientFormer efficientformer-l1 style configuration + >>> configuration = EfficientFormerConfig() + + >>> # Initializing a EfficientFormerModel (with random weights) from the efficientformer-l3 style configuration + >>> model = EfficientFormerModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "efficientformer" + + def __init__( + self, + depths: list[int] = [3, 2, 6, 4], + hidden_sizes: list[int] = [48, 96, 224, 448], + downsamples: list[bool] = [True, True, True, True], + dim: int = 448, + key_dim: int = 32, + attention_ratio: int = 4, + resolution: int = 7, + num_hidden_layers: int = 5, + num_attention_heads: int = 8, + mlp_expansion_ratio: int = 4, + hidden_dropout_prob: float = 0.0, + patch_size: int = 16, + num_channels: int = 3, + pool_size: int = 3, + downsample_patch_size: int = 3, + downsample_stride: int = 2, + downsample_pad: int = 1, + drop_path_rate: float = 0.0, + num_meta3d_blocks: int = 1, + distillation: bool = True, + use_layer_scale: bool = True, + layer_scale_init_value: float = 1e-5, + hidden_act: str = "gelu", + initializer_range: float = 0.02, + layer_norm_eps: float = 1e-12, + image_size: int = 224, + batch_norm_eps: float = 1e-05, + **kwargs, + ) -> None: + super().__init__(**kwargs) + + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.hidden_sizes = hidden_sizes + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.patch_size = patch_size + self.num_channels = num_channels + self.depths = depths + self.mlp_expansion_ratio = mlp_expansion_ratio + self.downsamples = downsamples + self.dim = dim + self.key_dim = key_dim + self.attention_ratio = attention_ratio + self.resolution = resolution + self.pool_size = pool_size + self.downsample_patch_size = downsample_patch_size + self.downsample_stride = downsample_stride + self.downsample_pad = downsample_pad + self.drop_path_rate = drop_path_rate + self.num_meta3d_blocks = num_meta3d_blocks + self.distillation = distillation + self.use_layer_scale = use_layer_scale + self.layer_scale_init_value = layer_scale_init_value + self.image_size = image_size + self.batch_norm_eps = batch_norm_eps + + +__all__ = [ + "EfficientFormerConfig", +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/image_processing_efficientformer.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/image_processing_efficientformer.py new file mode 100644 index 0000000000000000000000000000000000000000..a2dd1281e920fdea3b4c51ef79c09bc86c605921 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/image_processing_efficientformer.py @@ -0,0 +1,324 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Image processor class for EfficientFormer.""" + +from typing import Optional, Union + +import numpy as np + +from ....image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ....image_transforms import ( + get_resize_output_image_size, + resize, + to_channel_dimension_format, +) +from ....image_utils import ( + IMAGENET_DEFAULT_MEAN, + IMAGENET_DEFAULT_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + infer_channel_dimension_format, + is_batched, + is_scaled_image, + to_numpy_array, + valid_images, + validate_kwargs, + validate_preprocess_arguments, +) +from ....utils import TensorType, logging + + +logger = logging.get_logger(__name__) + + +class EfficientFormerImageProcessor(BaseImageProcessor): + r""" + Constructs a EfficientFormer image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `(size["height"], + size["width"])`. Can be overridden by the `do_resize` parameter in the `preprocess` method. + size (`dict`, *optional*, defaults to `{"height": 224, "width": 224}`): + Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess` + method. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): + Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the + `preprocess` method. + do_center_crop (`bool`, *optional*, defaults to `True`): + Whether to center crop the image to the specified `crop_size`. Can be overridden by `do_center_crop` in the + `preprocess` method. + crop_size (`dict[str, int]` *optional*, defaults to 224): + Size of the output image after applying `center_crop`. Can be overridden by `crop_size` in the `preprocess` + method. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale` + parameter in the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the + `preprocess` method. + do_normalize: + Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` + method. + image_mean (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. 
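+
+        Example (a minimal illustrative sketch; the random NumPy image is a stand-in input and the deep import path
+        simply mirrors this file's location):
+
+        ```python
+        >>> import numpy as np
+        >>> from transformers.models.deprecated.efficientformer.image_processing_efficientformer import (
+        ...     EfficientFormerImageProcessor,
+        ... )
+
+        >>> image_processor = EfficientFormerImageProcessor()  # defaults: resize and center crop to 224x224, ImageNet mean/std
+        >>> image = np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8)
+        >>> inputs = image_processor(images=image, return_tensors="pt")
+        >>> inputs["pixel_values"].shape
+        torch.Size([1, 3, 224, 224])
+        ```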
+ """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: Optional[dict[str, int]] = None, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_center_crop: bool = True, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + crop_size: Optional[dict[str, int]] = None, + do_normalize: bool = True, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + **kwargs, + ) -> None: + super().__init__(**kwargs) + size = size if size is not None else {"height": 224, "width": 224} + size = get_size_dict(size) + crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224} + crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size") + + self.do_resize = do_resize + self.do_rescale = do_rescale + self.do_normalize = do_normalize + self.do_center_crop = do_center_crop + self.crop_size = crop_size + self.size = size + self.resample = resample + self.rescale_factor = rescale_factor + self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN + self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD + self._valid_processor_keys = [ + "images", + "do_resize", + "size", + "resample", + "do_center_crop", + "crop_size", + "do_rescale", + "rescale_factor", + "do_normalize", + "image_mean", + "image_std", + "return_tensors", + "data_format", + "input_data_format", + ] + + def resize( + self, + image: np.ndarray, + size: dict[str, int], + resample: PILImageResampling = PILImageResampling.BILINEAR, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize an image to `(size["height"], size["width"])`. + + Args: + image (`np.ndarray`): + Image to resize. + size (`dict[str, int]`): + Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image. + resample: + `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`. + data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. + + Returns: + `np.ndarray`: The resized image. + """ + size = get_size_dict(size) + + if "shortest_edge" in size: + size = get_resize_output_image_size( + image, size=size["shortest_edge"], default_to_square=False, input_data_format=input_data_format + ) + # size = get_resize_output_image_size(image, size["shortest_edge"], size["longest_edge"]) + elif "height" in size and "width" in size: + size = (size["height"], size["width"]) + else: + raise ValueError(f"Size must contain 'height' and 'width' keys or 'shortest_edge' key. 
Got {size.keys()}") + return resize( + image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs + ) + + def preprocess( + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Optional[dict[str, int]] = None, + resample: Optional[PILImageResampling] = None, + do_center_crop: Optional[bool] = None, + crop_size: Optional[int] = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> BatchFeature: + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_rescale=False`. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`dict[str, int]`, *optional*, defaults to `self.size`): + Dictionary in the format `{"height": h, "width": w}` specifying the size of the output image after + resizing. + resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`): + `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has + an effect if `do_resize` is set to `True`. + do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`): + Whether to center crop the image. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image values between [0 - 1]. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + crop_size (`dict[str, int]`, *optional*, defaults to `self.crop_size`): + Size of the center crop. Only has an effect if `do_center_crop` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`): + Image mean to use if `do_normalize` is set to `True`. + image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to use if `do_normalize` is set to `True`. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. 
If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + """ + do_resize = do_resize if do_resize is not None else self.do_resize + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop + crop_size = crop_size if crop_size is not None else self.crop_size + crop_size = get_size_dict(crop_size, param_name="crop_size", default_to_square=True) + resample = resample if resample is not None else self.resample + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + + size = size if size is not None else self.size + size_dict = get_size_dict(size) + + validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_processor_keys) + + if not is_batched(images): + images = [images] + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + validate_preprocess_arguments( + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_center_crop=do_center_crop, + crop_size=crop_size, + do_resize=do_resize, + size=size, + resample=resample, + ) + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if do_rescale and is_scaled_image(images[0]): + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. 
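+            # (note) the format is inferred from images[0] only; pass `input_data_format` explicitly when a batch
+            # could mix channels-first and channels-last inputs.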
+ input_data_format = infer_channel_dimension_format(images[0]) + + if do_resize: + images = [ + self.resize(image=image, size=size_dict, resample=resample, input_data_format=input_data_format) + for image in images + ] + + if do_center_crop: + images = [ + self.center_crop(image=image, size=crop_size, input_data_format=input_data_format) for image in images + ] + + if do_rescale: + images = [ + self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) + for image in images + ] + + if do_normalize: + images = [ + self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format) + for image in images + ] + + images = [ + to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images + ] + + data = {"pixel_values": images} + return BatchFeature(data=data, tensor_type=return_tensors) + + +__all__ = ["EfficientFormerImageProcessor"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/modeling_efficientformer.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/modeling_efficientformer.py new file mode 100644 index 0000000000000000000000000000000000000000..d35d3e82c007c6c6cf9326e1c933df34c6ae41ba --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/modeling_efficientformer.py @@ -0,0 +1,786 @@ +# coding=utf-8 +# Copyright 2022 Snapchat Research and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch EfficientFormer model.""" + +import itertools +from dataclasses import dataclass +from typing import Optional, Union + +import torch +from torch import nn + +from ....activations import ACT2FN +from ....modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput +from ....modeling_utils import PreTrainedModel +from ....utils import ( + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, +) +from .configuration_efficientformer import EfficientFormerConfig + + +logger = logging.get_logger(__name__) + +# General docstring +_CONFIG_FOR_DOC = "EfficientFormerConfig" + +# Base docstring +_CHECKPOINT_FOR_DOC = "snap-research/efficientformer-l1-300" +_EXPECTED_OUTPUT_SHAPE = [1, 49, 448] + +# Image classification docstring +_IMAGE_CLASS_CHECKPOINT = "snap-research/efficientformer-l1-300" +_IMAGE_CLASS_EXPECTED_OUTPUT = "Egyptian cat" + + +class EfficientFormerPatchEmbeddings(nn.Module): + """ + This class performs downsampling between two stages. 
For the input tensor with the shape [batch_size, num_channels, + height, width] it produces output tensor with the shape [batch_size, num_channels, height/stride, width/stride] + """ + + def __init__(self, config: EfficientFormerConfig, num_channels: int, embed_dim: int, apply_norm: bool = True): + super().__init__() + self.num_channels = num_channels + + self.projection = nn.Conv2d( + num_channels, + embed_dim, + kernel_size=config.downsample_patch_size, + stride=config.downsample_stride, + padding=config.downsample_pad, + ) + self.norm = nn.BatchNorm2d(embed_dim, eps=config.batch_norm_eps) if apply_norm else nn.Identity() + + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + batch_size, num_channels, height, width = pixel_values.shape + if num_channels != self.num_channels: + raise ValueError( + "Make sure that the channel dimension of the pixel values match with the one set in the configuration." + ) + + embeddings = self.projection(pixel_values) + embeddings = self.norm(embeddings) + + return embeddings + + +class EfficientFormerSelfAttention(nn.Module): + def __init__(self, dim: int, key_dim: int, num_heads: int, attention_ratio: int, resolution: int): + super().__init__() + + self.num_heads = num_heads + self.key_dim = key_dim + self.attention_ratio = attention_ratio + self.scale = key_dim**-0.5 + self.total_key_dim = key_dim * num_heads + self.expanded_key_dim = int(attention_ratio * key_dim) + self.total_expanded_key_dim = int(self.expanded_key_dim * num_heads) + hidden_size = self.total_expanded_key_dim + self.total_key_dim * 2 + self.qkv = nn.Linear(dim, hidden_size) + self.projection = nn.Linear(self.total_expanded_key_dim, dim) + points = list(itertools.product(range(resolution), range(resolution))) + num_points = len(points) + attention_offsets = {} + idxs = [] + for point_1 in points: + for point_2 in points: + offset = (abs(point_1[0] - point_2[0]), abs(point_1[1] - point_2[1])) + if offset not in attention_offsets: + attention_offsets[offset] = len(attention_offsets) + idxs.append(attention_offsets[offset]) + self.attention_biases = torch.nn.Parameter(torch.zeros(num_heads, len(attention_offsets))) + self.register_buffer("attention_bias_idxs", torch.LongTensor(idxs).view(num_points, num_points)) + + @torch.no_grad() + def train(self, mode=True): + super().train(mode) + if mode and hasattr(self, "ab"): + del self.ab + else: + self.ab = self.attention_biases[:, self.attention_bias_idxs] + + def forward(self, hidden_states: torch.Tensor, output_attentions: bool = False) -> tuple[torch.Tensor]: + batch_size, sequence_length, num_channels = hidden_states.shape + qkv = self.qkv(hidden_states) + query_layer, key_layer, value_layer = qkv.reshape(batch_size, sequence_length, self.num_heads, -1).split( + [self.key_dim, self.key_dim, self.expanded_key_dim], dim=3 + ) + query_layer = query_layer.permute(0, 2, 1, 3) + key_layer = key_layer.permute(0, 2, 1, 3) + value_layer = value_layer.permute(0, 2, 1, 3) + + # set `model.to(torch_device)` won't change `self.ab.device`, if there is no follow-up `train` or `eval` call. + # Let's do it manually here, so users won't have to do this everytime. 
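+        # (note) `self.ab` is the attention-bias lookup cached by the `train(False)` / `eval()` override above;
+        # at inference we only need to move that cache onto the same device as `attention_biases`.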
+ if not self.training: + self.ab = self.ab.to(self.attention_biases.device) + attention_probs = (torch.matmul(query_layer, key_layer.transpose(-2, -1))) * self.scale + ( + self.attention_biases[:, self.attention_bias_idxs] if self.training else self.ab + ) + + attention_probs = attention_probs.softmax(dim=-1) + + context_layer = torch.matmul(attention_probs, value_layer).transpose(1, 2) + context_layer = context_layer.reshape(batch_size, sequence_length, self.total_expanded_key_dim) + context_layer = self.projection(context_layer) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + +class EfficientFormerConvStem(nn.Module): + def __init__(self, config: EfficientFormerConfig, out_channels: int): + super().__init__() + + self.convolution1 = nn.Conv2d(config.num_channels, out_channels // 2, kernel_size=3, stride=2, padding=1) + self.batchnorm_before = nn.BatchNorm2d(out_channels // 2, eps=config.batch_norm_eps) + + self.convolution2 = nn.Conv2d(out_channels // 2, out_channels, kernel_size=3, stride=2, padding=1) + self.batchnorm_after = nn.BatchNorm2d(out_channels, eps=config.batch_norm_eps) + + self.activation = nn.ReLU() + + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + features = self.batchnorm_before(self.convolution1(pixel_values)) + features = self.activation(features) + features = self.batchnorm_after(self.convolution2(features)) + features = self.activation(features) + + return features + + +class EfficientFormerPooling(nn.Module): + def __init__(self, pool_size: int): + super().__init__() + self.pool = nn.AvgPool2d(pool_size, stride=1, padding=pool_size // 2, count_include_pad=False) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + output = self.pool(hidden_states) - hidden_states + return output + + +class EfficientFormerDenseMlp(nn.Module): + def __init__( + self, + config: EfficientFormerConfig, + in_features: int, + hidden_features: Optional[int] = None, + out_features: Optional[int] = None, + ): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + + self.linear_in = nn.Linear(in_features, hidden_features) + self.activation = ACT2FN[config.hidden_act] + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.linear_out = nn.Linear(hidden_features, out_features) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.linear_in(hidden_states) + hidden_states = self.activation(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.linear_out(hidden_states) + hidden_states = self.dropout(hidden_states) + + return hidden_states + + +class EfficientFormerConvMlp(nn.Module): + def __init__( + self, + config: EfficientFormerConfig, + in_features: int, + hidden_features: Optional[int] = None, + out_features: Optional[int] = None, + drop: float = 0.0, + ): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + + self.convolution1 = nn.Conv2d(in_features, hidden_features, 1) + self.activation = ACT2FN[config.hidden_act] + self.convolution2 = nn.Conv2d(hidden_features, out_features, 1) + self.dropout = nn.Dropout(drop) + + self.batchnorm_before = nn.BatchNorm2d(hidden_features, eps=config.batch_norm_eps) + self.batchnorm_after = nn.BatchNorm2d(out_features, eps=config.batch_norm_eps) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + hidden_state = 
self.convolution1(hidden_state) + hidden_state = self.batchnorm_before(hidden_state) + + hidden_state = self.activation(hidden_state) + hidden_state = self.dropout(hidden_state) + hidden_state = self.convolution2(hidden_state) + + hidden_state = self.batchnorm_after(hidden_state) + hidden_state = self.dropout(hidden_state) + + return hidden_state + + +def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor: + """ + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks, + however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the + layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the + argument. + """ + if drop_prob == 0.0 or not training: + return input + keep_prob = 1 - drop_prob + shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device) + random_tensor.floor_() # binarize + output = input.div(keep_prob) * random_tensor + return output + + +class EfficientFormerDropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob: Optional[float] = None) -> None: + super().__init__() + self.drop_prob = drop_prob + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + return drop_path(hidden_states, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return f"p={self.drop_prob}" + + +class EfficientFormerFlat(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor]: + hidden_states = hidden_states.flatten(2).transpose(1, 2) + return hidden_states + + +class EfficientFormerMeta3D(nn.Module): + def __init__(self, config: EfficientFormerConfig, dim: int, drop_path: float = 0.0): + super().__init__() + + self.token_mixer = EfficientFormerSelfAttention( + dim=config.dim, + key_dim=config.key_dim, + num_heads=config.num_attention_heads, + attention_ratio=config.attention_ratio, + resolution=config.resolution, + ) + + self.layernorm1 = nn.LayerNorm(dim, eps=config.layer_norm_eps) + self.layernorm2 = nn.LayerNorm(dim, eps=config.layer_norm_eps) + + mlp_hidden_dim = int(dim * config.mlp_expansion_ratio) + self.mlp = EfficientFormerDenseMlp(config, in_features=dim, hidden_features=mlp_hidden_dim) + + self.drop_path = EfficientFormerDropPath(drop_path) if drop_path > 0.0 else nn.Identity() + self.use_layer_scale = config.use_layer_scale + if config.use_layer_scale: + self.layer_scale_1 = nn.Parameter(config.layer_scale_init_value * torch.ones(dim), requires_grad=True) + self.layer_scale_2 = nn.Parameter(config.layer_scale_init_value * torch.ones(dim), requires_grad=True) + + def forward(self, hidden_states: torch.Tensor, output_attentions: bool = False) -> tuple[torch.Tensor]: + self_attention_outputs = self.token_mixer(self.layernorm1(hidden_states), output_attentions) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + if self.use_layer_scale: + layer_output = hidden_states + 
self.drop_path( + self.layer_scale_1.unsqueeze(0).unsqueeze(0) * attention_output + ) + layer_output = layer_output + self.drop_path( + self.layer_scale_2.unsqueeze(0).unsqueeze(0) * self.mlp(self.layernorm2(layer_output)) + ) + else: + layer_output = hidden_states + self.drop_path(attention_output) + layer_output = layer_output + self.drop_path(self.mlp(self.layernorm2(layer_output))) + + outputs = (layer_output,) + outputs + + return outputs + + +class EfficientFormerMeta3DLayers(nn.Module): + def __init__(self, config: EfficientFormerConfig): + super().__init__() + drop_paths = [ + config.drop_path_rate * (block_idx + sum(config.depths[:-1])) + for block_idx in range(config.num_meta3d_blocks) + ] + self.blocks = nn.ModuleList( + [EfficientFormerMeta3D(config, config.hidden_sizes[-1], drop_path=drop_path) for drop_path in drop_paths] + ) + + def forward(self, hidden_states: torch.Tensor, output_attentions: bool = False) -> tuple[torch.Tensor]: + all_attention_outputs = () if output_attentions else None + + for layer_module in self.blocks: + if isinstance(hidden_states, tuple): + hidden_states = hidden_states[0] + + hidden_states = layer_module(hidden_states, output_attentions) + + if output_attentions: + all_attention_outputs = all_attention_outputs + (hidden_states[1],) + + if output_attentions: + outputs = (hidden_states[0],) + all_attention_outputs + return outputs + + return hidden_states + + +class EfficientFormerMeta4D(nn.Module): + def __init__(self, config: EfficientFormerConfig, dim: int, drop_path: float = 0.0): + super().__init__() + pool_size = config.pool_size if config.pool_size is not None else 3 + self.token_mixer = EfficientFormerPooling(pool_size=pool_size) + mlp_hidden_dim = int(dim * config.mlp_expansion_ratio) + self.mlp = EfficientFormerConvMlp( + config, in_features=dim, hidden_features=mlp_hidden_dim, drop=config.hidden_dropout_prob + ) + + self.drop_path = EfficientFormerDropPath(drop_path) if drop_path > 0.0 else nn.Identity() + self.use_layer_scale = config.use_layer_scale + if config.use_layer_scale: + self.layer_scale_1 = nn.Parameter(config.layer_scale_init_value * torch.ones(dim), requires_grad=True) + self.layer_scale_2 = nn.Parameter(config.layer_scale_init_value * torch.ones(dim), requires_grad=True) + + def forward(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor]: + outputs = self.token_mixer(hidden_states) + + if self.use_layer_scale: + layer_output = hidden_states + self.drop_path(self.layer_scale_1.unsqueeze(-1).unsqueeze(-1) * outputs) + + layer_output = layer_output + self.drop_path( + self.layer_scale_2.unsqueeze(-1).unsqueeze(-1) * self.mlp(layer_output) + ) + else: + layer_output = hidden_states + self.drop_path(outputs) + layer_output = layer_output + self.drop_path(self.mlp(layer_output)) + + return layer_output + + +class EfficientFormerMeta4DLayers(nn.Module): + def __init__(self, config: EfficientFormerConfig, stage_idx: int): + super().__init__() + num_layers = ( + config.depths[stage_idx] if stage_idx != -1 else config.depths[stage_idx] - config.num_meta3d_blocks + ) + drop_paths = [ + config.drop_path_rate * (block_idx + sum(config.depths[:stage_idx])) for block_idx in range(num_layers) + ] + + self.blocks = nn.ModuleList( + [ + EfficientFormerMeta4D(config, config.hidden_sizes[stage_idx], drop_path=drop_path) + for drop_path in drop_paths + ] + ) + + def forward(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor]: + for layer_module in self.blocks: + hidden_states = layer_module(hidden_states) + return hidden_states + + 
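+
+# --- Illustrative note (not part of the original file) ---------------------------------------------------------
+# The stochastic-depth helper above (`drop_path` / `EfficientFormerDropPath`) is what the Meta3D and Meta4D blocks
+# wrap in `self.drop_path`. A minimal sketch of its behaviour, assuming only PyTorch and the `drop_path` function
+# defined in this module:
+#
+#     x = torch.ones(4, 3, 8)                        # (batch, tokens, channels); whole samples are masked, not elements
+#     y = drop_path(x, drop_prob=0.5, training=True)
+#     y[:, 0, 0]                                     # each entry is 0.0 (dropped) or 2.0 (kept, rescaled by 1 / keep_prob)
+#     drop_path(x, drop_prob=0.5, training=False)    # inference: the input is returned unchanged
+# ----------------------------------------------------------------------------------------------------------------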
+class EfficientFormerIntermediateStage(nn.Module): + def __init__(self, config: EfficientFormerConfig, index: int): + super().__init__() + self.meta4D_layers = EfficientFormerMeta4DLayers(config, index) + + def forward(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor]: + hidden_states = self.meta4D_layers(hidden_states) + return hidden_states + + +class EfficientFormerLastStage(nn.Module): + def __init__(self, config: EfficientFormerConfig): + super().__init__() + self.meta4D_layers = EfficientFormerMeta4DLayers(config, -1) + self.flat = EfficientFormerFlat() + self.meta3D_layers = EfficientFormerMeta3DLayers(config) + + def forward(self, hidden_states: torch.Tensor, output_attentions: bool = False) -> tuple[torch.Tensor]: + hidden_states = self.meta4D_layers(hidden_states) + hidden_states = self.flat(hidden_states) + hidden_states = self.meta3D_layers(hidden_states, output_attentions) + + return hidden_states + + +class EfficientFormerEncoder(nn.Module): + def __init__(self, config: EfficientFormerConfig): + super().__init__() + self.config = config + num_intermediate_stages = len(config.depths) - 1 + downsamples = [ + config.downsamples[i] or config.hidden_sizes[i] != config.hidden_sizes[i + 1] + for i in range(num_intermediate_stages) + ] + intermediate_stages = [] + + for i in range(num_intermediate_stages): + intermediate_stages.append(EfficientFormerIntermediateStage(config, i)) + if downsamples[i]: + intermediate_stages.append( + EfficientFormerPatchEmbeddings(config, config.hidden_sizes[i], config.hidden_sizes[i + 1]) + ) + + self.intermediate_stages = nn.ModuleList(intermediate_stages) + self.last_stage = EfficientFormerLastStage(config) + + def forward( + self, + hidden_states: torch.Tensor, + output_hidden_states: bool = False, + output_attentions: bool = False, + return_dict: bool = True, + ) -> BaseModelOutput: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + for layer_module in self.intermediate_stages: + hidden_states = layer_module(hidden_states) + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_output = self.last_stage(hidden_states, output_attentions=output_attentions) + + if output_attentions: + all_self_attentions = all_self_attentions + layer_output[1:] + + if output_hidden_states: + all_hidden_states = all_hidden_states + (layer_output[0],) + + if not return_dict: + return tuple(v for v in [layer_output[0], all_hidden_states, all_self_attentions] if v is not None) + + return BaseModelOutput( + last_hidden_state=layer_output[0], + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class EfficientFormerPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config: EfficientFormerConfig + base_model_prefix = "efficientformer" + main_input_name = "pixel_values" + supports_gradient_checkpointing = False + + def _init_weights(self, module: nn.Module): + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d)): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +EFFICIENTFORMER_START_DOCSTRING = r""" + This model is a PyTorch [nn.Module](https://pytorch.org/docs/stable/nn.html#nn.Module) subclass. Use it as a + regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. + + Parameters: + config ([`EfficientFormerConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +EFFICIENTFORMER_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`ViTImageProcessor`]. See + [`ViTImageProcessor.preprocess`] for details. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
+""" + + +@add_start_docstrings( + "The bare EfficientFormer Model transformer outputting raw hidden-states without any specific head on top.", + EFFICIENTFORMER_START_DOCSTRING, +) +class EfficientFormerModel(EfficientFormerPreTrainedModel): + def __init__(self, config: EfficientFormerConfig): + super().__init__(config) + self.config = config + _no_split_modules = ["EfficientFormerMeta4D"] + + self.patch_embed = EfficientFormerConvStem(config, config.hidden_sizes[0]) + self.encoder = EfficientFormerEncoder(config) + self.layernorm = nn.LayerNorm(config.hidden_sizes[-1], eps=config.layer_norm_eps) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(EFFICIENTFORMER_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPooling, + config_class=_CONFIG_FOR_DOC, + modality="vision", + expected_output=_EXPECTED_OUTPUT_SHAPE, + ) + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, BaseModelOutput]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + embedding_output = self.patch_embed(pixel_values) + encoder_outputs = self.encoder( + embedding_output, output_attentions=output_attentions, output_hidden_states=output_hidden_states + ) + + sequence_output = encoder_outputs[0] + sequence_output = self.layernorm(sequence_output) + + if not return_dict: + head_outputs = (sequence_output,) + return head_outputs + encoder_outputs[1:] + + return BaseModelOutput( + last_hidden_state=sequence_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +@add_start_docstrings( + """ + EfficientFormer Model transformer with an image classification head on top (a linear layer on top of the final + hidden state of the [CLS] token) e.g. for ImageNet. 
+ """, + EFFICIENTFORMER_START_DOCSTRING, +) +class EfficientFormerForImageClassification(EfficientFormerPreTrainedModel): + def __init__(self, config: EfficientFormerConfig): + super().__init__(config) + + self.num_labels = config.num_labels + self.efficientformer = EfficientFormerModel(config) + + # Classifier head + self.classifier = ( + nn.Linear(config.hidden_sizes[-1], config.num_labels) if config.num_labels > 0 else nn.Identity() + ) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(EFFICIENTFORMER_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_IMAGE_CLASS_CHECKPOINT, + output_type=ImageClassifierOutput, + config_class=_CONFIG_FOR_DOC, + expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT, + ) + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, ImageClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the image classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.efficientformer( + pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.classifier(sequence_output.mean(-2)) + + loss = None + if labels is not None: + loss = self.loss_function(labels, logits, self.config) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return ImageClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@dataclass +class EfficientFormerForImageClassificationWithTeacherOutput(ModelOutput): + """ + Output type of [`EfficientFormerForImageClassificationWithTeacher`]. + + Args: + logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`): + Prediction scores as the average of the cls_logits and distillation logits. + cls_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`): + Prediction scores of the classification head (i.e. the linear layer on top of the final hidden state of the + class token). + distillation_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`): + Prediction scores of the distillation head (i.e. the linear layer on top of the final hidden state of the + distillation token). + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer + plus the initial embedding outputs. 
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in + the self-attention heads. + """ + + logits: Optional[torch.FloatTensor] = None + cls_logits: Optional[torch.FloatTensor] = None + distillation_logits: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + + +@add_start_docstrings( + """ + EfficientFormer Model transformer with image classification heads on top (a linear layer on top of the final hidden + state of the [CLS] token and a linear layer on top of the final hidden state of the distillation token) e.g. for + ImageNet. + + + + This model supports inference-only. Fine-tuning with distillation (i.e. with a teacher) is not yet + supported. + + + """, + EFFICIENTFORMER_START_DOCSTRING, +) +class EfficientFormerForImageClassificationWithTeacher(EfficientFormerPreTrainedModel): + def __init__(self, config: EfficientFormerConfig): + super().__init__(config) + + self.num_labels = config.num_labels + self.efficientformer = EfficientFormerModel(config) + + # Classifier head + self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity() + # Distillation head + self.distillation_classifier = ( + nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity() + ) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(EFFICIENTFORMER_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_IMAGE_CLASS_CHECKPOINT, + output_type=EfficientFormerForImageClassificationWithTeacherOutput, + config_class=_CONFIG_FOR_DOC, + expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT, + ) + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, EfficientFormerForImageClassificationWithTeacherOutput]: + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + outputs = self.efficientformer( + pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + cls_logits = self.classifier(sequence_output.mean(-2)) + distillation_logits = self.distillation_classifier(sequence_output.mean(-2)) + + # during inference, return the average of both classifier predictions + logits = (cls_logits + distillation_logits) / 2 + + if not return_dict: + output = (logits, cls_logits, distillation_logits) + outputs[1:] + return output + + return EfficientFormerForImageClassificationWithTeacherOutput( + logits=logits, + cls_logits=cls_logits, + distillation_logits=distillation_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +__all__ = [ + "EfficientFormerForImageClassification", + "EfficientFormerForImageClassificationWithTeacher", + "EfficientFormerModel", + "EfficientFormerPreTrainedModel", +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/modeling_tf_efficientformer.py 
b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/modeling_tf_efficientformer.py new file mode 100644 index 0000000000000000000000000000000000000000..643097e79c3eab47b62f7e7c4781b2b276418176 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/modeling_tf_efficientformer.py @@ -0,0 +1,1198 @@ +# coding=utf-8 +# Copyright 2023 Snapchat Research and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""TensorFlow EfficientFormer model.""" + +import itertools +from dataclasses import dataclass +from typing import Optional, Union + +import tensorflow as tf + +from ....activations_tf import ACT2FN +from ....modeling_tf_outputs import ( + TFBaseModelOutput, + TFBaseModelOutputWithPooling, + TFImageClassifierOutput, +) +from ....modeling_tf_utils import ( + TFPreTrainedModel, + TFSequenceClassificationLoss, + get_initializer, + keras, + keras_serializable, + unpack_inputs, +) +from ....tf_utils import shape_list, stable_softmax +from ....utils import ( + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, +) +from .configuration_efficientformer import EfficientFormerConfig + + +logger = logging.get_logger(__name__) + +# General docstring +_CONFIG_FOR_DOC = "EfficientFormerConfig" + +# Base docstring +_CHECKPOINT_FOR_DOC = "snap-research/efficientformer-l1-300" +_EXPECTED_OUTPUT_SHAPE = [1, 49, 448] + +# Image classification docstring +_IMAGE_CLASS_CHECKPOINT = "snap-research/efficientformer-l1-300" +_IMAGE_CLASS_EXPECTED_OUTPUT = "LABEL_281" + + +class TFEfficientFormerPatchEmbeddings(keras.layers.Layer): + """ + This class performs downsampling between two stages. 
For the input tensor with the shape [batch_size, num_channels, + height, width] it produces output tensor with the shape [batch_size, num_channels, height/stride, width/stride] + """ + + def __init__( + self, config: EfficientFormerConfig, num_channels: int, embed_dim: int, apply_norm: bool = True, **kwargs + ) -> None: + super().__init__(**kwargs) + self.num_channels = num_channels + + self.padding = keras.layers.ZeroPadding2D(padding=config.downsample_pad) + self.projection = keras.layers.Conv2D( + filters=embed_dim, + kernel_size=config.downsample_patch_size, + strides=config.downsample_stride, + padding="valid", + name="projection", + ) + # Use same default momentum and epsilon as PyTorch equivalent for BatchNormalization + self.norm = ( + keras.layers.BatchNormalization(axis=-1, epsilon=config.batch_norm_eps, momentum=0.9, name="norm") + if apply_norm + else tf.identity + ) + self.embed_dim = embed_dim + + def call(self, pixel_values: tf.Tensor, training: bool = False) -> tf.Tensor: + tf.debugging.assert_shapes( + [(pixel_values, (..., None, None, self.num_channels))], + message="Make sure that the channel dimension of the pixel values match with the one set in the configuration.", + ) + embeddings = self.projection(self.padding(pixel_values)) + embeddings = self.norm(embeddings, training=training) + return embeddings + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "projection", None) is not None: + with tf.name_scope(self.projection.name): + self.projection.build([None, None, None, self.num_channels]) + if getattr(self, "norm", None) is not None: + if hasattr(self.norm, "name"): + with tf.name_scope(self.norm.name): + self.norm.build([None, None, None, self.embed_dim]) + + +class TFEfficientFormerSelfAttention(keras.layers.Layer): + def __init__( + self, + dim: int, + key_dim: int, + num_heads: int, + attention_ratio: int, + resolution: int, + config: EfficientFormerConfig, + **kwargs, + ): + super().__init__(**kwargs) + + self.num_heads = num_heads + self.key_dim = key_dim + self.attention_ratio = attention_ratio + self.scale = key_dim**-0.5 + self.total_key_dim = key_dim * num_heads + self.expanded_key_dim = int(attention_ratio * key_dim) + self.total_expanded_key_dim = int(self.expanded_key_dim * num_heads) + hidden_size = self.total_expanded_key_dim + self.total_key_dim * 2 + + self.qkv = keras.layers.Dense( + units=hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="qkv" + ) + self.projection = keras.layers.Dense( + units=dim, kernel_initializer=get_initializer(config.initializer_range), name="projection" + ) + self.resolution = resolution + self.dim = dim + + def build(self, input_shape: tf.TensorShape) -> None: + points = list(itertools.product(range(self.resolution), range(self.resolution))) + num_points = len(points) + attention_offsets = {} + + idxs = [] + + for point_1 in points: + for point_2 in points: + offset = (abs(point_1[0] - point_2[0]), abs(point_1[1] - point_2[1])) + if offset not in attention_offsets: + attention_offsets[offset] = len(attention_offsets) + idxs.append(attention_offsets[offset]) + + self.attention_biases = self.add_weight( + shape=(self.num_heads, len(attention_offsets)), + initializer=keras.initializers.zeros(), + trainable=True, + name="attention_biases", + ) + self.attention_bias_idxs = self.add_weight( + shape=(num_points, num_points), + trainable=False, + dtype=tf.int32, + name="attention_bias_idxs", + ) + + 
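# Illustrative sketch (annotation, not part of the patch): for a toy resolution of 2,
# the offset bookkeeping built just above produces one trainable bias per *unique*
# absolute offset, shared by every pair of positions at that offset; the (num_points,
# num_points) index matrix assigned below is what `tf.gather` uses in `call` to expand
# those shared biases back to a full position-by-position bias map.  Values here are
# toy assumptions; the real resolution comes from the config.
import itertools

resolution = 2  # toy value
points = list(itertools.product(range(resolution), range(resolution)))
offsets, idxs = {}, []
for p1 in points:
    for p2 in points:
        off = (abs(p1[0] - p2[0]), abs(p1[1] - p2[1]))
        idxs.append(offsets.setdefault(off, len(offsets)))
# offsets -> {(0, 0): 0, (0, 1): 1, (1, 0): 2, (1, 1): 3}
# idxs reshaped to (4, 4) indexes one shared bias per relative offset.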
self.attention_bias_idxs.assign(tf.reshape(tf.cast(idxs, dtype=tf.int32), (num_points, num_points))) + + if self.built: + return + self.built = True + if getattr(self, "qkv", None) is not None: + with tf.name_scope(self.qkv.name): + self.qkv.build([None, None, self.dim]) + if getattr(self, "projection", None) is not None: + with tf.name_scope(self.projection.name): + self.projection.build([None, None, self.total_expanded_key_dim]) + + def call( + self, hidden_states: tf.Tensor, output_attentions: bool = False, training: bool = False + ) -> tuple[tf.Tensor]: + batch_size, sequence_length, *_ = shape_list(hidden_states) + qkv = self.qkv(inputs=hidden_states) + + query_layer, key_layer, value_layer = tf.split( + tf.reshape(tensor=qkv, shape=(batch_size, sequence_length, self.num_heads, -1)), + num_or_size_splits=[self.key_dim, self.key_dim, self.expanded_key_dim], + axis=3, + ) + + query_layer = tf.transpose(query_layer, perm=[0, 2, 1, 3]) + key_layer = tf.transpose(key_layer, perm=[0, 2, 1, 3]) + value_layer = tf.transpose(value_layer, perm=[0, 2, 1, 3]) + + attention_probs = tf.matmul(query_layer, tf.transpose(key_layer, perm=[0, 1, 3, 2])) + scale = tf.cast(self.scale, dtype=attention_probs.dtype) + attention_probs = tf.multiply(attention_probs, scale) + + attention_biases = tf.gather(params=self.attention_biases, indices=self.attention_bias_idxs, axis=1) + attention_probs = attention_probs + attention_biases + attention_probs = stable_softmax(logits=attention_probs, axis=-1) + + context_layer = tf.matmul(attention_probs, value_layer) + context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3]) + + context_layer = tf.reshape( + tensor=context_layer, shape=(batch_size, sequence_length, self.total_expanded_key_dim) + ) + context_layer = self.projection(context_layer) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + +class TFEfficientFormerConvStem(keras.layers.Layer): + def __init__(self, config: EfficientFormerConfig, out_channels: int, **kwargs): + super().__init__(**kwargs) + + self.padding = keras.layers.ZeroPadding2D(padding=1) + self.convolution1 = keras.layers.Conv2D( + filters=out_channels // 2, kernel_size=3, strides=2, padding="valid", name="convolution1" + ) + # Use same default momentum and epsilon as PyTorch equivalent for BatchNormalization + self.batchnorm_before = keras.layers.BatchNormalization( + axis=-1, epsilon=config.batch_norm_eps, momentum=0.9, name="batchnorm_before" + ) + + self.convolution2 = keras.layers.Conv2D( + filters=out_channels, + kernel_size=3, + strides=2, + padding="valid", + name="convolution2", + ) + # Use same default momentum and epsilon as PyTorch equivalent for BatchNormalization + self.batchnorm_after = keras.layers.BatchNormalization( + axis=-1, epsilon=config.batch_norm_eps, momentum=0.9, name="batchnorm_after" + ) + + self.activation = keras.layers.Activation(activation=keras.activations.relu, name="activation") + self.out_channels = out_channels + self.config = config + + def call(self, pixel_values: tf.Tensor, training: bool = False) -> tf.Tensor: + features = self.batchnorm_before(self.convolution1(self.padding(pixel_values)), training=training) + features = self.activation(features) + features = self.batchnorm_after(self.convolution2(self.padding(features)), training=training) + features = self.activation(features) + return features + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "convolution1", None) is not None: + 
with tf.name_scope(self.convolution1.name): + self.convolution1.build([None, None, None, self.config.num_channels]) + if getattr(self, "batchnorm_before", None) is not None: + with tf.name_scope(self.batchnorm_before.name): + self.batchnorm_before.build([None, None, None, self.out_channels // 2]) + if getattr(self, "convolution2", None) is not None: + with tf.name_scope(self.convolution2.name): + self.convolution2.build([None, None, None, self.out_channels // 2]) + if getattr(self, "batchnorm_after", None) is not None: + with tf.name_scope(self.batchnorm_after.name): + self.batchnorm_after.build([None, None, None, self.out_channels]) + if getattr(self, "activation", None) is not None: + with tf.name_scope(self.activation.name): + self.activation.build(None) + + +class TFEfficientFormerPooling(keras.layers.Layer): + def __init__(self, pool_size: int, **kwargs): + super().__init__(**kwargs) + self.pool = keras.layers.AveragePooling2D(pool_size=pool_size, strides=1, padding="same") + + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + output = self.pool(hidden_states) + output = output - hidden_states + return output + + +class TFEfficientFormerDenseMlp(keras.layers.Layer): + def __init__( + self, + config: EfficientFormerConfig, + in_features: int, + hidden_features: Optional[int] = None, + out_features: Optional[int] = None, + **kwargs, + ): + super().__init__(**kwargs) + out_features = out_features or in_features + hidden_features = hidden_features or in_features + + self.linear_in = keras.layers.Dense( + units=hidden_features, kernel_initializer=get_initializer(config.initializer_range), name="linear_in" + ) + self.activation = ACT2FN[config.hidden_act] + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) + + self.linear_out = keras.layers.Dense( + units=out_features, kernel_initializer=get_initializer(config.initializer_range), name="linear_out" + ) + self.hidden_features = hidden_features + self.in_features = in_features + + def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: + hidden_states = self.linear_in(inputs=hidden_states) + hidden_states = self.activation(hidden_states) + hidden_states = self.dropout(inputs=hidden_states, training=training) + hidden_states = self.linear_out(inputs=hidden_states) + hidden_states = self.dropout(inputs=hidden_states, training=training) + + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "linear_in", None) is not None: + with tf.name_scope(self.linear_in.name): + self.linear_in.build([None, None, self.in_features]) + if getattr(self, "linear_out", None) is not None: + with tf.name_scope(self.linear_out.name): + self.linear_out.build([None, None, self.hidden_features]) + + +class TFEfficientFormerConvMlp(keras.layers.Layer): + def __init__( + self, + config: EfficientFormerConfig, + in_features: int, + hidden_features: Optional[int] = None, + out_features: Optional[int] = None, + drop: float = 0.0, + **kwargs, + ): + super().__init__(**kwargs) + out_features = out_features or in_features + hidden_features = hidden_features or in_features + + self.convolution1 = keras.layers.Conv2D( + filters=hidden_features, + kernel_size=1, + name="convolution1", + padding="valid", + ) + + self.activation = ACT2FN[config.hidden_act] + + self.convolution2 = keras.layers.Conv2D( + filters=out_features, + kernel_size=1, + name="convolution2", + padding="valid", + ) + + self.dropout = keras.layers.Dropout(rate=drop) + + # Use same default 
momentum and epsilon as PyTorch equivalent for BatchNormalization + self.batchnorm_before = keras.layers.BatchNormalization( + axis=-1, epsilon=config.batch_norm_eps, momentum=0.9, name="batchnorm_before" + ) + # Use same default momentum and epsilon as PyTorch equivalent for BatchNormalization + self.batchnorm_after = keras.layers.BatchNormalization( + axis=-1, epsilon=config.batch_norm_eps, momentum=0.9, name="batchnorm_after" + ) + self.hidden_features = hidden_features + self.in_features = in_features + self.out_features = out_features + + def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor: + hidden_state = self.convolution1(hidden_state) + hidden_state = self.batchnorm_before(hidden_state, training=training) + hidden_state = self.activation(hidden_state) + hidden_state = self.dropout(hidden_state, training=training) + hidden_state = self.convolution2(hidden_state) + hidden_state = self.batchnorm_after(hidden_state, training=training) + hidden_state = self.dropout(hidden_state, training=training) + return hidden_state + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "convolution1", None) is not None: + with tf.name_scope(self.convolution1.name): + self.convolution1.build([None, None, None, self.in_features]) + if getattr(self, "convolution2", None) is not None: + with tf.name_scope(self.convolution2.name): + self.convolution2.build([None, None, None, self.hidden_features]) + if getattr(self, "batchnorm_before", None) is not None: + with tf.name_scope(self.batchnorm_before.name): + self.batchnorm_before.build([None, None, None, self.hidden_features]) + if getattr(self, "batchnorm_after", None) is not None: + with tf.name_scope(self.batchnorm_after.name): + self.batchnorm_after.build([None, None, None, self.out_features]) + + +# Copied from transformers.models.convnext.modeling_tf_convnext.TFConvNextDropPath with ConvNext->EfficientFormer +class TFEfficientFormerDropPath(keras.layers.Layer): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
+ References: + (1) github.com:rwightman/pytorch-image-models + """ + + def __init__(self, drop_path: float, **kwargs): + super().__init__(**kwargs) + self.drop_path = drop_path + + def call(self, x: tf.Tensor, training=None): + if training: + keep_prob = 1 - self.drop_path + shape = (tf.shape(x)[0],) + (1,) * (len(tf.shape(x)) - 1) + random_tensor = keep_prob + tf.random.uniform(shape, 0, 1) + random_tensor = tf.floor(random_tensor) + return (x / keep_prob) * random_tensor + return x + + +class TFEfficientFormerFlat(keras.layers.Layer): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + def call(self, hidden_states: tf.Tensor) -> tuple[tf.Tensor]: + batch_size, _, _, in_channels = shape_list(hidden_states) + hidden_states = tf.reshape(hidden_states, shape=[batch_size, -1, in_channels]) + return hidden_states + + +class TFEfficientFormerMeta3D(keras.layers.Layer): + def __init__(self, config: EfficientFormerConfig, dim: int, drop_path: float = 0.0, **kwargs): + super().__init__(**kwargs) + + self.token_mixer = TFEfficientFormerSelfAttention( + dim=config.dim, + key_dim=config.key_dim, + num_heads=config.num_attention_heads, + attention_ratio=config.attention_ratio, + resolution=config.resolution, + name="token_mixer", + config=config, + ) + self.dim = dim + self.config = config + + self.layernorm1 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm1") + self.layernorm2 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm2") + mlp_hidden_dim = int(dim * config.mlp_expansion_ratio) + self.mlp = TFEfficientFormerDenseMlp(config, in_features=dim, hidden_features=mlp_hidden_dim, name="mlp") + + # Using `layers.Activation` instead of `tf.identity` to better control `training' behavior. 
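# Illustrative sketch (annotation, not part of the patch): during training the
# TFEfficientFormerDropPath layer defined just above keeps a sample's residual branch
# with probability keep_prob = 1 - drop_path and rescales the survivors by 1/keep_prob,
# so the branch keeps its expected value; at inference it is the identity, which is why
# a linear Activation is an adequate stand-in when drop_path == 0.  Shapes are toy
# assumptions.
import tensorflow as tf

drop_path = 0.1
keep_prob = 1.0 - drop_path
x = tf.ones((4, 49, 448))                      # toy (batch, tokens, dim) residual branch
mask = tf.floor(keep_prob + tf.random.uniform((4, 1, 1), 0, 1))
out = (x / keep_prob) * mask                   # whole samples survive (rescaled) or are zeroed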
+ self.drop_path = ( + TFEfficientFormerDropPath(drop_path) + if drop_path > 0.0 + else keras.layers.Activation("linear", name="drop_path") + ) + self.config = config + + def build(self, input_shape=None): + self.layer_scale_1 = None + self.layer_scale_2 = None + + if self.config.use_layer_scale: + self.layer_scale_1 = self.add_weight( + shape=(self.dim,), + initializer=keras.initializers.Constant(value=self.config.layer_scale_init_value), + trainable=True, + name="layer_scale_1", + ) + self.layer_scale_2 = self.add_weight( + shape=(self.dim,), + initializer=keras.initializers.Constant(value=self.config.layer_scale_init_value), + trainable=True, + name="layer_scale_2", + ) + + if self.built: + return + self.built = True + if getattr(self, "token_mixer", None) is not None: + with tf.name_scope(self.token_mixer.name): + self.token_mixer.build(None) + if getattr(self, "layernorm1", None) is not None: + with tf.name_scope(self.layernorm1.name): + self.layernorm1.build([None, None, self.dim]) + if getattr(self, "layernorm2", None) is not None: + with tf.name_scope(self.layernorm2.name): + self.layernorm2.build([None, None, self.dim]) + if getattr(self, "mlp", None) is not None: + with tf.name_scope(self.mlp.name): + self.mlp.build(None) + if getattr(self, "drop_path", None) is not None: + with tf.name_scope(self.drop_path.name): + self.drop_path.build(None) + + def call( + self, hidden_states: tf.Tensor, output_attentions: bool = False, training: bool = False + ) -> tuple[tf.Tensor]: + self_attention_outputs = self.token_mixer( + hidden_states=self.layernorm1(hidden_states, training=training), + output_attentions=output_attentions, + training=training, + ) + + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + if self.config.use_layer_scale: + layer_output = hidden_states + self.drop_path( + tf.expand_dims(tf.expand_dims(self.layer_scale_1, 0), 0) * attention_output, + training=training, + ) + layer_output = layer_output + self.drop_path( + tf.expand_dims(tf.expand_dims(self.layer_scale_2, 0), 0) + * self.mlp(hidden_states=self.layernorm2(inputs=layer_output, training=training), training=training), + training=training, + ) + else: + layer_output = hidden_states + self.drop_path(attention_output, training=training) + layer_output = layer_output + self.drop_path( + self.mlp(hidden_states=self.layernorm2(inputs=layer_output, training=training), training=training), + training=training, + ) + + outputs = (layer_output,) + outputs + + return outputs + + +class TFEfficientFormerMeta3DLayers(keras.layers.Layer): + def __init__(self, config: EfficientFormerConfig, **kwargs): + super().__init__(**kwargs) + drop_paths = [ + config.drop_path_rate * (block_idx + sum(config.depths[:-1])) + for block_idx in range(config.num_meta3d_blocks) + ] + self.blocks = [ + TFEfficientFormerMeta3D(config, config.hidden_sizes[-1], drop_path=drop_path, name=f"blocks.{i}") + for i, drop_path in enumerate(drop_paths) + ] + + def call( + self, hidden_states: tf.Tensor, output_attentions: bool = False, training: bool = False + ) -> tuple[tf.Tensor]: + all_attention_outputs = () if output_attentions else None + + for i, layer_module in enumerate(self.blocks): + if isinstance(hidden_states, tuple): + hidden_states = hidden_states[0] + + hidden_states = layer_module( + hidden_states=hidden_states, output_attentions=output_attentions, training=training + ) + if output_attentions: + all_attention_outputs = all_attention_outputs + 
(hidden_states[1],) + + if output_attentions: + outputs = (hidden_states[0],) + all_attention_outputs + return outputs + + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "blocks", None) is not None: + for layer in self.blocks: + with tf.name_scope(layer.name): + layer.build(None) + + +class TFEfficientFormerMeta4D(keras.layers.Layer): + def __init__(self, config: EfficientFormerConfig, dim: int, drop_path: float = 0.0, **kwargs): + super().__init__(**kwargs) + pool_size = config.pool_size if config.pool_size is not None else 3 + self.token_mixer = TFEfficientFormerPooling(pool_size=pool_size, name="token_mixer") + self.dim = dim + mlp_hidden_dim = int(dim * config.mlp_expansion_ratio) + self.mlp = TFEfficientFormerConvMlp( + config=config, in_features=dim, hidden_features=mlp_hidden_dim, drop=config.hidden_dropout_prob, name="mlp" + ) + + self.drop_path = ( + TFEfficientFormerDropPath(drop_path, name="drop_path") + if drop_path > 0.0 + else keras.layers.Activation("linear", name="drop_path") + ) + self.config = config + + def build(self, input_shape=None): + self.layer_scale_1 = None + self.layer_scale_2 = None + + if self.config.use_layer_scale: + self.layer_scale_1 = self.add_weight( + shape=(self.dim), + initializer=keras.initializers.Constant(value=self.config.layer_scale_init_value), + trainable=True, + name="layer_scale_1", + ) + self.layer_scale_2 = self.add_weight( + shape=(self.dim), + initializer=keras.initializers.Constant(value=self.config.layer_scale_init_value), + trainable=True, + name="layer_scale_2", + ) + + if self.built: + return + self.built = True + if getattr(self, "token_mixer", None) is not None: + with tf.name_scope(self.token_mixer.name): + self.token_mixer.build(None) + if getattr(self, "mlp", None) is not None: + with tf.name_scope(self.mlp.name): + self.mlp.build(None) + if getattr(self, "drop_path", None) is not None: + with tf.name_scope(self.drop_path.name): + self.drop_path.build(None) + + def call(self, hidden_states: tf.Tensor, training: bool = False) -> tuple[tf.Tensor]: + outputs = self.token_mixer(hidden_states) + + if self.config.use_layer_scale: + layer_output = hidden_states + self.drop_path( + tf.expand_dims(tf.expand_dims(self.layer_scale_1, 0), 0) * outputs, + training=training, + ) + + layer_output = layer_output + self.drop_path( + tf.expand_dims(tf.expand_dims(self.layer_scale_2, 0), 0) + * self.mlp(hidden_state=layer_output, training=training), + training=training, + ) + + else: + layer_output = hidden_states + self.drop_path(outputs, training=training) + layer_output = layer_output + self.drop_path( + self.mlp(hidden_state=layer_output, training=training), training=training + ) + + return layer_output + + +class TFEfficientFormerMeta4DLayers(keras.layers.Layer): + def __init__(self, config: EfficientFormerConfig, stage_idx: int, **kwargs): + super().__init__(**kwargs) + num_layers = ( + config.depths[stage_idx] if stage_idx != -1 else config.depths[stage_idx] - config.num_meta3d_blocks + ) + drop_paths = [ + config.drop_path_rate * (block_idx + sum(config.depths[:stage_idx])) for block_idx in range(num_layers) + ] + + self.blocks = [ + TFEfficientFormerMeta4D( + config=config, dim=config.hidden_sizes[stage_idx], drop_path=drop_paths[i], name=f"blocks.{i}" + ) + for i in range(len(drop_paths)) + ] + + def call(self, hidden_states: tf.Tensor, training: bool = False) -> tuple[tf.Tensor]: + for layer_module in self.blocks: + hidden_states = 
layer_module(hidden_states=hidden_states, training=training) + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "blocks", None) is not None: + for layer in self.blocks: + with tf.name_scope(layer.name): + layer.build(None) + + +class TFEfficientFormerIntermediateStage(keras.layers.Layer): + def __init__(self, config: EfficientFormerConfig, index: int, **kwargs): + super().__init__(**kwargs) + self.meta4D_layers = TFEfficientFormerMeta4DLayers(config=config, stage_idx=index, name="meta4D_layers") + + def call(self, hidden_states: tf.Tensor, training: bool = False) -> tuple[tf.Tensor]: + hidden_states = self.meta4D_layers(hidden_states=hidden_states, training=training) + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "meta4D_layers", None) is not None: + with tf.name_scope(self.meta4D_layers.name): + self.meta4D_layers.build(None) + + +class TFEfficientFormerLastStage(keras.layers.Layer): + def __init__(self, config: EfficientFormerConfig, **kwargs): + super().__init__(**kwargs) + self.meta4D_layers = TFEfficientFormerMeta4DLayers(config=config, stage_idx=-1, name="meta4D_layers") + self.flat = TFEfficientFormerFlat(name="flat") + self.meta3D_layers = TFEfficientFormerMeta3DLayers(config, name="meta3D_layers") + + def call( + self, hidden_states: tf.Tensor, output_attentions: bool = False, training: bool = False + ) -> tuple[tf.Tensor]: + hidden_states = self.meta4D_layers(hidden_states=hidden_states, training=training) + hidden_states = self.flat(hidden_states=hidden_states) + hidden_states = self.meta3D_layers( + hidden_states=hidden_states, output_attentions=output_attentions, training=training + ) + + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "meta4D_layers", None) is not None: + with tf.name_scope(self.meta4D_layers.name): + self.meta4D_layers.build(None) + if getattr(self, "flat", None) is not None: + with tf.name_scope(self.flat.name): + self.flat.build(None) + if getattr(self, "meta3D_layers", None) is not None: + with tf.name_scope(self.meta3D_layers.name): + self.meta3D_layers.build(None) + + +class TFEfficientFormerEncoder(keras.layers.Layer): + def __init__(self, config: EfficientFormerConfig, **kwargs): + super().__init__(**kwargs) + + self.config = config + num_intermediate_stages = len(config.depths) - 1 + downsamples = [ + config.downsamples[i] or config.hidden_sizes[i] != config.hidden_sizes[i + 1] + for i in range(num_intermediate_stages) + ] + + intermediate_stages = [] + layer_count = -1 + for i in range(num_intermediate_stages): + layer_count += 1 + intermediate_stages.append( + TFEfficientFormerIntermediateStage(config, i, name=f"intermediate_stages.{layer_count}") + ) + if downsamples[i]: + layer_count += 1 + intermediate_stages.append( + TFEfficientFormerPatchEmbeddings( + config, + config.hidden_sizes[i], + config.hidden_sizes[i + 1], + name=f"intermediate_stages.{layer_count}", + ) + ) + self.intermediate_stages = intermediate_stages + self.last_stage = TFEfficientFormerLastStage(config, name="last_stage") + + def call( + self, + hidden_states: tf.Tensor, + output_hidden_states: bool, + output_attentions: bool, + return_dict: bool, + training: bool = False, + ) -> TFBaseModelOutput: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + if output_hidden_states: + 
all_hidden_states = all_hidden_states + (hidden_states,) + + for layer_module in self.intermediate_stages: + hidden_states = layer_module(hidden_states, training=training) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_output = self.last_stage(hidden_states, output_attentions=output_attentions, training=training) + + if output_attentions: + all_self_attentions = all_self_attentions + layer_output[1:] + + if output_hidden_states: + all_hidden_states = all_hidden_states + (layer_output[0],) + + if not return_dict: + return tuple(v for v in [layer_output[0], all_hidden_states, all_self_attentions] if v is not None) + + return TFBaseModelOutput( + last_hidden_state=layer_output[0], + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "last_stage", None) is not None: + with tf.name_scope(self.last_stage.name): + self.last_stage.build(None) + for layer in self.intermediate_stages: + with tf.name_scope(layer.name): + layer.build(None) + + +@keras_serializable +class TFEfficientFormerMainLayer(keras.layers.Layer): + config_class = EfficientFormerConfig + + def __init__(self, config: EfficientFormerConfig, **kwargs) -> None: + super().__init__(**kwargs) + self.config = config + + self.patch_embed = TFEfficientFormerConvStem(config, config.hidden_sizes[0], name="patch_embed") + self.encoder = TFEfficientFormerEncoder(config, name="encoder") + self.layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm") + + @unpack_inputs + def call( + self, + pixel_values: Optional[tf.Tensor] = None, + output_attentions: Optional[tf.Tensor] = None, + output_hidden_states: Optional[tf.Tensor] = None, + return_dict: Optional[bool] = None, + training: bool = False, + ) -> Union[TFBaseModelOutput, tuple[tf.Tensor, ...]]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + # When running on CPU, keras.layers.Conv2D and keras.layers.AveragePool2D do not + # support channels first NCHW format. A number of blocks contain both. + # So change the input format from (batch_size, num_channels, height, width) to + # (batch_size, height, width, num_channels) here. + # shape = (batch_size, in_height, in_width, in_channels=num_channels) + pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1)) + embedding_output = self.patch_embed(pixel_values, training=training) + + encoder_outputs = self.encoder( + hidden_states=embedding_output, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + sequence_output = encoder_outputs[0] + sequence_output = self.layernorm(sequence_output, training=training) + + # Change the hidden states from (batch_size, height, width, num_channels) to + # (batch_size, num_channels, height, width). + # The hidden states are in (batch_size, height, width, num_channels) + # shape after all stages except the MB3D blocks. 
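# Illustrative sketch (annotation, not part of the patch): every intermediate hidden
# state at this point is channels-last (batch, height, width, channels), so the branch
# below rotates it back to the channels-first layout used by the PyTorch model, while
# the final, already-flattened (batch, seq_len, dim) state is left as is.  Shapes are
# toy assumptions.
import tensorflow as tf

nhwc = tf.zeros((1, 56, 56, 48))               # toy stage output: (batch, H, W, C)
nchw = tf.transpose(nhwc, perm=(0, 3, 1, 2))   # -> (1, 48, 56, 56), matching PyTorch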
+ if output_hidden_states: + hidden_states = tuple(tf.transpose(h, perm=(0, 3, 1, 2)) for h in encoder_outputs[1][:-1]) + ( + encoder_outputs[1][-1], + ) + + if not return_dict: + head_outputs = (sequence_output,) + return head_outputs + encoder_outputs[1:] + + return TFBaseModelOutput( + last_hidden_state=sequence_output, + hidden_states=hidden_states if output_hidden_states else encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "patch_embed", None) is not None: + with tf.name_scope(self.patch_embed.name): + self.patch_embed.build(None) + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "layernorm", None) is not None: + with tf.name_scope(self.layernorm.name): + self.layernorm.build([None, None, self.config.hidden_sizes[-1]]) + + +class TFEfficientFormerPreTrainedModel(TFPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = EfficientFormerConfig + base_model_prefix = "efficientformer" + main_input_name = "pixel_values" + + +EFFICIENTFORMER_START_DOCSTRING = r""" + This model is a TensorFlow + [keras.layers.Layer](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Layer). Use it as a regular + TensorFlow Module and refer to the TensorFlow documentation for all matter related to general usage and behavior. + + + Parameters: + config ([`EfficientFormerConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +EFFICIENTFORMER_INPUTS_DOCSTRING = r""" + Args: + pixel_values ((`tf.Tensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See + [`EfficientFormerImageProcessor.__call__`] for details. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
+""" + + +@add_start_docstrings( + "The bare EfficientFormer Model transformer outputting raw hidden-states without any specific head on top.", + EFFICIENTFORMER_START_DOCSTRING, +) +class TFEfficientFormerModel(TFEfficientFormerPreTrainedModel): + def __init__(self, config: EfficientFormerConfig, **kwargs) -> None: + super().__init__(config, **kwargs) + + self.efficientformer = TFEfficientFormerMainLayer(config, name="efficientformer") + + @unpack_inputs + @add_start_docstrings_to_model_forward(EFFICIENTFORMER_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFBaseModelOutputWithPooling, + config_class=_CONFIG_FOR_DOC, + modality="vision", + expected_output=_EXPECTED_OUTPUT_SHAPE, + ) + def call( + self, + pixel_values: Optional[tf.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: bool = False, + ) -> Union[tuple, TFBaseModelOutput]: + outputs = self.efficientformer( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "efficientformer", None) is not None: + with tf.name_scope(self.efficientformer.name): + self.efficientformer.build(None) + + +@add_start_docstrings( + """ + EfficientFormer Model transformer with an image classification head on top of pooled last hidden state, e.g. for + ImageNet. + """, + EFFICIENTFORMER_START_DOCSTRING, +) +class TFEfficientFormerForImageClassification(TFEfficientFormerPreTrainedModel, TFSequenceClassificationLoss): + def __init__(self, config: EfficientFormerConfig): + super().__init__(config) + + self.num_labels = config.num_labels + self.efficientformer = TFEfficientFormerMainLayer(config, name="efficientformer") + + # Classifier head + self.classifier = ( + keras.layers.Dense(config.num_labels, name="classifier") + if config.num_labels > 0 + else keras.layers.Activation("linear", name="classifier") + ) + self.config = config + + @unpack_inputs + @add_start_docstrings_to_model_forward(EFFICIENTFORMER_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_IMAGE_CLASS_CHECKPOINT, + output_type=TFImageClassifierOutput, + config_class=_CONFIG_FOR_DOC, + expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT, + ) + def call( + self, + pixel_values: Optional[tf.Tensor] = None, + labels: Optional[tf.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: bool = False, + ) -> Union[tf.Tensor, TFImageClassifierOutput]: + r""" + labels (`tf.Tensor` of shape `(batch_size,)`, *optional*): + Labels for computing the image classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.efficientformer( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + sequence_output = outputs[0] + + logits = self.classifier(tf.reduce_mean(sequence_output, axis=-2)) + + loss = None if labels is None else self.hf_compute_loss(labels, logits) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return TFImageClassifierOutput( + loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "efficientformer", None) is not None: + with tf.name_scope(self.efficientformer.name): + self.efficientformer.build(None) + if getattr(self, "classifier", None) is not None: + if hasattr(self.classifier, "name"): + with tf.name_scope(self.classifier.name): + self.classifier.build([None, None, self.config.hidden_sizes[-1]]) + + +@dataclass +class TFEfficientFormerForImageClassificationWithTeacherOutput(ModelOutput): + """ + Args: + Output type of [`EfficientFormerForImageClassificationWithTeacher`]. + logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`): + Prediction scores as the average of the cls_logits and distillation logits. + cls_logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`): + Prediction scores of the classification head (i.e. the linear layer on top of the final hidden state of the + class token). + distillation_logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`): + Prediction scores of the distillation head (i.e. the linear layer on top of the final hidden state of the + distillation token). + hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when + `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus + the initial embedding outputs. + attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when + `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in + the self-attention heads. + """ + + logits: Optional[tf.Tensor] = None + cls_logits: Optional[tf.Tensor] = None + distillation_logits: Optional[tf.Tensor] = None + hidden_states: Optional[tuple[tf.Tensor]] = None + attentions: Optional[tuple[tf.Tensor]] = None + + +@add_start_docstrings( + """ + EfficientFormer Model transformer with image classification heads on top (a linear layer on top of the final hidden + state and a linear layer on top of the final hidden state of the distillation token) e.g. for ImageNet. + + .. warning:: + This model supports inference-only. Fine-tuning with distillation (i.e. with a teacher) is not yet + supported. 
+ """, + EFFICIENTFORMER_START_DOCSTRING, +) +class TFEfficientFormerForImageClassificationWithTeacher(TFEfficientFormerPreTrainedModel): + def __init__(self, config: EfficientFormerConfig) -> None: + super().__init__(config) + + self.num_labels = config.num_labels + self.efficientformer = TFEfficientFormerMainLayer(config, name="efficientformer") + + # Classifier heads + self.classifier = ( + keras.layers.Dense(config.num_labels, name="classifier") + if config.num_labels > 0 + else keras.layers.Activation("linear", name="classifier") + ) + self.distillation_classifier = ( + keras.layers.Dense(config.num_labels, name="distillation_classifier") + if config.num_labels > 0 + else keras.layers.Activation("linear", name="distillation_classifier") + ) + + @unpack_inputs + @add_start_docstrings_to_model_forward(EFFICIENTFORMER_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_IMAGE_CLASS_CHECKPOINT, + output_type=TFEfficientFormerForImageClassificationWithTeacherOutput, + config_class=_CONFIG_FOR_DOC, + expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT, + ) + def call( + self, + pixel_values: Optional[tf.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: bool = False, + ) -> Union[tuple, TFEfficientFormerForImageClassificationWithTeacherOutput]: + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if training: + raise Exception( + "This model supports inference-only. Fine-tuning with distillation (i.e. with a teacher) is not yet supported." + ) + + outputs = self.efficientformer( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + sequence_output = outputs[0] + + cls_logits = self.classifier(tf.reduce_mean(sequence_output, axis=-2)) + distillation_logits = self.distillation_classifier(tf.reduce_mean(sequence_output, axis=-2)) + logits = (cls_logits + distillation_logits) / 2 + + if not return_dict: + output = (logits, cls_logits, distillation_logits) + outputs[1:] + return output + + return TFEfficientFormerForImageClassificationWithTeacherOutput( + logits=logits, + cls_logits=cls_logits, + distillation_logits=distillation_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "efficientformer", None) is not None: + with tf.name_scope(self.efficientformer.name): + self.efficientformer.build(None) + if getattr(self, "classifier", None) is not None: + if hasattr(self.classifier, "name"): + with tf.name_scope(self.classifier.name): + self.classifier.build([None, None, self.config.hidden_sizes[-1]]) + if getattr(self, "distillation_classifier", None) is not None: + if hasattr(self.distillation_classifier, "name"): + with tf.name_scope(self.distillation_classifier.name): + self.distillation_classifier.build([None, None, self.config.hidden_sizes[-1]]) + + +__all__ = [ + "TFEfficientFormerForImageClassification", + "TFEfficientFormerForImageClassificationWithTeacher", + "TFEfficientFormerModel", + "TFEfficientFormerPreTrainedModel", +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deprecated/ernie_m/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deprecated/ernie_m/__init__.py new file mode 100644 index 
0000000000000000000000000000000000000000..2beb8f463ff10af33ea980599ea4d5fc05888acb --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deprecated/ernie_m/__init__.py @@ -0,0 +1,28 @@ +# Copyright 2023 The HuggingFace and Baidu Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ....utils import _LazyModule +from ....utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_ernie_m import * + from .modeling_ernie_m import * + from .tokenization_ernie_m import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deprecated/ernie_m/modeling_ernie_m.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deprecated/ernie_m/modeling_ernie_m.py new file mode 100644 index 0000000000000000000000000000000000000000..f0e97c132d0989079aa647cac4e64bbae451a116 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deprecated/ernie_m/modeling_ernie_m.py @@ -0,0 +1,1061 @@ +# coding=utf-8 +# Copyright 2023 Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
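# Illustrative sketch (annotation, not part of the patch): a toy version of the
# position-id scheme used by the ErnieMEmbeddings module defined below.  Positions are
# a running index over the sequence, shifted by any cached past length and then by a
# constant 2 to stay numerically aligned with the paddlenlp implementation.  Sizes are
# toy assumptions.
import torch

past_key_values_length = 0                     # toy value
ones = torch.ones((1, 5), dtype=torch.int64)   # (batch, seq_len)
position_ids = torch.cumsum(ones, dim=1) - ones + past_key_values_length + 2
# tensor([[2, 3, 4, 5, 6]])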
+"""PyTorch ErnieM model.""" + +import math +from typing import Optional, Union + +import torch +from torch import nn, tensor +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ....activations import ACT2FN +from ....cache_utils import Cache +from ....modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + MultipleChoiceModelOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from ....modeling_utils import PreTrainedModel +from ....pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer +from ....utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging +from ....utils.deprecation import deprecate_kwarg +from .configuration_ernie_m import ErnieMConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "susnato/ernie-m-base_pytorch" +_CONFIG_FOR_DOC = "ErnieMConfig" +_TOKENIZER_FOR_DOC = "ErnieMTokenizer" + + +# Adapted from paddlenlp.transformers.ernie_m.modeling.ErnieEmbeddings +class ErnieMEmbeddings(nn.Module): + """Construct the embeddings from word and position embeddings.""" + + def __init__(self, config): + super().__init__() + self.hidden_size = config.hidden_size + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding( + config.max_position_embeddings, config.hidden_size, padding_idx=config.pad_token_id + ) + self.layer_norm = nn.LayerNorm(normalized_shape=config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(p=config.hidden_dropout_prob) + self.padding_idx = config.pad_token_id + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.LongTensor] = None, + past_key_values_length: int = 0, + ) -> torch.Tensor: + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + if position_ids is None: + input_shape = inputs_embeds.size()[:-1] + ones = torch.ones(input_shape, dtype=torch.int64, device=inputs_embeds.device) + seq_length = torch.cumsum(ones, dim=1) + position_ids = seq_length - ones + + if past_key_values_length > 0: + position_ids = position_ids + past_key_values_length + # to mimic paddlenlp implementation + position_ids += 2 + position_embeddings = self.position_embeddings(position_ids) + embeddings = inputs_embeds + position_embeddings + embeddings = self.layer_norm(embeddings) + embeddings = self.dropout(embeddings) + + return embeddings + + +class ErnieMSelfAttention(nn.Module): + def __init__(self, config, position_embedding_type=None): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.q_proj = nn.Linear(config.hidden_size, self.all_head_size) + self.k_proj = nn.Linear(config.hidden_size, self.all_head_size) + self.v_proj = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = 
position_embedding_type or getattr( + config, "position_embedding_type", "absolute" + ) + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + + self.is_decoder = config.is_decoder + + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + ) -> tuple[torch.Tensor]: + mixed_query_layer = self.q_proj(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_values is not None: + # reuse k,v, cross_attentions + key_layer = past_key_values[0] + value_layer = past_key_values[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores(self.k_proj(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.v_proj(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_values is not None: + key_layer = self.transpose_for_scores(self.k_proj(hidden_states)) + value_layer = self.transpose_for_scores(self.v_proj(hidden_states)) + key_layer = torch.cat([past_key_values[0], key_layer], dim=2) + value_layer = torch.cat([past_key_values[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.k_proj(hidden_states)) + value_layer = self.transpose_for_scores(self.v_proj(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + use_cache = past_key_values is not None + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_values` is always `None` + past_key_values = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. 
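# Illustrative sketch (annotation, not part of the patch): the core of the step below
# is plain scaled dot-product attention: raw scores are Q @ K^T, scaled by
# 1/sqrt(head_dim), optionally shifted by the additive attention mask, and normalised
# with a softmax over key positions.  Sizes are toy assumptions.
import math
import torch

batch, heads, seq_len, head_dim = 1, 2, 4, 8   # toy sizes
q = torch.randn(batch, heads, seq_len, head_dim)
k = torch.randn(batch, heads, seq_len, head_dim)
scores = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(head_dim)
probs = torch.softmax(scores, dim=-1)          # rows sum to 1 over key positions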
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + query_length, key_length = query_layer.shape[2], key_layer.shape[2] + if use_cache: + position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view( + -1, 1 + ) + else: + position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in ErnieMModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + if self.is_decoder: + outputs = outputs + (past_key_values,) + return outputs + + +class ErnieMAttention(nn.Module): + def __init__(self, config, position_embedding_type=None): + super().__init__() + self.self_attn = ErnieMSelfAttention(config, position_embedding_type=position_embedding_type) + self.out_proj = nn.Linear(config.hidden_size, config.hidden_size) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self_attn.num_attention_heads, self.self_attn.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self_attn.q_proj = prune_linear_layer(self.self_attn.q_proj, index) + self.self_attn.k_proj = prune_linear_layer(self.self_attn.k_proj, index) + self.self_attn.v_proj = prune_linear_layer(self.self_attn.v_proj, index) + self.out_proj = prune_linear_layer(self.out_proj, index, dim=1) + + # Update hyper params and store pruned heads + self.self_attn.num_attention_heads = self.self_attn.num_attention_heads - len(heads) + self.self_attn.all_head_size = self.self_attn.attention_head_size * self.self_attn.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + ) -> tuple[torch.Tensor]: + self_outputs = self.self_attn( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_values, + output_attentions, + ) + attention_output = self.out_proj(self_outputs[0]) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class ErnieMEncoderLayer(nn.Module): + def __init__(self, config): + super().__init__() + # to mimic paddlenlp implementation + dropout = 0.1 if config.hidden_dropout_prob is None else config.hidden_dropout_prob + act_dropout = config.hidden_dropout_prob if config.act_dropout is None else config.act_dropout + + self.self_attn = ErnieMAttention(config) + self.linear1 = nn.Linear(config.hidden_size, config.intermediate_size) + self.dropout = nn.Dropout(act_dropout) + self.linear2 = nn.Linear(config.intermediate_size, config.hidden_size) + self.norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + if isinstance(config.hidden_act, str): + self.activation = ACT2FN[config.hidden_act] + else: + self.activation = config.hidden_act + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( 
+ self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = True, + ): + residual = hidden_states + if output_attentions: + hidden_states, attention_opt_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + head_mask=head_mask, + past_key_values=past_key_values, + output_attentions=output_attentions, + ) + + else: + hidden_states = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + head_mask=head_mask, + past_key_values=past_key_values, + output_attentions=output_attentions, + ) + hidden_states = residual + self.dropout1(hidden_states) + hidden_states = self.norm1(hidden_states) + residual = hidden_states + + hidden_states = self.linear1(hidden_states) + hidden_states = self.activation(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.linear2(hidden_states) + hidden_states = residual + self.dropout2(hidden_states) + hidden_states = self.norm2(hidden_states) + + if output_attentions: + return hidden_states, attention_opt_weights + else: + return hidden_states + + +class ErnieMEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layers = nn.ModuleList([ErnieMEncoderLayer(config) for _ in range(config.num_hidden_layers)]) + + def forward( + self, + input_embeds: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + return_dict: Optional[bool] = True, + ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + hidden_states = () if output_hidden_states else None + attentions = () if output_attentions else None + + output = input_embeds + if output_hidden_states: + hidden_states = hidden_states + (output,) + for i, layer in enumerate(self.layers): + layer_head_mask = head_mask[i] if head_mask is not None else None + + output, opt_attn_weights = layer( + hidden_states=output, + attention_mask=attention_mask, + head_mask=layer_head_mask, + past_key_values=past_key_values[i] if past_key_values is not None else None, + ) + + if output_hidden_states: + hidden_states = hidden_states + (output,) + if output_attentions: + attentions = attentions + (opt_attn_weights,) + + last_hidden_state = output + if not return_dict: + return tuple(v for v in [last_hidden_state, hidden_states, attentions] if v is not None) + + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=last_hidden_state, hidden_states=hidden_states, attentions=attentions + ) + + +class ErnieMPooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class ErnieMPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config: ErnieMConfig + base_model_prefix = "ernie_m" + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +ERNIE_M_START_DOCSTRING = r""" + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use + it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`ErnieMConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +ERNIE_M_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`ErnieMTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + position_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert *input_ids* indices into associated vectors than the + model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. 
+ return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare ErnieM Model transformer outputting raw hidden-states without any specific head on top.", + ERNIE_M_START_DOCSTRING, +) +class ErnieMModel(ErnieMPreTrainedModel): + def __init__(self, config, add_pooling_layer=True): + super().__init__(config) + self.initializer_range = config.initializer_range + self.embeddings = ErnieMEmbeddings(config) + self.encoder = ErnieMEncoder(config) + self.pooler = ErnieMPooler(config) if add_pooling_layer else None + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layers[layer].self_attn.prune_heads(heads) + + @add_start_docstrings_to_model_forward(ERNIE_M_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPastAndCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[tensor] = None, + position_ids: Optional[tensor] = None, + attention_mask: Optional[tensor] = None, + head_mask: Optional[tensor] = None, + inputs_embeds: Optional[tensor] = None, + past_key_values: Optional[tuple[tuple[tensor]]] = None, + use_cache: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.FloatTensor], BaseModelOutputWithPoolingAndCrossAttentions]: + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time.") + + # init the default bool value + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + past_key_values_length = 0 + if past_key_values is not None: + past_key_values_length = past_key_values.get_seq_length() + + # Adapted from paddlenlp.transformers.ernie_m.ErnieMModel + if attention_mask is None: + attention_mask = (input_ids == self.config.pad_token_id).to(torch.float32) + attention_mask *= torch.finfo(attention_mask.dtype).min + if past_key_values is not None: + batch_size = past_key_values[0][0].shape[0] + past_mask = torch.zeros([batch_size, 1, 1, past_key_values_length], dtype=attention_mask.dtype) + attention_mask = torch.concat([past_mask, attention_mask], dim=-1) + # For 2D attention_mask from tokenizer + elif attention_mask.ndim == 2: + attention_mask = attention_mask.to(torch.float32) + attention_mask = 1.0 - attention_mask + attention_mask *= torch.finfo(attention_mask.dtype).min + + extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(1) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + 
past_key_values_length=past_key_values_length, + ) + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + past_key_values=past_key_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + sequence_output = encoder_outputs[0] + pooler_output = self.pooler(sequence_output) if self.pooler is not None else None + return (sequence_output, pooler_output) + encoder_outputs[1:] + + sequence_output = encoder_outputs["last_hidden_state"] + pooler_output = self.pooler(sequence_output) if self.pooler is not None else None + hidden_states = None if not output_hidden_states else encoder_outputs["hidden_states"] + attentions = None if not output_attentions else encoder_outputs["attentions"] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooler_output, + hidden_states=hidden_states, + attentions=attentions, + ) + + +@add_start_docstrings( + """ErnieM Model transformer with a sequence classification/regression head on top (a linear layer on top of + the pooled output) e.g. for GLUE tasks.""", + ERNIE_M_START_DOCSTRING, +) +class ErnieMForSequenceClassification(ErnieMPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + + self.ernie_m = ErnieMModel(config) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(ERNIE_M_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=SequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + use_cache: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = True, + labels: Optional[torch.Tensor] = None, + ) -> Union[tuple[torch.FloatTensor], SequenceClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.ernie_m( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + past_key_values=past_key_values, + output_hidden_states=output_hidden_states, + output_attentions=output_attentions, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ErnieM Model with a multiple choice classification head on top (a linear layer on top of + the pooled output and a softmax) e.g. for RocStories/SWAG tasks.""", + ERNIE_M_START_DOCSTRING, +) +class ErnieMForMultipleChoice(ErnieMPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.ernie_m = ErnieMModel(config) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, 1) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(ERNIE_M_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MultipleChoiceModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = True, + ) -> Union[tuple[torch.FloatTensor], MultipleChoiceModelOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., + num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. 
(See + `input_ids` above) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] + + input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None + attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None + inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) + if inputs_embeds is not None + else None + ) + + outputs = self.ernie_m( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.view(-1, num_choices) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + + if not return_dict: + output = (reshaped_logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ErnieM Model with a token classification head on top (a linear layer on top of + the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks.""", + ERNIE_M_START_DOCSTRING, +) +class ErnieMForTokenClassification(ErnieMPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.ernie_m = ErnieMModel(config, add_pooling_layer=False) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(ERNIE_M_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = True, + labels: Optional[torch.Tensor] = None, + ) -> Union[tuple[torch.FloatTensor], TokenClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.ernie_m( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + past_key_values=past_key_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ErnieM Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layers on top of the hidden-states output to compute `span start logits` and `span end logits`).""", + ERNIE_M_START_DOCSTRING, +) +class ErnieMForQuestionAnswering(ErnieMPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.ernie_m = ErnieMModel(config, add_pooling_layer=False) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(ERNIE_M_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=QuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + start_positions: Optional[torch.Tensor] = None, + end_positions: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = True, + ) -> Union[tuple[torch.FloatTensor], QuestionAnsweringModelOutput]: + r""" + start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.ernie_m( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ErnieMForInformationExtraction is a Ernie-M Model with two linear layer on top of the hidden-states output to + compute `start_prob` and `end_prob`, designed for Universal Information Extraction.""", + ERNIE_M_START_DOCSTRING, +) +class ErnieMForInformationExtraction(ErnieMPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.ernie_m = ErnieMModel(config) + self.linear_start = nn.Linear(config.hidden_size, 1) + self.linear_end = nn.Linear(config.hidden_size, 1) + self.sigmoid = nn.Sigmoid() + self.post_init() + + @add_start_docstrings_to_model_forward(ERNIE_M_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + start_positions: Optional[torch.Tensor] = None, + end_positions: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = True, + ) -> Union[tuple[torch.FloatTensor], QuestionAnsweringModelOutput]: + r""" + start_positions (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for position (index) for computing the start_positions loss. Position outside of the sequence are + not taken into account for computing the loss. + end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) for computing the end_positions loss. Position outside of the sequence are not + taken into account for computing the loss. 
+ """ + + result = self.ernie_m( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + if return_dict: + sequence_output = result.last_hidden_state + elif not return_dict: + sequence_output = result[0] + + start_logits = self.linear_start(sequence_output) + start_logits = start_logits.squeeze(-1) + end_logits = self.linear_end(sequence_output) + end_logits = end_logits.squeeze(-1) + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = BCEWithLogitsLoss() + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + return tuple( + i + for i in [total_loss, start_logits, end_logits, result.hidden_states, result.attentions] + if i is not None + ) + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=result.hidden_states, + attentions=result.attentions, + ) + + +__all__ = [ + "ErnieMForMultipleChoice", + "ErnieMForQuestionAnswering", + "ErnieMForSequenceClassification", + "ErnieMForTokenClassification", + "ErnieMModel", + "ErnieMPreTrainedModel", + "ErnieMForInformationExtraction", +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deprecated/ernie_m/tokenization_ernie_m.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deprecated/ernie_m/tokenization_ernie_m.py new file mode 100644 index 0000000000000000000000000000000000000000..4c33abd043e6d170485e6d01298788d461275aa6 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deprecated/ernie_m/tokenization_ernie_m.py @@ -0,0 +1,409 @@ +# coding=utf-8 +# Copyright 2023 Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Tokenization classes for Ernie-M.""" + +import os +import unicodedata +from typing import Any, Optional + +import sentencepiece as spm + +from ....tokenization_utils import PreTrainedTokenizer +from ....utils import logging +from ....utils.import_utils import requires + + +logger = logging.get_logger(__name__) + +SPIECE_UNDERLINE = "▁" + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "sentencepiece_model_ckpt": "sentencepiece.bpe.model"} + +RESOURCE_FILES_NAMES = { + "sentencepiece_model_file": "sentencepiece.bpe.model", + "vocab_file": "vocab.txt", +} + + +# Adapted from paddlenlp.transformers.ernie_m.tokenizer.ErnieMTokenizer +@requires(backends=("sentencepiece",)) +class ErnieMTokenizer(PreTrainedTokenizer): + r""" + Constructs a Ernie-M tokenizer. It uses the `sentencepiece` tools to cut the words to sub-words. + + Args: + sentencepiece_model_file (`str`): + The file path of sentencepiece model. + vocab_file (`str`, *optional*): + The file path of the vocabulary. + do_lower_case (`str`, *optional*, defaults to `True`): + Whether or not to lowercase the input when tokenizing. + unk_token (`str`, *optional*, defaults to `"[UNK]"`): + A special token representing the `unknown (out-of-vocabulary)` token. An unknown token is set to be + `unk_token` inorder to be converted to an ID. + sep_token (`str`, *optional*, defaults to `"[SEP]"`): + A special token separating two different sentences in the same input. + pad_token (`str`, *optional*, defaults to `"[PAD]"`): + A special token used to make arrays of tokens the same size for batching purposes. + cls_token (`str`, *optional*, defaults to `"[CLS]"`): + A special token used for sequence classification. It is the last token of the sequence when built with + special tokens. + mask_token (`str`, *optional*, defaults to `"[MASK]"`): + A special token representing a masked token. This is the token used in the masked language modeling task + which the model tries to predict the original unmasked ones. + """ + + # Ernie-M model doesn't have token_type embedding. + model_input_names: list[str] = ["input_ids"] + + vocab_files_names = VOCAB_FILES_NAMES + resource_files_names = RESOURCE_FILES_NAMES + + def __init__( + self, + sentencepiece_model_ckpt, + vocab_file=None, + do_lower_case=False, + encoding="utf8", + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + sp_model_kwargs: Optional[dict[str, Any]] = None, + **kwargs, + ) -> None: + # Mask token behave like a normal word, i.e. include the space before it and + # is included in the raw text, there should be a match in a non-normalized sentence. 
+ + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + + self.do_lower_case = do_lower_case + self.sentencepiece_model_ckpt = sentencepiece_model_ckpt + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(sentencepiece_model_ckpt) + + # to mimic paddlenlp.transformers.ernie_m.tokenizer.ErnieMTokenizer functioning + if vocab_file is not None: + self.vocab = self.load_vocab(filepath=vocab_file) + else: + self.vocab = {self.sp_model.id_to_piece(id): id for id in range(self.sp_model.get_piece_size())} + self.reverse_vocab = {v: k for k, v in self.vocab.items()} + + super().__init__( + do_lower_case=do_lower_case, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + vocab_file=vocab_file, + encoding=encoding, + sp_model_kwargs=self.sp_model_kwargs, + **kwargs, + ) + + def get_offset_mapping(self, text): + if text is None: + return None + + split_tokens = self.tokenize(text) + normalized_text, char_mapping = "", [] + + for i, ch in enumerate(text): + if ch in self.SP_CHAR_MAPPING: + ch = self.SP_CHAR_MAPPING.get(ch) + else: + ch = unicodedata.normalize("NFKC", ch) + if self.is_whitespace(ch): + continue + normalized_text += ch + char_mapping.extend([i] * len(ch)) + + text, token_mapping, offset = normalized_text, [], 0 + + if self.do_lower_case: + text = text.lower() + + for token in split_tokens: + if token[:1] == "▁": + token = token[1:] + start = text[offset:].index(token) + offset + end = start + len(token) + + token_mapping.append((char_mapping[start], char_mapping[end - 1] + 1)) + offset = end + return token_mapping + + @property + def vocab_size(self): + return len(self.vocab) + + def get_vocab(self): + return dict(self.vocab, **self.added_tokens_encoder) + + def __getstate__(self): + state = self.__dict__.copy() + state["sp_model"] = None + return state + + def __setstate__(self, d): + self.__dict__ = d + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(self.sentencepiece_model_ckpt) + + def clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + return "".join(self.SP_CHAR_MAPPING.get(c, c) for c in text) + + def _tokenize(self, text, enable_sampling=False, nbest_size=64, alpha=0.1): + """Tokenize a string.""" + + if self.sp_model_kwargs.get("enable_sampling") is True: + enable_sampling = True + if self.sp_model_kwargs.get("alpha") is not None: + alpha = self.sp_model_kwargs.get("alpha") + if self.sp_model_kwargs.get("nbest_size") is not None: + nbest_size = self.sp_model_kwargs.get("nbest_size") + + if not enable_sampling: + pieces = self.sp_model.EncodeAsPieces(text) + else: + pieces = self.sp_model.SampleEncodeAsPieces(text, nbest_size, alpha) + new_pieces = [] + for pi, piece in enumerate(pieces): + if piece == SPIECE_UNDERLINE: + if not pieces[pi + 1].startswith(SPIECE_UNDERLINE) and pi != 0: + new_pieces.append(SPIECE_UNDERLINE) + continue + else: + continue + lst_i = 0 + for i, chunk in enumerate(piece): + if chunk == SPIECE_UNDERLINE: + continue + if self.is_ch_char(chunk) or self.is_punct(chunk): + if i > lst_i and piece[lst_i:i] != SPIECE_UNDERLINE: + new_pieces.append(piece[lst_i:i]) + new_pieces.append(chunk) + lst_i = i + 1 + elif chunk.isdigit() and i > 0 and not piece[i - 1].isdigit(): + if i > lst_i and piece[lst_i:i] != SPIECE_UNDERLINE: + 
new_pieces.append(piece[lst_i:i]) + lst_i = i + elif not chunk.isdigit() and i > 0 and piece[i - 1].isdigit(): + if i > lst_i and piece[lst_i:i] != SPIECE_UNDERLINE: + new_pieces.append(piece[lst_i:i]) + lst_i = i + if len(piece) > lst_i: + new_pieces.append(piece[lst_i:]) + return new_pieces + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (strings for sub-words) in a single string.""" + out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() + return out_string + + def convert_ids_to_string(self, ids): + """ + Converts a sequence of tokens (strings for sub-words) in a single string. + """ + tokens = self.convert_ids_to_tokens(ids) + out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() + return out_string + + # to mimic paddlenlp.transformers.ernie_m.tokenizer.ErnieMTokenizer functioning + def _convert_token_to_id(self, token): + return self.vocab.get(token, self.vocab.get(self.unk_token)) + + # to mimic paddlenlp.transformers.ernie_m.tokenizer.ErnieMTokenizer functioning + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.reverse_vocab.get(index, self.unk_token) + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + r""" + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. An ErnieM sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] [SEP] B [SEP]` + + Args: + token_ids_0 (`list[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`list[int]`, *optional*): + Optional second list of IDs for sequence pairs. + Returns: + `list[int]`: List of input_id with the appropriate special tokens. + """ + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + _cls = [self.cls_token_id] + _sep = [self.sep_token_id] + return _cls + token_ids_0 + _sep + _sep + token_ids_1 + _sep + + def build_offset_mapping_with_special_tokens(self, offset_mapping_0, offset_mapping_1=None): + r""" + Build offset map from a pair of offset map by concatenating and adding offsets of special tokens. An Ernie-M + offset_mapping has the following format: + + - single sequence: `(0,0) X (0,0)` + - pair of sequences: `(0,0) A (0,0) (0,0) B (0,0)` + + Args: + offset_mapping_ids_0 (`list[tuple]`): + List of char offsets to which the special tokens will be added. + offset_mapping_ids_1 (`list[tuple]`, *optional*): + Optional second list of wordpiece offsets for offset mapping pairs. + Returns: + `list[tuple]`: List of wordpiece offsets with the appropriate offsets of special tokens. + """ + if offset_mapping_1 is None: + return [(0, 0)] + offset_mapping_0 + [(0, 0)] + + return [(0, 0)] + offset_mapping_0 + [(0, 0), (0, 0)] + offset_mapping_1 + [(0, 0)] + + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): + r""" + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `encode` method. + + Args: + token_ids_0 (`list[int]`): + List of ids of the first sequence. + token_ids_1 (`list[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`str`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. 
+ Returns: + `list[int]`: + The list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formatted with special tokens for the model." + ) + return [1 if x in [self.sep_token_id, self.cls_token_id] else 0 for x in token_ids_0] + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + + def create_token_type_ids_from_sequences( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Create the token type IDs corresponding to the sequences passed. [What are token type + IDs?](../glossary#token-type-ids) Should be overridden in a subclass if the model has a special way of + building: those. + + Args: + token_ids_0 (`list[int]`): + The first tokenized sequence. + token_ids_1 (`list[int]`, *optional*): + The second tokenized sequence. + Returns: + `list[int]`: The token type ids. + """ + # called when `add_special_tokens` is True, so align with `build_inputs_with_special_tokens` method + if token_ids_1 is None: + # [CLS] X [SEP] + return (len(token_ids_0) + 2) * [0] + + # [CLS] A [SEP] [SEP] B [SEP] + return [0] * (len(token_ids_0) + 1) + [1] * (len(token_ids_1) + 3) + + def is_ch_char(self, char): + """ + is_ch_char + """ + if "\u4e00" <= char <= "\u9fff": + return True + return False + + def is_alpha(self, char): + """ + is_alpha + """ + if ("a" <= char <= "z") or ("A" <= char <= "Z"): + return True + return False + + def is_punct(self, char): + """ + is_punct + """ + if char in ",;:.?!~,;:。?!《》【】": + return True + return False + + def is_whitespace(self, char): + """ + is whitespace + """ + if char == " " or char == "\t" or char == "\n" or char == "\r": + return True + if len(char) == 1: + cat = unicodedata.category(char) + if cat == "Zs": + return True + return False + + def load_vocab(self, filepath): + token_to_idx = {} + with open(filepath, "r", encoding="utf-8") as f: + for index, line in enumerate(f): + token = line.rstrip("\n") + token_to_idx[token] = int(index) + + return token_to_idx + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + index = 0 + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + else: + vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory + with open(vocab_file, "w", encoding="utf-8") as writer: + for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning( + f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive." + " Please check that the vocabulary is not corrupted!" 
+ ) + index = token_index + writer.write(token + "\n") + index += 1 + + tokenizer_model_file = os.path.join(save_directory, "sentencepiece.bpe.model") + with open(tokenizer_model_file, "wb") as fi: + content_spiece_model = self.sp_model.serialized_model_proto() + fi.write(content_spiece_model) + + return (vocab_file,) + + +__all__ = ["ErnieMTokenizer"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deprecated/mmbt/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deprecated/mmbt/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..03b556e2eddf6d5f81f6f4a15346596dd5a32f85 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deprecated/mmbt/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ....utils import _LazyModule +from ....utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_mmbt import * + from .modeling_mmbt import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deprecated/mmbt/configuration_mmbt.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deprecated/mmbt/configuration_mmbt.py new file mode 100644 index 0000000000000000000000000000000000000000..1c58e4e6cdd0d0a8c87b9b94ca896c87970b5654 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deprecated/mmbt/configuration_mmbt.py @@ -0,0 +1,45 @@ +# coding=utf-8 +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""MMBT configuration""" + +from ....utils import logging + + +logger = logging.get_logger(__name__) + + +class MMBTConfig: + """ + This is the configuration class to store the configuration of a [`MMBTModel`]. It is used to instantiate a MMBT + model according to the specified arguments, defining the model architecture. + + Args: + config ([`PreTrainedConfig`]): + Config of the underlying Transformer models. Its values are copied over to use a single config. + num_labels (`int`, *optional*): + Size of final Linear layer for classification. 
+ modal_hidden_size (`int`, *optional*, defaults to 2048): + Embedding dimension of the non-text modality encoder. + """ + + def __init__(self, config, num_labels=None, modal_hidden_size=2048): + self.__dict__ = config.__dict__ + self.modal_hidden_size = modal_hidden_size + if num_labels: + self.num_labels = num_labels + + +__all__ = ["MMBTConfig"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deprecated/mmbt/modeling_mmbt.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deprecated/mmbt/modeling_mmbt.py new file mode 100644 index 0000000000000000000000000000000000000000..45ae577f7fced2d276f4f54c5bf859e27e08ebae --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deprecated/mmbt/modeling_mmbt.py @@ -0,0 +1,410 @@ +# coding=utf-8 +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch MMBT model.""" + +import torch +from torch import nn +from torch.nn import CrossEntropyLoss, MSELoss + +from ....modeling_outputs import BaseModelOutputWithPooling, SequenceClassifierOutput +from ....modeling_utils import ModuleUtilsMixin +from ....utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "MMBTConfig" + + +class ModalEmbeddings(nn.Module): + """Generic Modal Embeddings which takes in an encoder, and a transformer embedding.""" + + def __init__(self, config, encoder, embeddings): + super().__init__() + self.config = config + self.encoder = encoder + self.proj_embeddings = nn.Linear(config.modal_hidden_size, config.hidden_size) + self.position_embeddings = embeddings.position_embeddings + self.token_type_embeddings = embeddings.token_type_embeddings + self.word_embeddings = embeddings.word_embeddings + self.LayerNorm = embeddings.LayerNorm + self.dropout = nn.Dropout(p=config.hidden_dropout_prob) + + def forward(self, input_modal, start_token=None, end_token=None, position_ids=None, token_type_ids=None): + token_embeddings = self.proj_embeddings(self.encoder(input_modal)) + seq_length = token_embeddings.size(1) + + if start_token is not None: + start_token_embeds = self.word_embeddings(start_token) + seq_length += 1 + token_embeddings = torch.cat([start_token_embeds.unsqueeze(1), token_embeddings], dim=1) + + if end_token is not None: + end_token_embeds = self.word_embeddings(end_token) + seq_length += 1 + token_embeddings = torch.cat([token_embeddings, end_token_embeds.unsqueeze(1)], dim=1) + + if position_ids is None: + position_ids = torch.arange(seq_length, dtype=torch.long, device=input_modal.device) + position_ids = position_ids.unsqueeze(0).expand(input_modal.size(0), seq_length) + + if token_type_ids is None: + token_type_ids = torch.zeros( + (input_modal.size(0), seq_length), dtype=torch.long, device=input_modal.device + ) + + position_embeddings = self.position_embeddings(position_ids) + 
token_type_embeddings = self.token_type_embeddings(token_type_ids) + embeddings = token_embeddings + position_embeddings + token_type_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +MMBT_START_DOCSTRING = r""" + MMBT model was proposed in [Supervised Multimodal Bitransformers for Classifying Images and + Text](https://github.com/facebookresearch/mmbt) by Douwe Kiela, Suvrat Bhooshan, Hamed Firooz, Davide Testuggine. + It's a supervised multimodal bitransformer model that fuses information from text and other image encoders, and + obtain state-of-the-art performance on various multimodal classification benchmark tasks. + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`MMBTConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. + transformer (`nn.Module`): A text transformer that is used by MMBT. + It should have embeddings, encoder, and pooler attributes. + encoder (`nn.Module`): Encoder for the second modality. + It should take in a batch of modal inputs and return k, n dimension embeddings. +""" + +MMBT_INPUTS_DOCSTRING = r""" + Args: + input_modal (`torch.FloatTensor` of shape `(batch_size, ***)`): + The other modality data. It will be the shape that the encoder for that type expects. e.g. With an Image + Encoder, the shape would be (batch_size, channels, height, width) + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. It does not expect [CLS] token to be added as it's + appended to the end of other modality embeddings. Indices can be obtained using [`AutoTokenizer`]. See + [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + modal_start_tokens (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Optional start token to be added to Other Modality Embedding. [CLS] Most commonly used for classification + tasks. + modal_end_tokens (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Optional end token to be added to Other Modality Embedding. [SEP] Most commonly used. + attention_mask (*optional*) `torch.FloatTensor` of shape `(batch_size, sequence_length)`: + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + token_type_ids (*optional*) `torch.LongTensor` of shape `(batch_size, sequence_length)`: + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. 
+ + [What are token type IDs?](../glossary#token-type-ids) + modal_token_type_ids (*optional*) `torch.LongTensor` of shape `(batch_size, modal_sequence_length)`: + Segment token indices to indicate different portions of the non-text modality. The embeddings from these + tokens will be summed with the respective token embeddings for the non-text modality. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + modal_position_ids (`torch.LongTensor` of shape `(batch_size, modal_sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings for the non-text modality. + Selected in the range `[0, config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, embedding_dim)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
+""" + + +@add_start_docstrings( + "The bare MMBT Model outputting raw hidden-states without any specific head on top.", + MMBT_START_DOCSTRING, +) +class MMBTModel(nn.Module, ModuleUtilsMixin): + def __init__(self, config, transformer, encoder): + super().__init__() + self.config = config + self.transformer = transformer + self.modal_encoder = ModalEmbeddings(config, encoder, transformer.embeddings) + + @add_start_docstrings_to_model_forward(MMBT_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_modal, + input_ids=None, + modal_start_tokens=None, + modal_end_tokens=None, + attention_mask=None, + token_type_ids=None, + modal_token_type_ids=None, + position_ids=None, + modal_position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Returns: + + Examples: + + ```python + # For example purposes. Not runnable. + transformer = BertModel.from_pretrained("google-bert/bert-base-uncased") + encoder = ImageEncoder(args) + mmbt = MMBTModel(config, transformer, encoder) + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_txt_shape = input_ids.size() + elif inputs_embeds is not None: + input_txt_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + device = input_ids.device if input_ids is not None else inputs_embeds.device + + modal_embeddings = self.modal_encoder( + input_modal, + start_token=modal_start_tokens, + end_token=modal_end_tokens, + position_ids=modal_position_ids, + token_type_ids=modal_token_type_ids, + ) + + input_modal_shape = modal_embeddings.size()[:-1] + + if token_type_ids is None: + token_type_ids = torch.ones(input_txt_shape, dtype=torch.long, device=device) + + txt_embeddings = self.transformer.embeddings( + input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds + ) + + embedding_output = torch.cat([modal_embeddings, txt_embeddings], 1) + + input_shape = embedding_output.size()[:-1] + + if attention_mask is None: + attention_mask = torch.ones(input_shape, device=device) + else: + attention_mask = torch.cat( + [torch.ones(input_modal_shape, device=device, dtype=torch.long), attention_mask], dim=1 + ) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones(input_shape, device=device) + else: + encoder_attention_mask = torch.cat( + [torch.ones(input_modal_shape, device=device), encoder_attention_mask], dim=1 + ) + + extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + encoder_outputs = self.transformer.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + 
encoder_attention_mask=encoder_extended_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = encoder_outputs[0] + pooled_output = self.transformer.pooler(sequence_output) + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + +@add_start_docstrings( + """ + MMBT Model with a sequence classification/regression head on top (a linear layer on top of the pooled output) + """, + MMBT_START_DOCSTRING, + MMBT_INPUTS_DOCSTRING, +) +class MMBTForClassification(nn.Module): + r""" + **labels**: (*optional*) `torch.LongTensor` of shape `(batch_size,)`: + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + + Returns: *Tuple* comprising various elements depending on the configuration (config) and inputs: **loss**: + (*optional*, returned when `labels` is provided) `torch.FloatTensor` of shape `(1,)`: Classification (or + regression if config.num_labels==1) loss. **logits**: + `torch.FloatTensor` of shape `(batch_size, config.num_labels)` Classification (or regression if + config.num_labels==1) scores (before SoftMax). + **hidden_states**: (*optional*, returned when `output_hidden_states=True`) list of `torch.FloatTensor` (one for + the output of each layer + the output of the embeddings) of shape `(batch_size, sequence_length, hidden_size)`: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. **attentions**: + (*optional*, returned when `output_attentions=True`) list of `torch.FloatTensor` (one for each layer) of shape + `(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used + to compute the weighted average in the self-attention heads. + + Examples: + + ```python + # For example purposes. Not runnable. 
+ transformer = BertModel.from_pretrained("google-bert/bert-base-uncased") + encoder = ImageEncoder(args) + model = MMBTForClassification(config, transformer, encoder) + outputs = model(input_modal, input_ids, labels=labels) + loss, logits = outputs[:2] + ```""" + + def __init__(self, config, transformer, encoder): + super().__init__() + self.num_labels = config.num_labels + + self.mmbt = MMBTModel(config, transformer, encoder) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + def forward( + self, + input_modal, + input_ids=None, + modal_start_tokens=None, + modal_end_tokens=None, + attention_mask=None, + token_type_ids=None, + modal_token_type_ids=None, + position_ids=None, + modal_position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + return_dict=None, + ): + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.mmbt( + input_modal=input_modal, + input_ids=input_ids, + modal_start_tokens=modal_start_tokens, + modal_end_tokens=modal_end_tokens, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + modal_token_type_ids=modal_token_type_ids, + position_ids=position_ids, + modal_position_ids=modal_position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + if self.num_labels == 1: + # We are doing regression + loss_fct = MSELoss() + loss = loss_fct(logits.view(-1), labels.view(-1)) + else: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +__all__ = ["MMBTForClassification", "MMBTModel", "ModalEmbeddings"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deprecated/open_llama/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deprecated/open_llama/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2b3964d194bed041987c6236b5c60bfcf3b7caf4 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deprecated/open_llama/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2023 EleutherAI and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
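# ---------------------------------------------------------------------------
# Editorial aside (not part of the vendored diff): the MMBTForClassification
# head in the preceding file switches between regression (MSELoss) when
# config.num_labels == 1 and classification (CrossEntropyLoss) otherwise.
# A minimal, runnable sketch of that switch on dummy tensors, using only plain
# PyTorch; `mmbt_style_loss` is an illustrative helper, not library API.
import torch
from torch.nn import CrossEntropyLoss, MSELoss

def mmbt_style_loss(logits: torch.Tensor, labels: torch.Tensor, num_labels: int) -> torch.Tensor:
    if num_labels == 1:
        # regression head: one output per example, mean-squared error
        return MSELoss()(logits.view(-1), labels.view(-1))
    # classification head: num_labels scores per example, cross-entropy
    return CrossEntropyLoss()(logits.view(-1, num_labels), labels.view(-1))

# classification: batch of 4 pooled outputs scored over 3 labels
print(mmbt_style_loss(torch.randn(4, 3), torch.tensor([0, 2, 1, 1]), num_labels=3))
# regression: batch of 4 scalar predictions against float targets
print(mmbt_style_loss(torch.randn(4, 1), torch.randn(4), num_labels=1))
# ---------------------------------------------------------------------------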
+from typing import TYPE_CHECKING + +from ....utils import _LazyModule +from ....utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_open_llama import * + from .modeling_open_llama import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deprecated/open_llama/configuration_open_llama.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deprecated/open_llama/configuration_open_llama.py new file mode 100644 index 0000000000000000000000000000000000000000..b4bc9cc72a7661fe571cce6649b40f2307d9b3ce --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deprecated/open_llama/configuration_open_llama.py @@ -0,0 +1,169 @@ +# coding=utf-8 +# Copyright 2023 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Open-Llama model configuration""" + +from ....configuration_utils import PretrainedConfig +from ....utils import logging + + +logger = logging.get_logger(__name__) + + +class OpenLlamaConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`OpenLlamaModel`]. It is used to instantiate an + Open-Llama model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the + [s-JoL/Open-Llama-V1](https://huggingface.co/s-JoL/Open-Llama-V1). + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the Open-Llama model. Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling [`OpenLlamaModel`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 11008): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. 
+ max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + tie_word_embeddings(`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + rope_scaling (`Dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling + strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is + `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update + `max_position_embeddings` to the expected new maximum. See the following thread for more information on how + these scaling strategies behave: + https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an + experimental feature, subject to breaking API changes in future versions. + + Example: + + ```python + >>> from transformers import OpenLlamaModel, OpenLlamaConfig + + >>> # Initializing a Open-Llama open_llama-7b style configuration + >>> configuration = OpenLlamaConfig() + + >>> # Initializing a model from the open_llama-7b style configuration + >>> model = OpenLlamaModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "open-llama" + + def __init__( + self, + vocab_size=100000, + hidden_size=4096, + intermediate_size=11008, + num_hidden_layers=32, + num_attention_heads=32, + hidden_act="silu", + max_position_embeddings=2048, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + tie_word_embeddings=False, + use_memory_efficient_attention=True, + hidden_dropout_prob=0.1, + attention_dropout_prob=0.1, + use_stable_embedding=True, + shared_input_output_embedding=True, + rope_theta=10000.0, + rope_scaling=None, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.use_memory_efficient_attention = kwargs.pop( + "use_memorry_efficient_attention", use_memory_efficient_attention + ) + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_dropout_prob = attention_dropout_prob + self.use_stable_embedding = use_stable_embedding + self.shared_input_output_embedding = shared_input_output_embedding + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self._rope_scaling_validation() + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + 
tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + def _rope_scaling_validation(self): + """ + Validate the `rope_scaling` configuration. + """ + if self.rope_scaling is None: + return + + if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2: + raise ValueError( + f"`rope_scaling` must be a dictionary with two fields, `type` and `factor`, got {self.rope_scaling}" + ) + rope_scaling_type = self.rope_scaling.get("type", None) + rope_scaling_factor = self.rope_scaling.get("factor", None) + if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]: + raise ValueError( + f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}" + ) + if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0: + raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}") + + +__all__ = ["OpenLlamaConfig"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deprecated/open_llama/modeling_open_llama.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deprecated/open_llama/modeling_open_llama.py new file mode 100644 index 0000000000000000000000000000000000000000..5e182e0f813f1c0c9ed138c39561cc23eff8cb26 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deprecated/open_llama/modeling_open_llama.py @@ -0,0 +1,940 @@ +# coding=utf-8 +# Copyright 2023 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
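# ---------------------------------------------------------------------------
# Editorial aside (not part of the vendored diff): the _rope_scaling_validation
# method in the configuration file above accepts exactly one shape of value, a
# two-field dict {"type": "linear" | "dynamic", "factor": <float greater than 1>}.
# A small runnable sketch of those rules; `validate_rope_scaling` is an
# illustrative stand-in mirroring that method, not library API.
def validate_rope_scaling(rope_scaling) -> None:
    if rope_scaling is None:
        return  # scaling disabled, nothing to check
    if not isinstance(rope_scaling, dict) or len(rope_scaling) != 2:
        raise ValueError("`rope_scaling` must be a dict with `type` and `factor`")
    if rope_scaling.get("type") not in ("linear", "dynamic"):
        raise ValueError("`type` must be 'linear' or 'dynamic'")
    factor = rope_scaling.get("factor")
    if not isinstance(factor, float) or factor <= 1.0:
        raise ValueError("`factor` must be a float greater than 1")

validate_rope_scaling(None)                                # default: no scaling
validate_rope_scaling({"type": "linear", "factor": 2.0})   # stretch positions 2x
validate_rope_scaling({"type": "dynamic", "factor": 4.0})  # dynamic NTK rescaling
# validate_rope_scaling({"type": "linear", "factor": 2})   # rejected: factor must be a float
# ---------------------------------------------------------------------------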
+"""PyTorch Open-Llama model.""" + +import math +from typing import Optional, Union + +import torch +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ....activations import ACT2FN +from ....cache_utils import Cache +from ....modeling_attn_mask_utils import _prepare_4d_causal_attention_mask +from ....modeling_layers import GradientCheckpointingLayer +from ....modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast +from ....modeling_utils import PreTrainedModel +from ....utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings +from ....utils.deprecation import deprecate_kwarg +from .configuration_open_llama import OpenLlamaConfig + + +logger = logging.get_logger(__name__) + +try: + from xformers import ops as xops +except ImportError: + xops = None + + +_CONFIG_FOR_DOC = "OpenLlamaConfig" + + +class OpenLlamaRMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + OpenLlamaRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + def extra_repr(self): + return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" + + +class OpenLlamaRotaryEmbedding(nn.Module): + inv_freq: torch.Tensor # fix linting for `register_buffer` + cos_cached: torch.Tensor + sin_cached: torch.Tensor + + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / ( + self.base + ** (torch.arange(0, self.dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / self.dim) + ) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + # Build here to make `torch.jit.trace` work. + self._set_cos_sin_cache( + seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() + ) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq) + + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) + + return ( + self.cos_cached[:seq_len].to(dtype=x.dtype), + self.sin_cached[:seq_len].to(dtype=x.dtype), + ) + + +class OpenLlamaLinearScalingRotaryEmbedding(OpenLlamaRotaryEmbedding): + """OpenLlamaRotaryEmbedding extended with linear scaling. 
Credits to the Reddit user /u/kaiokendev""" + + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): + self.scaling_factor = scaling_factor + super().__init__(dim, max_position_embeddings, base, device) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq) + t = t / self.scaling_factor + + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + +class OpenLlamaDynamicNTKScalingRotaryEmbedding(OpenLlamaRotaryEmbedding): + """OpenLlamaRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla""" + + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): + self.scaling_factor = scaling_factor + super().__init__(dim, max_position_embeddings, base, device) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + + if seq_len > self.max_position_embeddings: + base = self.base * ( + (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1) + ) ** (self.dim / (self.dim - 2)) + inv_freq = 1.0 / ( + base + ** (torch.arange(0, self.dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / self.dim) + ) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq) + + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`): + The position indices of the tokens corresponding to the query and key tensors. For example, this can be + used to pass offsetted position ids when working with a KV-cache. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. 
Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos[position_ids].unsqueeze(unsqueeze_dim) + sin = sin[position_ids].unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class OpenLlamaMLP(nn.Module): + def __init__( + self, + hidden_size: int, + intermediate_size: int, + hidden_act: str, + dropout_prob: float, + ): + super().__init__() + self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False) + self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False) + self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False) + self.act_fn = ACT2FN[hidden_act] + self.dropout = nn.Dropout(dropout_prob) + + def forward(self, x): + out = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + return self.dropout(out) + + +class OpenLlamaAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: OpenLlamaConfig): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.max_position_embeddings = config.max_position_embeddings + self.dropout_prob = config.attention_dropout_prob + self.rope_theta = config.rope_theta + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." + ) + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) + self.k_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) + self.v_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) + self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) + self._init_rope() + + def _init_rope(self): + if self.config.rope_scaling is None: + self.rotary_emb = OpenLlamaRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + else: + scaling_type = self.config.rope_scaling["type"] + scaling_factor = self.config.rope_scaling["factor"] + if scaling_type == "linear": + self.rotary_emb = OpenLlamaLinearScalingRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + scaling_factor=scaling_factor, + base=self.rope_theta, + ) + elif scaling_type == "dynamic": + self.rotary_emb = OpenLlamaDynamicNTKScalingRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + scaling_factor=scaling_factor, + base=self.rope_theta, + ) + else: + raise ValueError(f"Unknown RoPE scaling type {scaling_type}") + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + ) -> tuple[torch.Tensor, Optional[torch.Tensor], 
Optional[tuple[torch.Tensor]]]: + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_values is not None: + kv_seq_len += past_key_values[0].shape[-2] + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + # [bsz, nh, t, hd] + + if past_key_values is not None: + # reuse k, v, self_attention + key_states = torch.cat([past_key_values[0], key_states], dim=2) + value_states = torch.cat([past_key_values[1], value_states], dim=2) + + past_key_values = (key_states, value_states) if use_cache else None + + if self.config.use_memory_efficient_attention and xops is not None and self.training: + attn_weights = None + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + attn_output = xops.memory_efficient_attention( + query_states, key_states, value_states, attn_bias=xops.LowerTriangularMask(), p=self.dropout_prob + ) + else: + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + + if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights + attention_mask + attn_weights = torch.max( + attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min, device=attn_weights.device) + ) + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_values + + +class OpenLlamaDecoderLayer(GradientCheckpointingLayer): + def __init__(self, config: OpenLlamaConfig): + super().__init__() + self.hidden_size = config.hidden_size + self.self_attn = OpenLlamaAttention(config=config) + self.mlp = OpenLlamaMLP( + hidden_size=self.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + dropout_prob=config.hidden_dropout_prob, + ) + self.input_layernorm = OpenLlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = OpenLlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: 
Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_values (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +OPEN_LLAMA_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`OpenLlamaConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
+""" + + +@add_start_docstrings( + "The bare Open-Llama Model outputting raw hidden-states without any specific head on top.", + OPEN_LLAMA_START_DOCSTRING, +) +class OpenLlamaPreTrainedModel(PreTrainedModel): + config: OpenLlamaConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["OpenLlamaDecoderLayer"] + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + if self.config.use_stable_embedding: + torch.nn.init.xavier_normal_(module.weight.data) + else: + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +OPEN_LLAMA_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. 
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare Open-Llama Model outputting raw hidden-states without any specific head on top.", + OPEN_LLAMA_START_DOCSTRING, +) +class OpenLlamaModel(OpenLlamaPreTrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`OpenLlamaDecoderLayer`] + + Args: + config: OpenLlamaConfig + """ + + def __init__(self, config: OpenLlamaConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + if config.use_stable_embedding: + self.embed_layer_norm = nn.LayerNorm(config.hidden_size) + else: + self.embed_layer_norm = None + self.layers = nn.ModuleList([OpenLlamaDecoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.norm = OpenLlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(OPEN_LLAMA_INPUTS_DOCSTRING) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + seq_length_with_past = seq_length + past_key_values_length = 0 + + if self.gradient_checkpointing and 
self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + if past_key_values is not None: + past_key_values_length = past_key_values.get_seq_length() + seq_length_with_past = seq_length_with_past + past_key_values_length + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device + ) + position_ids = position_ids.unsqueeze(0) + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + if self.embed_layer_norm: + inputs_embeds = self.embed_layer_norm(inputs_embeds) + # embed positions + if self.config.use_memory_efficient_attention and self.training: + attention_mask = None + elif attention_mask is None: + attention_mask = torch.ones( + (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device + ) + + input_shape = (batch_size, seq_length) + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, input_shape, inputs_embeds, past_key_values_length + ) + + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = () if use_cache else None + + for idx, decoder_layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values[idx] if past_key_values is not None else None, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +class OpenLlamaForCausalLM(OpenLlamaPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.model = OpenLlamaModel(config) + if config.shared_input_output_embedding: + self.lm_head = None + else: + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(OPEN_LLAMA_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> 
Union[tuple, CausalLMOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, OpenLlamaForCausalLM + + >>> model = OpenLlamaForCausalLM.from_pretrained("openlm-research/open_llama_7b") + >>> tokenizer = AutoTokenizer.from_pretrained("openlm-research/open_llama_7b") + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + if self.config.shared_input_output_embedding: + logits = torch.einsum( + "blh,vh->blv", hidden_states.to(self.model.embed_tokens.weight.device), self.model.embed_tokens.weight + ) + else: + logits = self.lm_head(hidden_states) + + loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(logits.device) + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + ): + if past_key_values is not None: + past_length = past_key_values.get_seq_length() + + # Some generation methods already pass only the last input ID + if input_ids.shape[1] > past_length: + remove_prefix_length = past_length + else: + # Default to old behavior: keep only final ID + remove_prefix_length = input_ids.shape[1] - 1 + + input_ids = input_ids[:, remove_prefix_length:] + + position_ids = kwargs.get("position_ids") + if attention_mask is not None 
and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + +@add_start_docstrings( + """ + The LLaMa Model transformer with a sequence classification head on top (linear layer). + + [`OpenLlamaForSequenceClassification`] uses the last token in order to do the classification, as other causal + models (e.g. GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). + """, + OPEN_LLAMA_START_DOCSTRING, +) +class OpenLlamaForSequenceClassification(OpenLlamaPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = OpenLlamaModel(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(OPEN_LLAMA_INPUTS_DOCSTRING) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility + sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 + sequence_lengths = sequence_lengths % input_ids.shape[-1] + sequence_lengths = sequence_lengths.to(logits.device) + else: + sequence_lengths = -1 + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + +__all__ = ["OpenLlamaPreTrainedModel", "OpenLlamaModel", "OpenLlamaForCausalLM", "OpenLlamaForSequenceClassification"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deprecated/tapex/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deprecated/tapex/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b535eb1df2c40a7680bbf8d57fc70b78e23437f4 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deprecated/tapex/__init__.py @@ -0,0 +1,26 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ....utils import _LazyModule +from ....utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .tokenization_tapex import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deprecated/tapex/tokenization_tapex.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deprecated/tapex/tokenization_tapex.py new file mode 100644 index 0000000000000000000000000000000000000000..b32383ddd497828b6a429d39c5d4069a21154b9f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deprecated/tapex/tokenization_tapex.py @@ -0,0 +1,1470 @@ +# coding=utf-8 +# Copyright 2022 Microsoft Research and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for TAPEX.""" + +import json +import os +import random +from functools import lru_cache +from typing import Optional, Union + +import regex as re + +from ....file_utils import ExplicitEnum, PaddingStrategy, TensorType, add_end_docstrings, is_pandas_available +from ....tokenization_utils import AddedToken, PreTrainedTokenizer +from ....tokenization_utils_base import ENCODE_KWARGS_DOCSTRING, BatchEncoding, TextInput, TruncationStrategy +from ....utils import logging + + +if is_pandas_available(): + import pandas as pd + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt"} + + +class TapexTruncationStrategy(ExplicitEnum): + """ + Possible values for the `truncation` argument in [`~TapasTokenizer.__call__`]. Useful for tab-completion in an IDE. + """ + + DROP_ROWS_TO_FIT = "drop_rows_to_fit" + + +TAPEX_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r""" + add_special_tokens (`bool`, *optional*, defaults to `True`): + Whether or not to encode the sequences with the special tokens relative to their model. + padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `False`): + Activates and controls padding. Accepts the following values: + + - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single + sequence if provided). + - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum + acceptable input length for the model if that argument is not provided. 
+ - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different + lengths). + truncation (`bool`, `str`, [`TapexTruncationStrategy`] or [`~tokenization_utils_base.TruncationStrategy`], + *optional*, defaults to `False`): + + Activates and controls truncation. Accepts the following values: + + - `'drop_rows_to_fit'`: Truncate to a maximum length specified with the argument `max_length` or to the + maximum acceptable input length for the model if that argument is not provided. This will truncate + row by row, removing rows from the table. + - `True` or `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or + to the maximum acceptable input length for the model if that argument is not provided. This will + truncate token by token, removing a token from the longest sequence in the pair if a pair of + sequences (or a batch of pairs) is provided. + - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the + maximum acceptable input length for the model if that argument is not provided. This will only + truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided. + - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the + maximum acceptable input length for the model if that argument is not provided. This will only + truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided. + - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths + greater than the model maximum admissible input size). + max_length (`int`, *optional*): + Controls the maximum length to use by one of the truncation/padding parameters. If left unset or set to + `None`, this will use the predefined model maximum length if a maximum length is required by one of the + truncation/padding parameters. If the model has no specific maximum input length (like XLNet) + truncation/padding to a maximum length will be deactivated. + stride (`int`, *optional*, defaults to 0): + If set to a number along with `max_length`, the overflowing tokens returned when + `return_overflowing_tokens=True` will contain some tokens from the end of the truncated sequence + returned to provide some overlap between truncated and overflowing sequences. The value of this + argument defines the number of overlapping tokens. + pad_to_multiple_of (`int`, *optional*): + If set will pad the sequence to a multiple of the provided value. This is especially useful to enable + the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta). + return_tensors (`str` or [`~file_utils.TensorType`], *optional*): + If set, will return tensors instead of list of python integers. Acceptable values are: + + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return Numpy `np.ndarray` objects. +""" + + +@lru_cache +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control + characters the bpe code barfs on. The reversible bpe codes work on unicode strings. This means you need a large # + of unicode characters in your vocab if you want to avoid UNKs. When you're at something like a 10B token dataset + you end up needing around 5K for decent coverage. This is a significant percentage of your normal, say, 32K bpe + vocab. 
To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + """ + bs = ( + list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) + ) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + +def get_pairs(word): + """ + Return set of symbol pairs in a word. Word is represented as tuple of symbols (symbols being variable-length + strings). + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +class IndexedRowTableLinearize: + """ + FORMAT: col: col1 | col2 | col 3 row 1 : val1 | val2 | val3 row 2 : ... + """ + + def process_table(self, table_content: dict): + """ + Given a table, TableLinearize aims at converting it into a flatten sequence with special symbols. + """ + assert "header" in table_content and "rows" in table_content, self.PROMPT_MESSAGE + # process header + table_str = self.process_header(table_content["header"]) + " " + # process rows + for i, row_example in enumerate(table_content["rows"]): + # NOTE: the row should start from row 1 instead of 0 + table_str += self.process_row(row_example, row_index=i + 1) + " " + return table_str.strip() + + def process_header(self, headers: list): + """ + Given a list of headers, TableLinearize aims at converting it into a flatten sequence with special symbols. + """ + return "col : " + " | ".join(headers) + + def process_row(self, row: list, row_index: int): + """ + Given a row, TableLinearize aims at converting it into a flatten sequence with special symbols. + """ + row_str = "" + row_cell_values = [] + for cell_value in row: + if isinstance(cell_value, int): + row_cell_values.append(str(cell_value)) + else: + row_cell_values.append(cell_value) + row_str += " | ".join(row_cell_values) + return "row " + str(row_index) + " : " + row_str + + +class TapexTokenizer(PreTrainedTokenizer): + r""" + Construct a TAPEX tokenizer. Based on byte-level Byte-Pair-Encoding (BPE). + + This tokenizer can be used to flatten one or more table(s) and concatenate them with one or more related sentences + to be used by TAPEX models. The format that the TAPEX tokenizer creates is the following: + + sentence col: col1 | col2 | col 3 row 1 : val1 | val2 | val3 row 2 : ... + + The tokenizer supports a single table + single query, a single table and multiple queries (in which case the table + will be duplicated for every query), a single query and multiple tables (in which case the query will be duplicated + for every table), and multiple tables and queries. In other words, you can provide a batch of tables + questions to + the tokenizer for instance to prepare them for the model. + + Tokenization itself is based on the BPE algorithm. It is identical to the one used by BART, RoBERTa and GPT-2. + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + Path to the vocabulary file. + merges_file (`str`): + Path to the merges file. + do_lower_case (`bool`, *optional*, defaults to `True`): + Whether or not to lowercase the input when tokenizing. + errors (`str`, *optional*, defaults to `"replace"`): + Paradigm to follow when decoding bytes to UTF-8. 
See + [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information. + bos_token (`str`, *optional*, defaults to `""`): + The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. + + + + When building a sequence using special tokens, this is not the token that is used for the beginning of + sequence. The token used is the `cls_token`. + + + + eos_token (`str`, *optional*, defaults to `""`): + The end of sequence token. + + + + When building a sequence using special tokens, this is not the token that is used for the end of sequence. + The token used is the `sep_token`. + + + + sep_token (`str`, *optional*, defaults to `""`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + cls_token (`str`, *optional*, defaults to `""`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (`str`, *optional*, defaults to `""`): + The token used for padding, for example when batching sequences of different lengths. + mask_token (`str`, *optional*, defaults to `""`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + add_prefix_space (`bool`, *optional*, defaults to `False`): + Whether or not to add an initial space to the input. This allows to treat the leading word just as any + other word. (BART tokenizer detect beginning of words by the preceding space). + max_cell_length (`int`, *optional*, defaults to 15): + Maximum number of characters per cell when linearizing a table. If this number is exceeded, truncation + takes place. + """ + + vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "attention_mask"] + + def __init__( + self, + vocab_file, + merges_file, + do_lower_case=True, + errors="replace", + bos_token="", + eos_token="", + sep_token="", + cls_token="", + unk_token="", + pad_token="", + mask_token="", + add_prefix_space=False, + max_cell_length=15, + **kwargs, + ): + bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token + eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token + sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token + cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token + unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token + pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token + + # Mask token behave like a normal word, i.e. 
include the space before it + mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + + with open(vocab_file, encoding="utf-8") as vocab_handle: + self.encoder = json.load(vocab_handle) + self.decoder = {v: k for k, v in self.encoder.items()} + self.errors = errors # how to handle errors in decoding + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} + with open(merges_file, encoding="utf-8") as merges_handle: + bpe_merges = merges_handle.read().split("\n")[1:-1] + bpe_merges = [tuple(merge.split()) for merge in bpe_merges] + self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) + self.cache = {} + self.add_prefix_space = add_prefix_space + self.do_lower_case = do_lower_case + + # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions + self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") + + # additional properties + + super().__init__( + vocab_file=vocab_file, + merges_file=merges_file, + do_lower_case=do_lower_case, + errors=errors, + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + mask_token=mask_token, + add_prefix_space=add_prefix_space, + max_cell_length=max_cell_length, + **kwargs, + ) + + self.max_cell_length = max_cell_length + self.table_linearize = IndexedRowTableLinearize() + + def build_inputs_with_special_tokens( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A TAPEX sequence has the following format: + - single sequence: ` X ` + - pair of sequences: ` A B ` + + Args: + token_ids_0 (`list[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`list[int]`, *optional*): + Optional second list of IDs for sequence pairs. + Returns: + `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + sep + token_ids_1 + sep + + def get_special_tokens_mask( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False + ) -> list[int]: + """ + Args: + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + token_ids_0 (`list[int]`): + List of IDs. + token_ids_1 (`list[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + Returns: + `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. 
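+
+        For example (illustrative, derived from the return statements below): with `token_ids_0 = [8, 9]`,
+        `token_ids_1 = [10]` and `already_has_special_tokens=False`, the returned mask is `[1, 0, 0, 1, 1, 0, 1]`,
+        mirroring the `cls + A + sep + sep + B + sep` layout built by `build_inputs_with_special_tokens`.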
+ """ + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] + + def create_token_type_ids_from_sequences( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Args: + Create a mask from the two sequences passed to be used in a sequence-pair classification task. TAPEX does not: + make use of token type ids, therefore a list of zeros is returned. + token_ids_0 (`list[int]`): + List of IDs. + token_ids_1 (`list[int]`, *optional*): + Optional second list of IDs for sequence pairs. + Returns: + `list[int]`: List of zeros. + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] + + def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs): + add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space) + if (is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()): + text = " " + text + return (text, kwargs) + + @property + def vocab_size(self): + return len(self.encoder) + + def get_vocab(self): + return dict(self.encoder, **self.added_tokens_encoder) + + def bpe(self, token): + if token in self.cache: + return self.cache[token] + word = tuple(token) + pairs = get_pairs(word) + + if not pairs: + return token + + while True: + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + except ValueError: + new_word.extend(word[i:]) + break + else: + new_word.extend(word[i:j]) + i = j + + if word[i] == first and i < len(word) - 1 and word[i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = " ".join(word) + self.cache[token] = word + return word + + def _tokenize(self, text): + """Tokenize a string.""" + bpe_tokens = [] + for token in re.findall(self.pat, text): + token = "".join( + self.byte_encoder[b] for b in token.encode("utf-8") + ) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case) + bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" ")) + return bpe_tokens + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.encoder.get(token, self.encoder.get(self.unk_token)) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.decoder.get(index) + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + text = "".join(tokens) + text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors) + return text + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + vocab_file = os.path.join( + save_directory, 
(filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + merge_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"] + ) + + with open(vocab_file, "w", encoding="utf-8") as f: + f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n") + + index = 0 + with open(merge_file, "w", encoding="utf-8") as writer: + writer.write("#version: 0.2\n") + for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning( + f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive." + " Please check that the tokenizer is not corrupted!" + ) + index = token_index + writer.write(" ".join(bpe_tokens) + "\n") + index += 1 + + return vocab_file, merge_file + + @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, TAPEX_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING) + def __call__( + self, + table: Union["pd.DataFrame", list["pd.DataFrame"]] = None, + query: Optional[Union[TextInput, list[TextInput]]] = None, + answer: Optional[Union[str, list[str]]] = None, + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy] = None, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs, + ) -> BatchEncoding: + """ + Main method to tokenize and prepare for the model one or several table-sequence pair(s). + + Args: + table (`pd.DataFrame`, `list[pd.DataFrame]`): + Table(s) containing tabular data. + query (`str` or `list[str]`, *optional*): + Sentence or batch of sentences related to one or more table(s) to be encoded. Note that the number of + sentences must match the number of tables. + answer (`str` or `list[str]`, *optional*): + Optionally, the corresponding answer to the questions as supervision. 
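+
+        Example (illustrative sketch; assumes a TAPEX checkpoint such as `microsoft/tapex-base` is available and
+        pandas is installed):
+
+        ```python
+        >>> import pandas as pd
+        >>> from transformers import TapexTokenizer
+
+        >>> tokenizer = TapexTokenizer.from_pretrained("microsoft/tapex-base")
+        >>> table = pd.DataFrame({"year": [2008, 2012], "city": ["Beijing", "London"]})
+        >>> encoding = tokenizer(table=table, query="In which year did London host the games?")
+        >>> sorted(encoding.keys())
+        ['attention_mask', 'input_ids']
+        ```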
+ """ + + if table is not None: + return self.source_call_func( + table=table, + query=query, + answer=answer, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + **kwargs, + ) + elif answer is not None: + return self.target_call_func( + answer=answer, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + **kwargs, + ) + else: + raise ValueError("You need to provide either a `table` or an `answer`.") + + def source_call_func( + self, + table: Union["pd.DataFrame", list["pd.DataFrame"]], + query: Optional[Union[TextInput, list[TextInput]]] = None, + answer: Optional[Union[str, list[str]]] = None, + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy] = None, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs, + ) -> BatchEncoding: + # Input type checking for clearer error + valid_table = False + valid_query = False + + # Check that table have a valid type + if isinstance(table, pd.DataFrame): + valid_table = True + elif isinstance(table, (list, tuple)) and isinstance(table[0], pd.DataFrame): + valid_table = True + + # Check that query have a valid type + if query is None or isinstance(query, str): + valid_query = True + elif isinstance(query, (list, tuple)): + if len(query) == 0 or isinstance(query[0], str): + valid_query = True + + if not valid_table: + raise ValueError( + "table input must of type `pd.DataFrame` (single example), `list[pd.DataFrame]` (batch of examples). " + ) + if not valid_query: + raise ValueError("query input must of type `str` (single example), `list[str]` (batch of examples). 
") + is_batched = isinstance(table, (list, tuple)) or isinstance(query, (list, tuple)) + + if is_batched: + return self.batch_encode_plus( + table=table, + query=query, + answer=answer, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + **kwargs, + ) + else: + return self.encode_plus( + table=table, + query=query, + answer=answer, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + **kwargs, + ) + + @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, TAPEX_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING) + def batch_encode_plus( + self, + table: Union["pd.DataFrame", list["pd.DataFrame"]], + query: Optional[list[TextInput]] = None, + answer: Optional[list[str]] = None, + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Optional[Union[bool, str]] = None, + max_length: Optional[int] = None, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs, + ) -> BatchEncoding: + """ + + + This method is deprecated, `__call__` should be used instead. 
+ + + """ + # Backward compatibility for 'truncation_strategy', 'pad_to_max_length' + padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( + padding=padding, + truncation=truncation, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + verbose=verbose, + **kwargs, + ) + + return self._batch_encode_plus( + table=table, + query=query, + answer=answer, + add_special_tokens=add_special_tokens, + padding_strategy=padding_strategy, + truncation_strategy=truncation_strategy, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + **kwargs, + ) + + def _batch_encode_plus( + self, + table: Union["pd.DataFrame", list["pd.DataFrame"]], + query: Optional[list[TextInput]] = None, + answer: Optional[list[str]] = None, + add_special_tokens: bool = True, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs, + ) -> BatchEncoding: + if return_offsets_mapping: + raise NotImplementedError( + "return_offset_mapping is not available when using Python tokenizers. " + "To use this feature, change your tokenizer to one deriving from " + "transformers.PreTrainedTokenizerFast." 
+ ) + + if isinstance(table, pd.DataFrame) and isinstance(query, (list, tuple)): + # single table, many queries case + # duplicate table for every query + table = [table] * len(query) + if isinstance(table, (list, tuple)) and isinstance(query, str): + # many tables, single query case + # duplicate query for every table + query = [query] * len(table) + + batch_outputs = self._batch_prepare_for_model( + table=table, + query=query, + answer=answer, + add_special_tokens=add_special_tokens, + padding_strategy=padding_strategy, + truncation_strategy=truncation_strategy, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + return_attention_mask=return_attention_mask, + return_token_type_ids=return_token_type_ids, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_length=return_length, + return_tensors=return_tensors, + verbose=verbose, + ) + + return BatchEncoding(batch_outputs) + + @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, TAPEX_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING) + def _batch_prepare_for_model( + self, + table: Union["pd.DataFrame", list["pd.DataFrame"]], + query: Optional[Union[TextInput, list[TextInput]]] = None, + answer: Optional[Union[str, list[str]]] = None, + add_special_tokens: bool = True, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[str] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_length: bool = False, + verbose: bool = True, + ) -> BatchEncoding: + """ + This method adds special tokens, truncates sequences if overflowing while taking into account the special + tokens and manages a moving window (with user defined stride) for overflowing tokens. 
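+
+        Note (summary of the loop below): each example is first encoded with `PaddingStrategy.DO_NOT_PAD`, and the
+        whole batch is padded once in a single `self.pad(...)` call at the end before being wrapped in a
+        `BatchEncoding`.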
+ """ + batch_outputs = {} + if answer is None: + answer = [None] * len(table) + for _table, _query, _answer in zip(table, query, answer): + text = self.prepare_table_query( + _table, _query, _answer, truncation_strategy=truncation_strategy, max_length=max_length + ) + + if self.do_lower_case: + text = text.lower() + + tokens = self.tokenize(text) + outputs = self.prepare_for_model( + ids=self.convert_tokens_to_ids(tokens), + add_special_tokens=add_special_tokens, + padding=PaddingStrategy.DO_NOT_PAD.value, # we pad in batch afterwards + truncation=truncation_strategy.value, + max_length=max_length, + stride=stride, + pad_to_multiple_of=None, # we pad in batch afterwards + return_attention_mask=False, # we pad in batch afterwards + return_token_type_ids=return_token_type_ids, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_length=return_length, + return_tensors=None, # We convert the whole batch to tensors at the end + prepend_batch_axis=False, + verbose=verbose, + ) + + for key, value in outputs.items(): + if key not in batch_outputs: + batch_outputs[key] = [] + batch_outputs[key].append(value) + + batch_outputs = self.pad( + batch_outputs, + padding=padding_strategy.value, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + return_attention_mask=return_attention_mask, + ) + + batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors) + + return batch_outputs + + @add_end_docstrings(ENCODE_KWARGS_DOCSTRING) + def encode( + self, + table: "pd.DataFrame", + query: Optional[TextInput] = None, + answer: Optional[str] = None, + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy, TapexTruncationStrategy] = None, + max_length: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + **kwargs, + ) -> list[int]: + """ + Prepare a table, a string and possible answer for the model. This method does not return token type IDs, + attention masks, etc. which are necessary for the model to work correctly. Use this method if you want to build + your processing on your own, otherwise refer to `__call__`. 
+ """ + encoded_inputs = self.encode_plus( + table, + query=query, + answer=answer, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + return_tensors=return_tensors, + **kwargs, + ) + + return encoded_inputs["input_ids"] + + @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, TAPEX_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING) + def encode_plus( + self, + table: "pd.DataFrame", + query: Optional[TextInput] = None, + answer: Optional[str] = None, + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Optional[Union[bool, str]] = None, + max_length: Optional[int] = None, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs, + ) -> BatchEncoding: + # Backward compatibility for 'truncation_strategy', 'pad_to_max_length' + padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( + padding=padding, + truncation=truncation, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + verbose=verbose, + **kwargs, + ) + + return self._encode_plus( + table=table, + query=query, + answer=answer, + add_special_tokens=add_special_tokens, + padding_strategy=padding_strategy, + truncation_strategy=truncation_strategy, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + **kwargs, + ) + + def _encode_plus( + self, + table: "pd.DataFrame", + query: Optional[TextInput] = None, + answer: Optional[str] = None, + add_special_tokens: bool = True, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs, + ) -> BatchEncoding: + if return_offsets_mapping: + raise NotImplementedError( + "return_offset_mapping is not available when using Python tokenizers. " + "To use this feature, change your tokenizer to one deriving from " + "transformers.PreTrainedTokenizerFast. 
" + "More information on available tokenizers at " + "https://github.com/huggingface/transformers/pull/2674" + ) + + text = self.prepare_table_query( + table, query, answer, truncation_strategy=truncation_strategy, max_length=max_length + ) + + # if necessary, perform lower case + if self.do_lower_case: + text = text.lower() + + tokens = self.tokenize(text) + + return self.prepare_for_model( + ids=self.convert_tokens_to_ids(tokens), + add_special_tokens=add_special_tokens, + padding=padding_strategy.value, + truncation=truncation_strategy.value, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + prepend_batch_axis=True, + return_attention_mask=return_attention_mask, + return_token_type_ids=return_token_type_ids, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_length=return_length, + verbose=verbose, + ) + + def target_call_func( + self, + answer: Union[str, list[str]], + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy] = None, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs, + ) -> BatchEncoding: + """ + The method tokenizes and prepares the answer label for the model. + + Args: + answer (`str` or `list[str]`): + Corresponding answer supervision to the queries for training the model. 
+ """ + is_batched = isinstance(answer, (list, tuple)) + + if is_batched: + return self.target_batch_encode_plus( + answer=answer, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + **kwargs, + ) + else: + return self.target_encode_plus( + answer=answer, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + **kwargs, + ) + + def target_batch_encode_plus( + self, + answer: list[str], + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Optional[Union[bool, str]] = None, + max_length: Optional[int] = None, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs, + ) -> BatchEncoding: + """ + Prepare answer strings for the model. + + Args: + answer `list[str]`: + Corresponding answer supervision to the queries for training the model. 
+ """ + # Backward compatibility for 'truncation_strategy', 'pad_to_max_length' + padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( + padding=padding, + truncation=truncation, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + verbose=verbose, + **kwargs, + ) + + return self._target_batch_encode_plus( + answer=answer, + add_special_tokens=add_special_tokens, + padding_strategy=padding_strategy, + truncation_strategy=truncation_strategy, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + **kwargs, + ) + + def _target_batch_encode_plus( + self, + answer: list[str], + add_special_tokens: bool = True, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs, + ) -> BatchEncoding: + batch_outputs = {} + for text in answer: + if self.do_lower_case: + text = text.lower() + + tokens = self.tokenize(text) + outputs = self.prepare_for_model( + ids=self.convert_tokens_to_ids(tokens), + add_special_tokens=add_special_tokens, + padding=PaddingStrategy.DO_NOT_PAD.value, # we pad in batch afterwards + truncation=truncation_strategy.value, + max_length=max_length, + stride=stride, + pad_to_multiple_of=None, # we pad in batch afterwards + return_attention_mask=False, # we pad in batch afterwards + return_token_type_ids=return_token_type_ids, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_length=return_length, + return_tensors=None, # We convert the whole batch to tensors at the end + prepend_batch_axis=False, + verbose=verbose, + ) + + for key, value in outputs.items(): + if key not in batch_outputs: + batch_outputs[key] = [] + batch_outputs[key].append(value) + + batch_outputs = self.pad( + batch_outputs, + padding=padding_strategy.value, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + return_attention_mask=return_attention_mask, + ) + + batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors) + + return BatchEncoding(batch_outputs) + + def target_encode( + self, + answer: str, + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy, TapexTruncationStrategy] = None, + max_length: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + **kwargs, + ) -> list[int]: + """ + Prepare the answer string for the model. This method does not return token type IDs, attention masks, etc. + which are necessary for the model to work correctly. Use this method if you want to build your processing on + your own, otherwise refer to `__call__`. 
+ + Args: + answer `str`: + Corresponding answer supervision to the queries for training the model + """ + encoded_outputs = self.target_encode_plus( + answer=answer, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + return_tensors=return_tensors, + **kwargs, + ) + + return encoded_outputs["input_ids"] + + def target_encode_plus( + self, + answer: str, + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Optional[Union[bool, str]] = None, + max_length: Optional[int] = None, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs, + ) -> BatchEncoding: + """ + Prepare a answer string for the model. + + Args: + answer `str`: + Corresponding answer supervision to the queries for training the model. + """ + # Backward compatibility for 'truncation_strategy', 'pad_to_max_length' + padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( + padding=padding, + truncation=truncation, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + verbose=verbose, + **kwargs, + ) + + return self._target_encode_plus( + answer=answer, + add_special_tokens=add_special_tokens, + padding_strategy=padding_strategy, + truncation_strategy=truncation_strategy, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + **kwargs, + ) + + def _target_encode_plus( + self, + answer: str, + add_special_tokens: bool = True, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs, + ) -> BatchEncoding: + if return_offsets_mapping: + raise NotImplementedError( + "return_offset_mapping is not available when using Python tokenizers. " + "To use this feature, change your tokenizer to one deriving from " + "transformers.PreTrainedTokenizerFast. 
" + "More information on available tokenizers at " + "https://github.com/huggingface/transformers/pull/2674" + ) + + text = answer + + # if necessary, perform lower case + if self.do_lower_case: + text = text.lower() + + tokens = self.tokenize(text) + + return self.prepare_for_model( + ids=self.convert_tokens_to_ids(tokens), + add_special_tokens=add_special_tokens, + padding=padding_strategy.value, + truncation=truncation_strategy.value, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + prepend_batch_axis=True, + return_attention_mask=return_attention_mask, + return_token_type_ids=return_token_type_ids, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_length=return_length, + verbose=verbose, + ) + + def prepare_table_query( + self, + table, + query, + answer=None, + truncation_strategy=Union[str, TruncationStrategy, TapexTruncationStrategy], + max_length=None, + ): + """ + This method can be used to linearize a table and add a corresponding query. + + Optionally, it also handles truncation of the table (cells). + + An answer can be provided for more precise truncation. + """ + if not table.empty: + # step 1: create table dictionary + table_content = {"header": list(table.columns), "rows": [list(row.values) for i, row in table.iterrows()]} + + # step 2: modify table internally + # always truncate table cells based on self.max_cell_length + # optionally truncate rows if truncation_strategy is set to it + self.truncate_table_cells(table_content, query, answer) + if truncation_strategy == TapexTruncationStrategy.DROP_ROWS_TO_FIT: + self.truncate_table_rows(table_content, query, answer, max_length=max_length) + + # step 3: linearize table + linear_table = self.table_linearize.process_table(table_content) + else: + linear_table = "" + + if linear_table == "": + logger.warning( + "You provide an empty table, or all cells contain much tokens (e.g., >= 1024 tokens). " + + f"Please carefully check the corresponding table with the query : {query}." + ) + if query == "": + logger.warning("You provide nothing to query with respect to the table.") + # step 4: concatenate query with linear_table + separator = " " if query and linear_table else "" + joint_input = (query + separator + linear_table) if query else linear_table + + return joint_input + + def truncate_table_cells(self, table_content: dict, question: str, answer: list): + # TODO (Qian): is it possible to revert the original cell if it is in the final answer? 
+ cell_mapping = {} + for row in table_content["rows"]: + for i, cell in enumerate(row): + truncate_cell = self.truncate_cell(cell) + if truncate_cell is not None: + cell_mapping[cell] = truncate_cell + row[i] = truncate_cell + + # modify the answer list + if answer is not None: + for i, case in enumerate(answer): + if case in cell_mapping: + answer[i] = cell_mapping[case] + + def truncate_cell(self, cell_value): + # do not process on these cases + if isinstance(cell_value, (int, float)): + return cell_value + if cell_value.strip() != "": + try_tokens = self.tokenize(cell_value) + if len(try_tokens) >= self.max_cell_length: + retain_tokens = try_tokens[: self.max_cell_length] + retain_cell_value = self.convert_tokens_to_string(retain_tokens) + return retain_cell_value + else: + return None + else: + return cell_value + + def truncate_table_rows( + self, table_content: dict, question: str, answer: Optional[Union[str, list[str]]] = None, max_length=None + ): + """ + Args: + table_content: + {"header": xxx, "rows": xxx, "id" (Optionally): xxx} + + question: + natural language sentence + + answer: + if for training, is the supervision; otherwise will be empty + """ + delete_ratio, remain_token_len = self.estimate_delete_ratio(table_content, question, max_length) + # randomly delete unrelated rows + self.delete_unrelated_rows(table_content, question, answer, delete_ratio) + # guarantee the result < max_length + maximum_keep_rows = 0 + for ind, row_example in enumerate(table_content["rows"]): + value_string = self.table_linearize.process_row(row_example, ind + 1) + value_token_len = len(self.tokenize(value_string)) + # over the size limit, and take action + if value_token_len > remain_token_len: + break + remain_token_len -= value_token_len + maximum_keep_rows += 1 + del table_content["rows"][maximum_keep_rows:] + + def estimate_delete_ratio(self, table_content: dict, question: str, max_length=None): + if "header" not in table_content or "rows" not in table_content: + raise ValueError("The table content should contain both 'header' and 'rows' keys.") + # calculate the tokens of header, special tokens will only be pre-prepended into question + question_tokens = self.tokenize(question, add_special_tokens=True) + # calculate the tokens of header + header_string = self.table_linearize.process_header(table_content["header"]) + header_tokens = self.tokenize(header_string, add_special_tokens=False) + # split all cell values into tokens and see how many can be accommodated + used_token_len = len(question_tokens) + len(header_tokens) + # remaining token space for rows + remain_token_len = max_length - used_token_len + + value_string = "" + for _, row_example in enumerate(table_content["rows"]): + # use a general index to roughly estimate the overall token len + value_string += self.table_linearize.process_row(row_example, 100) + " " + value_token_len = len(self.tokenize(value_string)) + + if value_token_len < remain_token_len: + # no row will be deleted + return 0.0, remain_token_len + else: + # calc a roughly delete rate + return 1.0 - remain_token_len / value_token_len, remain_token_len + + def delete_unrelated_rows(self, table_content: dict, question: str, answer: list, delete_ratio: float): + """ + The argument answer is used only during training. 
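+
+        Sketch of the heuristic implemented below: rows whose lowercased cells share no token with the answer set or
+        the question words become deletion candidates, rows that do match also protect a window of two neighbouring
+        rows on each side, and up to `int(len(rows) * delete_ratio)` of the remaining candidates are dropped at
+        random.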
+ """ + truncated_unrelated_indices = [] + related_indices = [] + if answer is None or len(answer) == 0: + answer_set = set() + else: + answer_set = {ans_ex.lower() for ans_ex in answer} + # add question key words into answer set + if question is not None: + answer_set.update(question.split()) + question_set = set(question.strip("?!.,").split(" ")) + row_max_len = len(table_content["rows"]) + for _row_idx, row in enumerate(table_content["rows"]): + lower_row = {str(cell).lower() for cell in row} + if len(lower_row & answer_set) == 0 and len(lower_row & question_set) == 0: + truncated_unrelated_indices.append(_row_idx) + else: + # add neighbours to preserve information aggressively + related_indices.extend([_row_idx - 2, _row_idx - 1, _row_idx, _row_idx + 1, _row_idx + 2]) + + # remove the neighbours + truncated_unrelated_indices = [ + _row_idx for _row_idx in truncated_unrelated_indices if _row_idx not in related_indices + ] + # select some cases to drop + drop_items = min(len(truncated_unrelated_indices), int(len(table_content["rows"]) * delete_ratio)) + drop_row_indices = random.choices(truncated_unrelated_indices, k=drop_items) + + for _row_idx in reversed(range(row_max_len)): + if _row_idx in drop_row_indices: + del table_content["rows"][_row_idx] + + # only when the drop ratio is too large, logging for warning. + if "id" in table_content and len(drop_row_indices) > 0: + logger.warning("Delete {:.2f} rows in table {}".format(len(drop_row_indices), table_content["id"])) + + +__all__ = ["TapexTokenizer"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deprecated/vit_hybrid/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deprecated/vit_hybrid/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f5bd93aa4dabea9b2adbc54eeeb3664de589f43a --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/deprecated/vit_hybrid/__init__.py @@ -0,0 +1,28 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ....utils import _LazyModule +from ....utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_vit_hybrid import * + from .image_processing_vit_hybrid import * + from .modeling_vit_hybrid import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/depth_anything/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/depth_anything/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7425e37e0399c792155a88488045176fb3b5e7a5 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/depth_anything/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_depth_anything import * + from .modeling_depth_anything import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/depth_anything/configuration_depth_anything.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/depth_anything/configuration_depth_anything.py new file mode 100644 index 0000000000000000000000000000000000000000..65884fe67c3cc85afe5b474eb5756665b8b20e11 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/depth_anything/configuration_depth_anything.py @@ -0,0 +1,176 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""DepthAnything model configuration""" + +import copy + +from ...configuration_utils import PretrainedConfig +from ...utils import logging +from ...utils.backbone_utils import verify_backbone_config_arguments +from ..auto.configuration_auto import CONFIG_MAPPING + + +logger = logging.get_logger(__name__) + + +class DepthAnythingConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`DepthAnythingModel`]. It is used to instantiate a DepthAnything + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the DepthAnything + [LiheYoung/depth-anything-small-hf](https://huggingface.co/LiheYoung/depth-anything-small-hf) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + backbone_config (`Union[dict[str, Any], PretrainedConfig]`, *optional*): + The configuration of the backbone model. Only used in case `is_hybrid` is `True` or in case you want to + leverage the [`AutoBackbone`] API. + backbone (`str`, *optional*): + Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this + will load the corresponding pretrained weights from the timm or transformers library. 
If `use_pretrained_backbone` + is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. + use_pretrained_backbone (`bool`, *optional*, defaults to `False`): + Whether to use pretrained weights for the backbone. + use_timm_backbone (`bool`, *optional*, defaults to `False`): + Whether or not to use the `timm` library for the backbone. If set to `False`, will use the [`AutoBackbone`] + API. + backbone_kwargs (`dict`, *optional*): + Keyword arguments to be passed to AutoBackbone when loading from a checkpoint + e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set. + patch_size (`int`, *optional*, defaults to 14): + The size of the patches to extract from the backbone features. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + reassemble_hidden_size (`int`, *optional*, defaults to 384): + The number of input channels of the reassemble layers. + reassemble_factors (`list[int]`, *optional*, defaults to `[4, 2, 1, 0.5]`): + The up/downsampling factors of the reassemble layers. + neck_hidden_sizes (`list[str]`, *optional*, defaults to `[48, 96, 192, 384]`): + The hidden sizes to project to for the feature maps of the backbone. + fusion_hidden_size (`int`, *optional*, defaults to 64): + The number of channels before fusion. + head_in_index (`int`, *optional*, defaults to -1): + The index of the features to use in the depth estimation head. + head_hidden_size (`int`, *optional*, defaults to 32): + The number of output channels in the second convolution of the depth estimation head. + depth_estimation_type (`str`, *optional*, defaults to `"relative"`): + The type of depth estimation to use. Can be one of `["relative", "metric"]`. + max_depth (`float`, *optional*): + The maximum depth to use for the "metric" depth estimation head. 20 should be used for indoor models + and 80 for outdoor models. For "relative" depth estimation, this value is ignored. + + Example: + + ```python + >>> from transformers import DepthAnythingConfig, DepthAnythingForDepthEstimation + + >>> # Initializing a DepthAnything small style configuration + >>> configuration = DepthAnythingConfig() + + >>> # Initializing a model from the DepthAnything small style configuration + >>> model = DepthAnythingForDepthEstimation(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "depth_anything" + + def __init__( + self, + backbone_config=None, + backbone=None, + use_pretrained_backbone=False, + use_timm_backbone=False, + backbone_kwargs=None, + patch_size=14, + initializer_range=0.02, + reassemble_hidden_size=384, + reassemble_factors=[4, 2, 1, 0.5], + neck_hidden_sizes=[48, 96, 192, 384], + fusion_hidden_size=64, + head_in_index=-1, + head_hidden_size=32, + depth_estimation_type="relative", + max_depth=None, + **kwargs, + ): + super().__init__(**kwargs) + if backbone_config is None and backbone is None: + logger.info("`backbone_config` is `None`. 
Initializing the config with the default `Dinov2` backbone.") + backbone_config = CONFIG_MAPPING["dinov2"]( + image_size=518, + hidden_size=384, + num_attention_heads=6, + out_indices=[9, 10, 11, 12], + apply_layernorm=True, + reshape_hidden_states=False, + ) + elif isinstance(backbone_config, dict): + backbone_model_type = backbone_config.get("model_type") + config_class = CONFIG_MAPPING[backbone_model_type] + backbone_config = config_class.from_dict(backbone_config) + + verify_backbone_config_arguments( + use_timm_backbone=use_timm_backbone, + use_pretrained_backbone=use_pretrained_backbone, + backbone=backbone, + backbone_config=backbone_config, + backbone_kwargs=backbone_kwargs, + ) + + self.backbone_config = backbone_config + self.backbone = backbone + self.use_pretrained_backbone = use_pretrained_backbone + self.use_timm_backbone = use_timm_backbone + self.backbone_kwargs = backbone_kwargs + self.reassemble_hidden_size = reassemble_hidden_size + self.patch_size = patch_size + self.initializer_range = initializer_range + self.reassemble_factors = reassemble_factors + self.neck_hidden_sizes = neck_hidden_sizes + self.fusion_hidden_size = fusion_hidden_size + self.head_in_index = head_in_index + self.head_hidden_size = head_hidden_size + if depth_estimation_type not in ["relative", "metric"]: + raise ValueError("depth_estimation_type must be one of ['relative', 'metric']") + self.depth_estimation_type = depth_estimation_type + self.max_depth = max_depth if max_depth else 1 + + @property + def sub_configs(self): + return ( + {"backbone_config": type(self.backbone_config)} + if getattr(self, "backbone_config", None) is not None + else {} + ) + + def to_dict(self): + """ + Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`]. Returns: + `dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, + """ + output = copy.deepcopy(self.__dict__) + + if output["backbone_config"] is not None: + output["backbone_config"] = self.backbone_config.to_dict() + + output["model_type"] = self.__class__.model_type + return output + + +__all__ = ["DepthAnythingConfig"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/depth_anything/modeling_depth_anything.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/depth_anything/modeling_depth_anything.py new file mode 100644 index 0000000000000000000000000000000000000000..bc7d741312041fede83c34467f73282b44ab5430 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/depth_anything/modeling_depth_anything.py @@ -0,0 +1,427 @@ +# coding=utf-8 +# Copyright 2024 TikTok and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
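+# Editor's overview (illustrative summary of the classes defined in this file, not upstream documentation):
+# backbone hidden states are reshaped into image-like feature maps by DepthAnythingReassembleStage, merged by a
+# DPT-style DepthAnythingFeatureFusionStage, and then consumed by a depth estimation head defined later in the file.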
+"""PyTorch Depth Anything model.""" + +from typing import Optional, Union + +import torch +from torch import nn + +from ...modeling_outputs import DepthEstimatorOutput +from ...modeling_utils import PreTrainedModel +from ...utils import auto_docstring, logging +from ...utils.backbone_utils import load_backbone +from .configuration_depth_anything import DepthAnythingConfig + + +logger = logging.get_logger(__name__) + +# General docstring + + +class DepthAnythingReassembleLayer(nn.Module): + def __init__(self, config, channels, factor): + super().__init__() + self.projection = nn.Conv2d(in_channels=config.reassemble_hidden_size, out_channels=channels, kernel_size=1) + + # up/down sampling depending on factor + if factor > 1: + self.resize = nn.ConvTranspose2d(channels, channels, kernel_size=factor, stride=factor, padding=0) + elif factor == 1: + self.resize = nn.Identity() + elif factor < 1: + # so should downsample + self.resize = nn.Conv2d(channels, channels, kernel_size=3, stride=int(1 / factor), padding=1) + + # Copied from transformers.models.dpt.modeling_dpt.DPTReassembleLayer.forward + def forward(self, hidden_state): + hidden_state = self.projection(hidden_state) + hidden_state = self.resize(hidden_state) + + return hidden_state + + +class DepthAnythingReassembleStage(nn.Module): + """ + This class reassembles the hidden states of the backbone into image-like feature representations at various + resolutions. + + This happens in 3 stages: + 1. Take the patch embeddings and reshape them to image-like feature representations. + 2. Project the channel dimension of the hidden states according to `config.neck_hidden_sizes`. + 3. Resizing the spatial dimensions (height, width). + + Args: + config (`[DepthAnythingConfig]`): + Model configuration class defining the model architecture. + """ + + def __init__(self, config): + super().__init__() + + self.config = config + self.layers = nn.ModuleList() + for channels, factor in zip(config.neck_hidden_sizes, config.reassemble_factors): + self.layers.append(DepthAnythingReassembleLayer(config, channels=channels, factor=factor)) + + def forward(self, hidden_states: list[torch.Tensor], patch_height=None, patch_width=None) -> list[torch.Tensor]: + """ + Args: + hidden_states (`list[torch.FloatTensor]`, each of shape `(batch_size, sequence_length + 1, hidden_size)`): + List of hidden states from the backbone. + """ + out = [] + + for i, hidden_state in enumerate(hidden_states): + # reshape to (batch_size, num_channels, height, width) + hidden_state = hidden_state[:, 1:] + batch_size, _, num_channels = hidden_state.shape + hidden_state = hidden_state.reshape(batch_size, patch_height, patch_width, num_channels) + hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous() + hidden_state = self.layers[i](hidden_state) + out.append(hidden_state) + + return out + + +class DepthAnythingPreActResidualLayer(nn.Module): + """ + ResidualConvUnit, pre-activate residual unit. + + Args: + config (`[DepthAnythingConfig]`): + Model configuration class defining the model architecture. 
+ """ + + def __init__(self, config): + super().__init__() + + self.activation1 = nn.ReLU() + self.convolution1 = nn.Conv2d( + config.fusion_hidden_size, + config.fusion_hidden_size, + kernel_size=3, + stride=1, + padding=1, + bias=True, + ) + + self.activation2 = nn.ReLU() + self.convolution2 = nn.Conv2d( + config.fusion_hidden_size, + config.fusion_hidden_size, + kernel_size=3, + stride=1, + padding=1, + bias=True, + ) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + residual = hidden_state + hidden_state = self.activation1(hidden_state) + hidden_state = self.convolution1(hidden_state) + hidden_state = self.activation2(hidden_state) + hidden_state = self.convolution2(hidden_state) + + return hidden_state + residual + + +class DepthAnythingFeatureFusionLayer(nn.Module): + """Feature fusion layer, merges feature maps from different stages. + + Args: + config (`[DepthAnythingConfig]`): + Model configuration class defining the model architecture. + """ + + def __init__(self, config): + super().__init__() + + self.projection = nn.Conv2d(config.fusion_hidden_size, config.fusion_hidden_size, kernel_size=1, bias=True) + + self.residual_layer1 = DepthAnythingPreActResidualLayer(config) + self.residual_layer2 = DepthAnythingPreActResidualLayer(config) + + def forward(self, hidden_state, residual=None, size=None): + if residual is not None: + if hidden_state.shape != residual.shape: + residual = nn.functional.interpolate( + residual, size=(hidden_state.shape[2], hidden_state.shape[3]), mode="bilinear", align_corners=False + ) + hidden_state = hidden_state + self.residual_layer1(residual) + + hidden_state = self.residual_layer2(hidden_state) + + modifier = {"scale_factor": 2} if size is None else {"size": size} + + hidden_state = nn.functional.interpolate( + hidden_state, + **modifier, + mode="bilinear", + align_corners=True, + ) + hidden_state = self.projection(hidden_state) + + return hidden_state + + +class DepthAnythingFeatureFusionStage(nn.Module): + # Copied from transformers.models.dpt.modeling_dpt.DPTFeatureFusionStage.__init__ with DPT->DepthAnything + def __init__(self, config: DepthAnythingConfig): + super().__init__() + self.layers = nn.ModuleList() + for _ in range(len(config.neck_hidden_sizes)): + self.layers.append(DepthAnythingFeatureFusionLayer(config)) + + def forward(self, hidden_states, size=None): + # reversing the hidden_states, we start from the last + hidden_states = hidden_states[::-1] + + fused_hidden_states = [] + fused_hidden_state = None + + for idx, (hidden_state, layer) in enumerate(zip(hidden_states, self.layers)): + size = hidden_states[idx + 1].shape[2:] if idx != (len(hidden_states) - 1) else None + + if fused_hidden_state is None: + # first layer only uses the last hidden_state + fused_hidden_state = layer(hidden_state, size=size) + else: + fused_hidden_state = layer(fused_hidden_state, hidden_state, size=size) + + fused_hidden_states.append(fused_hidden_state) + + return fused_hidden_states + + +# Modified from transformers.models.dpt.modeling_dpt.DPTPreTrainedModel with DPT->DepthAnything,dpt->depth_anything +# avoiding sdpa and flash_attn_2 support, it's done in the backend +@auto_docstring +class DepthAnythingPreTrainedModel(PreTrainedModel): + config: DepthAnythingConfig + base_model_prefix = "depth_anything" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d, nn.ConvTranspose2d)): + # Slightly 
different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +class DepthAnythingNeck(nn.Module): + """ + DepthAnythingNeck. A neck is a module that is normally used between the backbone and the head. It takes a list of tensors as + input and produces another list of tensors as output. For DepthAnything, it includes 2 stages: + + * DepthAnythingReassembleStage + * DepthAnythingFeatureFusionStage. + + Args: + config (dict): config dict. + """ + + def __init__(self, config): + super().__init__() + self.config = config + + self.reassemble_stage = DepthAnythingReassembleStage(config) + + self.convs = nn.ModuleList() + for channel in config.neck_hidden_sizes: + self.convs.append(nn.Conv2d(channel, config.fusion_hidden_size, kernel_size=3, padding=1, bias=False)) + + # fusion + self.fusion_stage = DepthAnythingFeatureFusionStage(config) + + def forward(self, hidden_states: list[torch.Tensor], patch_height=None, patch_width=None) -> list[torch.Tensor]: + """ + Args: + hidden_states (`list[torch.FloatTensor]`, each of shape `(batch_size, sequence_length, hidden_size)` or `(batch_size, hidden_size, height, width)`): + List of hidden states from the backbone. + """ + if not isinstance(hidden_states, (tuple, list)): + raise TypeError("hidden_states should be a tuple or list of tensors") + + if len(hidden_states) != len(self.config.neck_hidden_sizes): + raise ValueError("The number of hidden states should be equal to the number of neck hidden sizes.") + + # postprocess hidden states + hidden_states = self.reassemble_stage(hidden_states, patch_height, patch_width) + + features = [self.convs[i](feature) for i, feature in enumerate(hidden_states)] + + # fusion blocks + output = self.fusion_stage(features) + + return output + + +class DepthAnythingDepthEstimationHead(nn.Module): + """ + Output head consisting of 3 convolutional layers. It progressively halves the feature dimension and upsamples + the predictions to the input resolution after the first convolutional layer (details can be found in the DPT paper's + supplementary material). The final activation function is either ReLU or Sigmoid, depending on the depth estimation + type (relative or metric). For metric depth estimation, the output is scaled by the maximum depth used during pretraining. 
+ """ + + def __init__(self, config): + super().__init__() + + self.head_in_index = config.head_in_index + self.patch_size = config.patch_size + + features = config.fusion_hidden_size + self.conv1 = nn.Conv2d(features, features // 2, kernel_size=3, stride=1, padding=1) + self.conv2 = nn.Conv2d(features // 2, config.head_hidden_size, kernel_size=3, stride=1, padding=1) + self.activation1 = nn.ReLU() + self.conv3 = nn.Conv2d(config.head_hidden_size, 1, kernel_size=1, stride=1, padding=0) + if config.depth_estimation_type == "relative": + self.activation2 = nn.ReLU() + elif config.depth_estimation_type == "metric": + self.activation2 = nn.Sigmoid() + else: + raise ValueError(f"Unknown depth estimation type: {config.depth_estimation_type}") + self.max_depth = config.max_depth + + def forward(self, hidden_states: list[torch.Tensor], patch_height, patch_width) -> torch.Tensor: + hidden_states = hidden_states[self.head_in_index] + + predicted_depth = self.conv1(hidden_states) + predicted_depth = nn.functional.interpolate( + predicted_depth, + (int(patch_height * self.patch_size), int(patch_width * self.patch_size)), + mode="bilinear", + align_corners=True, + ) + predicted_depth = self.conv2(predicted_depth) + predicted_depth = self.activation1(predicted_depth) + predicted_depth = self.conv3(predicted_depth) + predicted_depth = self.activation2(predicted_depth) * self.max_depth + predicted_depth = predicted_depth.squeeze(dim=1) # shape (batch_size, height, width) + + return predicted_depth + + +@auto_docstring( + custom_intro=""" + Depth Anything Model with a depth estimation head on top (consisting of 3 convolutional layers) e.g. for KITTI, NYUv2. + """ +) +class DepthAnythingForDepthEstimation(DepthAnythingPreTrainedModel): + _no_split_modules = ["DPTViTEmbeddings"] + + def __init__(self, config): + super().__init__(config) + + self.backbone = load_backbone(config) + self.neck = DepthAnythingNeck(config) + self.head = DepthAnythingDepthEstimationHead(config) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + pixel_values: torch.FloatTensor, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], DepthEstimatorOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*): + Ground truth depth estimation maps for computing the loss. + + Examples: + ```python + >>> from transformers import AutoImageProcessor, AutoModelForDepthEstimation + >>> import torch + >>> import numpy as np + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("LiheYoung/depth-anything-small-hf") + >>> model = AutoModelForDepthEstimation.from_pretrained("LiheYoung/depth-anything-small-hf") + + >>> # prepare image for the model + >>> inputs = image_processor(images=image, return_tensors="pt") + + >>> with torch.no_grad(): + ... outputs = model(**inputs) + + >>> # interpolate to original size + >>> post_processed_output = image_processor.post_process_depth_estimation( + ... outputs, + ... target_sizes=[(image.height, image.width)], + ... 
) + + >>> # visualize the prediction + >>> predicted_depth = post_processed_output[0]["predicted_depth"] + >>> depth = predicted_depth * 255 / predicted_depth.max() + >>> depth = depth.detach().cpu().numpy() + >>> depth = Image.fromarray(depth.astype("uint8")) + ```""" + loss = None + if labels is not None: + raise NotImplementedError("Training is not implemented yet") + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + + outputs = self.backbone.forward_with_filtered_kwargs( + pixel_values, output_hidden_states=output_hidden_states, output_attentions=output_attentions + ) + hidden_states = outputs.feature_maps + + _, _, height, width = pixel_values.shape + patch_size = self.config.patch_size + patch_height = height // patch_size + patch_width = width // patch_size + + hidden_states = self.neck(hidden_states, patch_height, patch_width) + + predicted_depth = self.head(hidden_states, patch_height, patch_width) + + if not return_dict: + if output_hidden_states: + output = (predicted_depth,) + outputs[1:] + else: + output = (predicted_depth,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return DepthEstimatorOutput( + loss=loss, + predicted_depth=predicted_depth, + hidden_states=outputs.hidden_states if output_hidden_states else None, + attentions=outputs.attentions, + ) + + +__all__ = ["DepthAnythingForDepthEstimation", "DepthAnythingPreTrainedModel"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/depth_pro/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/depth_pro/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5968aae67b523ef207b1ad9ed00199b412ee6cf5 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/depth_pro/__init__.py @@ -0,0 +1,29 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
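Before moving on to the DepthPro files, a small hedged sketch of the shape bookkeeping used in `DepthAnythingForDepthEstimation.forward` above: the input is divided into a `patch_height x patch_width` grid via integer division by `config.patch_size`, and the depth head interpolates back to that grid times `patch_size`. The resolution values below are illustrative only.

```python
# Illustrative arithmetic only; mirrors the forward pass above, not library code.
patch_size = 14                      # DepthAnythingConfig default
height, width = 518, 686             # hypothetical input resolution (multiples of 14)

patch_height = height // patch_size  # 37
patch_width = width // patch_size    # 49

# DepthAnythingDepthEstimationHead interpolates the fused features to
# (patch_height * patch_size, patch_width * patch_size) before the final convolutions,
# so the predicted depth map here would have shape (batch_size, 518, 686).
print(patch_height, patch_width, patch_height * patch_size, patch_width * patch_size)
# 37 49 518 686
```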
+from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_depth_pro import * + from .image_processing_depth_pro import * + from .image_processing_depth_pro_fast import * + from .modeling_depth_pro import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/depth_pro/configuration_depth_pro.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/depth_pro/configuration_depth_pro.py new file mode 100644 index 0000000000000000000000000000000000000000..69bfffeb93f1d32f23bc95e2327ce3de9bf4d88d --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/depth_pro/configuration_depth_pro.py @@ -0,0 +1,204 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""DepthPro model configuration""" + +from copy import deepcopy + +from ...configuration_utils import PretrainedConfig +from ...utils import logging +from ..auto.configuration_auto import CONFIG_MAPPING, AutoConfig + + +logger = logging.get_logger(__name__) + + +class DepthProConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`DepthProModel`]. It is used to instantiate a + DepthPro model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the DepthPro + [apple/DepthPro](https://huggingface.co/apple/DepthPro) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + fusion_hidden_size (`int`, *optional*, defaults to 256): + The number of channels before fusion. + patch_size (`int`, *optional*, defaults to 384): + The size (resolution) of each patch. This is also the image_size for backbone model. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + intermediate_hook_ids (`list[int]`, *optional*, defaults to `[11, 5]`): + Indices of the intermediate hidden states from the patch encoder to use for fusion. + intermediate_feature_dims (`list[int]`, *optional*, defaults to `[256, 256]`): + Hidden state dimensions during upsampling for each intermediate hidden state in `intermediate_hook_ids`. + scaled_images_ratios (`list[float]`, *optional*, defaults to `[0.25, 0.5, 1]`): + Ratios of scaled images to be used by the patch encoder. + scaled_images_overlap_ratios (`list[float]`, *optional*, defaults to `[0.0, 0.5, 0.25]`): + Overlap ratios between patches for each scaled image in `scaled_images_ratios`. 
+ scaled_images_feature_dims (`list[int]`, *optional*, defaults to `[1024, 1024, 512]`): + Hidden state dimensions during upsampling for each scaled image in `scaled_images_ratios`. + merge_padding_value (`int`, *optional*, defaults to 3): + When merging smaller patches back to the image size, overlapping sections of this size are removed. + use_batch_norm_in_fusion_residual (`bool`, *optional*, defaults to `False`): + Whether to use batch normalization in the pre-activate residual units of the fusion blocks. + use_bias_in_fusion_residual (`bool`, *optional*, defaults to `True`): + Whether to use bias in the pre-activate residual units of the fusion blocks. + use_fov_model (`bool`, *optional*, defaults to `False`): + Whether to use `DepthProFovModel` to generate the field of view. + num_fov_head_layers (`int`, *optional*, defaults to 2): + Number of convolution layers in the head of `DepthProFovModel`. + image_model_config (`Union[dict[str, Any], PretrainedConfig]`, *optional*): + The configuration of the image encoder model, which is loaded using the [`AutoModel`] API. + By default, Dinov2 model is used as backbone. + patch_model_config (`Union[dict[str, Any], PretrainedConfig]`, *optional*): + The configuration of the patch encoder model, which is loaded using the [`AutoModel`] API. + By default, Dinov2 model is used as backbone. + fov_model_config (`Union[dict[str, Any], PretrainedConfig]`, *optional*): + The configuration of the fov encoder model, which is loaded using the [`AutoModel`] API. + By default, Dinov2 model is used as backbone. + + Example: + + ```python + >>> from transformers import DepthProConfig, DepthProModel + + >>> # Initializing a DepthPro apple/DepthPro style configuration + >>> configuration = DepthProConfig() + + >>> # Initializing a model (with random weights) from the apple/DepthPro style configuration + >>> model = DepthProModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "depth_pro" + sub_configs = {"image_model_config": AutoConfig, "patch_model_config": AutoConfig, "fov_model_config": AutoConfig} + + def __init__( + self, + fusion_hidden_size=256, + patch_size=384, + initializer_range=0.02, + intermediate_hook_ids=[11, 5], + intermediate_feature_dims=[256, 256], + scaled_images_ratios=[0.25, 0.5, 1], + scaled_images_overlap_ratios=[0.0, 0.5, 0.25], + scaled_images_feature_dims=[1024, 1024, 512], + merge_padding_value=3, + use_batch_norm_in_fusion_residual=False, + use_bias_in_fusion_residual=True, + use_fov_model=False, + num_fov_head_layers=2, + image_model_config=None, + patch_model_config=None, + fov_model_config=None, + **kwargs, + ): + super().__init__(**kwargs) + + # scaled_images_ratios is sorted + if scaled_images_ratios != sorted(scaled_images_ratios): + raise ValueError( + f"Values in scaled_images_ratios={scaled_images_ratios} should be sorted from low to high" + ) + + # scaled_images_ratios, scaled_images_overlap_ratios, scaled_images_feature_dims should be consistent + if not (len(scaled_images_ratios) == len(scaled_images_overlap_ratios) == len(scaled_images_feature_dims)): + raise ValueError( + f"len(scaled_images_ratios)={len(scaled_images_ratios)} and " + f"len(scaled_images_overlap_ratios)={len(scaled_images_overlap_ratios)} and " + f"len(scaled_images_feature_dims)={len(scaled_images_feature_dims)}, " + f"should match in config." 
+ ) + + # intermediate_hook_ids, intermediate_feature_dims should be consistent + if not (len(intermediate_hook_ids) == len(intermediate_feature_dims)): + raise ValueError( + f"len(intermediate_hook_ids)={len(intermediate_hook_ids)} and " + f"len(intermediate_feature_dims)={len(intermediate_feature_dims)}, " + f"should match in config." + ) + + # fusion_hidden_size should be consistent with num_fov_head_layers + if fusion_hidden_size // 2**num_fov_head_layers == 0: + raise ValueError( + f"fusion_hidden_size={fusion_hidden_size} should be consistent with num_fov_head_layers={num_fov_head_layers} " + "i.e fusion_hidden_size // 2**num_fov_head_layers > 0" + ) + + self.fusion_hidden_size = fusion_hidden_size + self.patch_size = patch_size + self.initializer_range = initializer_range + self.use_batch_norm_in_fusion_residual = use_batch_norm_in_fusion_residual + self.use_bias_in_fusion_residual = use_bias_in_fusion_residual + self.use_fov_model = use_fov_model + self.num_fov_head_layers = num_fov_head_layers + self.intermediate_hook_ids = intermediate_hook_ids + self.intermediate_feature_dims = intermediate_feature_dims + self.scaled_images_ratios = scaled_images_ratios + self.scaled_images_overlap_ratios = scaled_images_overlap_ratios + self.scaled_images_feature_dims = scaled_images_feature_dims + self.merge_padding_value = merge_padding_value + self.image_model_config = image_model_config + self.patch_model_config = patch_model_config + self.fov_model_config = fov_model_config + + for sub_config_key in self.sub_configs: + sub_config = getattr(self, sub_config_key) + + if sub_config is None: + sub_config = CONFIG_MAPPING["dinov2"](image_size=patch_size) + logger.info( + f"`{sub_config_key}` is `None`. Initializing `{sub_config_key}` with the `Dinov2Config` " + f"with default values except `{sub_config_key}.image_size` is set to `config.patch_size`." + ) + elif isinstance(sub_config, dict): + sub_config = deepcopy(sub_config) + if "model_type" not in sub_config: + raise KeyError( + f"The `model_type` key is missing in the `{sub_config_key}` dictionary. Please provide the model type." + ) + elif sub_config["model_type"] not in CONFIG_MAPPING: + raise ValueError( + f"The model type `{sub_config['model_type']}` in `{sub_config_key}` is not supported. Please provide a valid model type." + ) + image_size = sub_config.get("image_size") + if image_size != patch_size: + logger.info( + f"The `image_size` in `{sub_config_key}` is set to `{image_size}`, " + f"but it does not match the required `patch_size` of `{patch_size}`. " + f"Updating `image_size` to `{patch_size}` for consistency. " + f"Ensure that `image_size` aligns with `patch_size` in the configuration." + ) + sub_config.update({"image_size": patch_size}) + sub_config = CONFIG_MAPPING[sub_config["model_type"]](**sub_config) + elif isinstance(sub_config, PretrainedConfig): + image_size = getattr(sub_config, "image_size", None) + if image_size != patch_size: + raise ValueError( + f"`config.{sub_config_key}.image_size={image_size}` should match `config.patch_size={patch_size}`." + ) + else: + raise TypeError( + f"Invalid type for `sub_config`. Expected `PretrainedConfig`, `dict`, or `None`, but got {type(sub_config)}." 
+ ) + + setattr(self, sub_config_key, sub_config) + + +__all__ = ["DepthProConfig"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/depth_pro/image_processing_depth_pro.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/depth_pro/image_processing_depth_pro.py new file mode 100644 index 0000000000000000000000000000000000000000..47f224f248bd1c222823c9ba2159999b68c641e1 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/depth_pro/image_processing_depth_pro.py @@ -0,0 +1,389 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Image processor class for DepthPro.""" + +from typing import TYPE_CHECKING, Optional, Union + +import numpy as np + +from ...utils.import_utils import requires + + +if TYPE_CHECKING: + from .modeling_depth_pro import DepthProDepthEstimatorOutput + +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ...image_transforms import to_channel_dimension_format +from ...image_utils import ( + IMAGENET_STANDARD_MEAN, + IMAGENET_STANDARD_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + infer_channel_dimension_format, + is_scaled_image, + is_torch_available, + make_flat_list_of_images, + to_numpy_array, + valid_images, +) +from ...utils import TensorType, filter_out_non_signature_kwargs, is_torchvision_available, logging, requires_backends + + +if is_torch_available(): + import torch + +if is_torchvision_available(): + from ...image_utils import pil_torch_interpolation_mapping + + +logger = logging.get_logger(__name__) + + +@requires(backends=("torchvision", "torch")) +class DepthProImageProcessor(BaseImageProcessor): + r""" + Constructs a DepthPro image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `(size["height"], + size["width"])`. Can be overridden by the `do_resize` parameter in the `preprocess` method. + size (`dict`, *optional*, defaults to `{"height": 1536, "width": 1536}`): + Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess` + method. + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`): + Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the + `preprocess` method. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale` + parameter in the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the + `preprocess` method. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` + method. 
+ image_mean (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. + """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: Optional[dict[str, int]] = None, + resample: PILImageResampling = PILImageResampling.BILINEAR, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + **kwargs, + ): + super().__init__(**kwargs) + size = size if size is not None else {"height": 1536, "width": 1536} + size = get_size_dict(size) + self.do_resize = do_resize + self.do_rescale = do_rescale + self.do_normalize = do_normalize + self.size = size + self.resample = resample + self.rescale_factor = rescale_factor + self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN + self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD + + def resize( + self, + image: np.ndarray, + size: dict[str, int], + resample: PILImageResampling = PILImageResampling.BILINEAR, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize an image to `(size["height"], size["width"])`. + + Args: + image (`np.ndarray`): + Image to resize. + size (`dict[str, int]`): + Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): + `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`. + data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + + Returns: + `np.ndarray`: The resized images. + """ + requires_backends(self, "torch") + + size = get_size_dict(size) + if "height" not in size or "width" not in size: + raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. 
Got {size.keys()}") + output_size = (size["height"], size["width"]) + + # we use torch interpolation instead of image.resize because DepthProImageProcessor + # rescales, then normalizes, which may cause some values to become negative, before resizing the image. + # image.resize expects all values to be in range [0, 1] or [0, 255] and throws an exception otherwise, + # however pytorch interpolation works with negative values. + # relevant issue here: https://github.com/huggingface/transformers/issues/34920 + # input should be (B, C, H, W) + image_tensor = torch.from_numpy(image).unsqueeze(0) + resized_image = torch.nn.functional.interpolate( + input=image_tensor, + size=output_size, + mode=pil_torch_interpolation_mapping[resample].value, + ) + resized_image = resized_image.squeeze(0).numpy() + return resized_image + + def _validate_input_arguments( + self, + do_resize: bool, + size: dict[str, int], + resample: PILImageResampling, + do_rescale: bool, + rescale_factor: float, + do_normalize: bool, + image_mean: Union[float, list[float]], + image_std: Union[float, list[float]], + data_format: Union[str, ChannelDimension], + ): + if do_resize and None in (size, resample): + raise ValueError("Size and resample must be specified if do_resize is True.") + + if do_rescale and rescale_factor is None: + raise ValueError("Rescale factor must be specified if do_rescale is True.") + + if do_normalize and None in (image_mean, image_std): + raise ValueError("Image mean and standard deviation must be specified if do_normalize is True.") + + @filter_out_non_signature_kwargs() + def preprocess( + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Optional[dict[str, int]] = None, + resample: Optional[PILImageResampling] = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ): + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_rescale=False`. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`dict[str, int]`, *optional*, defaults to `self.size`): + Dictionary in the format `{"height": h, "width": w}` specifying the size of the output image after + resizing. + resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`): + `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has + an effect if `do_resize` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image values between [0 - 1]. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`): + Image mean to use if `do_normalize` is set to `True`. 
+ image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to use if `do_normalize` is set to `True`. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + """ + do_resize = do_resize if do_resize is not None else self.do_resize + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + resample = resample if resample is not None else self.resample + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + + size = size if size is not None else self.size + + images = make_flat_list_of_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + self._validate_input_arguments( + do_resize=do_resize, + size=size, + resample=resample, + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + data_format=data_format, + ) + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if is_scaled_image(images[0]) and do_rescale: + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. 
+ input_data_format = infer_channel_dimension_format(images[0]) + + all_images = [] + for image in images: + if do_rescale: + image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) + + if do_normalize: + image = self.normalize( + image=image, mean=image_mean, std=image_std, input_data_format=input_data_format + ) + + # depth-pro rescales and normalizes the image before resizing it + # uses torch interpolation which requires ChannelDimension.FIRST + if do_resize: + image = to_channel_dimension_format(image, ChannelDimension.FIRST, input_channel_dim=input_data_format) + image = self.resize(image=image, size=size, resample=resample) + image = to_channel_dimension_format(image, data_format, input_channel_dim=ChannelDimension.FIRST) + else: + image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) + + all_images.append(image) + + data = {"pixel_values": all_images} + return BatchFeature(data=data, tensor_type=return_tensors) + + def post_process_depth_estimation( + self, + outputs: "DepthProDepthEstimatorOutput", + target_sizes: Optional[Union[TensorType, list[tuple[int, int]], None]] = None, + ) -> list[dict[str, TensorType]]: + """ + Post-processes the raw depth predictions from the model to generate + final depth predictions which is caliberated using the field of view if provided + and resized to specified target sizes if provided. + + Args: + outputs ([`DepthProDepthEstimatorOutput`]): + Raw outputs of the model. + target_sizes (`Optional[Union[TensorType, list[tuple[int, int]], None]]`, *optional*, defaults to `None`): + Target sizes to resize the depth predictions. Can be a tensor of shape `(batch_size, 2)` + or a list of tuples `(height, width)` for each image in the batch. If `None`, no resizing + is performed. + + Returns: + `list[dict[str, TensorType]]`: A list of dictionaries of tensors representing the processed depth + predictions, and field of view (degrees) and focal length (pixels) if `field_of_view` is given in `outputs`. + + Raises: + `ValueError`: + If the lengths of `predicted_depths`, `fovs`, or `target_sizes` are mismatched. 
+ """ + requires_backends(self, "torch") + + predicted_depth = outputs.predicted_depth + fov = outputs.field_of_view + + batch_size = len(predicted_depth) + + if target_sizes is not None and batch_size != len(target_sizes): + raise ValueError( + "Make sure that you pass in as many fov values as the batch dimension of the predicted depth" + ) + + results = [] + fov = [None] * batch_size if fov is None else fov + target_sizes = [None] * batch_size if target_sizes is None else target_sizes + for depth, fov_value, target_size in zip(predicted_depth, fov, target_sizes): + focal_length = None + if target_size is not None: + # scale image w.r.t fov + if fov_value is not None: + width = target_size[1] + focal_length = 0.5 * width / torch.tan(0.5 * torch.deg2rad(fov_value)) + depth = depth * width / focal_length + + # interpolate + depth = torch.nn.functional.interpolate( + # input should be (B, C, H, W) + input=depth.unsqueeze(0).unsqueeze(1), + size=target_size, + mode=pil_torch_interpolation_mapping[self.resample].value, + ).squeeze() + + # inverse the depth + depth = 1.0 / torch.clamp(depth, min=1e-4, max=1e4) + + results.append( + { + "predicted_depth": depth, + "field_of_view": fov_value, + "focal_length": focal_length, + } + ) + + return results + + +__all__ = ["DepthProImageProcessor"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/depth_pro/image_processing_depth_pro_fast.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/depth_pro/image_processing_depth_pro_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..bc621e0ffc261a7a404f24dc8f0d7293e5be3a46 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/depth_pro/image_processing_depth_pro_fast.py @@ -0,0 +1,174 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Fast Image processor class for DepthPro.""" + +from typing import TYPE_CHECKING, Optional, Union + +import torch + +from ...image_processing_base import BatchFeature +from ...image_processing_utils_fast import BaseImageProcessorFast, group_images_by_shape, reorder_images +from ...image_utils import ( + IMAGENET_STANDARD_MEAN, + IMAGENET_STANDARD_STD, + PILImageResampling, + SizeDict, + pil_torch_interpolation_mapping, +) +from ...utils import ( + TensorType, + auto_docstring, + logging, + requires_backends, +) +from ...utils.import_utils import requires + + +if TYPE_CHECKING: + from .modeling_depth_pro import DepthProDepthEstimatorOutput + + +from torchvision.transforms.v2 import functional as F + + +logger = logging.get_logger(__name__) + + +@auto_docstring +@requires(backends=("torchvision", "torch")) +class DepthProImageProcessorFast(BaseImageProcessorFast): + resample = PILImageResampling.BILINEAR + image_mean = IMAGENET_STANDARD_MEAN + image_std = IMAGENET_STANDARD_STD + size = {"height": 1536, "width": 1536} + do_resize = True + do_rescale = True + do_normalize = True + + # DepthPro resizes image after rescaling and normalizing, + # which makes it different from BaseImageProcessorFast._preprocess + def _preprocess( + self, + images: list["torch.Tensor"], + do_resize: bool, + size: SizeDict, + interpolation: Optional["F.InterpolationMode"], + do_center_crop: bool, + crop_size: SizeDict, + do_rescale: bool, + rescale_factor: float, + do_normalize: bool, + image_mean: Optional[Union[float, list[float]]], + image_std: Optional[Union[float, list[float]]], + disable_grouping: Optional[bool], + return_tensors: Optional[Union[str, TensorType]], + **kwargs, + ) -> BatchFeature: + # Group images by size for batched scaling + grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping) + processed_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + # Fused rescale and normalize + stacked_images = self.rescale_and_normalize( + stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std + ) + if do_resize: + stacked_images = self.resize( + image=stacked_images, + size=size, + interpolation=interpolation, + antialias=False, + ) + processed_images_grouped[shape] = stacked_images + + processed_images = reorder_images(processed_images_grouped, grouped_images_index) + processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images + + return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors) + + # Copied from transformers.models.depth_pro.image_processing_depth_pro.DepthProImageProcessor.post_process_depth_estimation + def post_process_depth_estimation( + self, + outputs: "DepthProDepthEstimatorOutput", + target_sizes: Optional[Union[TensorType, list[tuple[int, int]], None]] = None, + ) -> list[dict[str, TensorType]]: + """ + Post-processes the raw depth predictions from the model to generate + final depth predictions which is caliberated using the field of view if provided + and resized to specified target sizes if provided. + + Args: + outputs ([`DepthProDepthEstimatorOutput`]): + Raw outputs of the model. + target_sizes (`Optional[Union[TensorType, list[tuple[int, int]], None]]`, *optional*, defaults to `None`): + Target sizes to resize the depth predictions. Can be a tensor of shape `(batch_size, 2)` + or a list of tuples `(height, width)` for each image in the batch. If `None`, no resizing + is performed. 
+ + Returns: + `list[dict[str, TensorType]]`: A list of dictionaries of tensors representing the processed depth + predictions, and field of view (degrees) and focal length (pixels) if `field_of_view` is given in `outputs`. + + Raises: + `ValueError`: + If the lengths of `predicted_depths`, `fovs`, or `target_sizes` are mismatched. + """ + requires_backends(self, "torch") + + predicted_depth = outputs.predicted_depth + fov = outputs.field_of_view + + batch_size = len(predicted_depth) + + if target_sizes is not None and batch_size != len(target_sizes): + raise ValueError( + "Make sure that you pass in as many fov values as the batch dimension of the predicted depth" + ) + + results = [] + fov = [None] * batch_size if fov is None else fov + target_sizes = [None] * batch_size if target_sizes is None else target_sizes + for depth, fov_value, target_size in zip(predicted_depth, fov, target_sizes): + focal_length = None + if target_size is not None: + # scale image w.r.t fov + if fov_value is not None: + width = target_size[1] + focal_length = 0.5 * width / torch.tan(0.5 * torch.deg2rad(fov_value)) + depth = depth * width / focal_length + + # interpolate + depth = torch.nn.functional.interpolate( + # input should be (B, C, H, W) + input=depth.unsqueeze(0).unsqueeze(1), + size=target_size, + mode=pil_torch_interpolation_mapping[self.resample].value, + ).squeeze() + + # inverse the depth + depth = 1.0 / torch.clamp(depth, min=1e-4, max=1e4) + + results.append( + { + "predicted_depth": depth, + "field_of_view": fov_value, + "focal_length": focal_length, + } + ) + + return results + + +__all__ = ["DepthProImageProcessorFast"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/depth_pro/modeling_depth_pro.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/depth_pro/modeling_depth_pro.py new file mode 100644 index 0000000000000000000000000000000000000000..7c32703b7c25526bc87beabf1f6cbb835f2b2478 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/depth_pro/modeling_depth_pro.py @@ -0,0 +1,1131 @@ +# coding=utf-8 +# Copyright 2024 The Apple Research Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch DepthPro model.""" + +import math +from dataclasses import dataclass +from typing import Optional, Union + +import torch +import torch.nn.functional as F +from torch import nn + +from ...modeling_utils import PreTrainedModel +from ...utils import ModelOutput, auto_docstring, logging, torch_int +from ..auto import AutoModel +from .configuration_depth_pro import DepthProConfig + + +logger = logging.get_logger(__name__) + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for DepthPro's outputs. + """ +) +class DepthProOutput(ModelOutput): + r""" + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, n_patches_per_batch, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. 
+ features (`Union[torch.FloatTensor, List[torch.FloatTensor]]`, *optional*): + Features from encoders. Can be a single feature or a list of features. + """ + + last_hidden_state: Optional[torch.FloatTensor] = None + features: Union[torch.FloatTensor, list[torch.FloatTensor]] = None + hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None + attentions: Optional[tuple[torch.FloatTensor, ...]] = None + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for DepthProForDepthEstimation's output. + """ +) +class DepthProDepthEstimatorOutput(ModelOutput): + r""" + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Classification (or regression if config.num_labels==1) loss. + field_of_view (`torch.FloatTensor` of shape `(batch_size,)`, *optional*, returned when `use_fov_model` is provided): + Field of View Scaler. + """ + + loss: Optional[torch.FloatTensor] = None + predicted_depth: Optional[torch.FloatTensor] = None + field_of_view: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None + attentions: Optional[tuple[torch.FloatTensor, ...]] = None + + +def split_to_patches(pixel_values: torch.Tensor, patch_size: int, overlap_ratio: float) -> torch.Tensor: + """Creates Patches from Batch.""" + batch_size, num_channels, height, width = pixel_values.shape + + if height == width == patch_size: + # create patches only if scaled image is not already equal to patch size + return pixel_values + + stride = torch_int(patch_size * (1 - overlap_ratio)) + + patches = F.unfold(pixel_values, kernel_size=(patch_size, patch_size), stride=(stride, stride)) + patches = patches.permute(2, 0, 1) + patches = patches.reshape(-1, num_channels, patch_size, patch_size) + + return patches + + +def reshape_features(hidden_states: torch.Tensor) -> torch.Tensor: + """Discard class token and reshape 1D feature map to a 2D grid.""" + n_samples, seq_len, hidden_size = hidden_states.shape + size = torch_int(seq_len**0.5) + + hidden_states = hidden_states[:, -(size**2) :, :] # remove special tokens if there are any + hidden_states = hidden_states.reshape(n_samples, size, size, hidden_size) + hidden_states = hidden_states.permute(0, 3, 1, 2) + + return hidden_states + + +def merge_patches(patches: torch.Tensor, batch_size: int, padding: int) -> torch.Tensor: + """Merges smaller patches into image-like feature map.""" + n_patches, hidden_size, out_size, out_size = patches.shape + n_patches_per_batch = n_patches // batch_size + sqrt_n_patches_per_batch = torch_int(n_patches_per_batch**0.5) + new_out_size = sqrt_n_patches_per_batch * out_size + + if n_patches == batch_size: + # merge only if the patches were created from scaled image + # patches are not created when scaled image size is equal to patch size + return patches + + if n_patches_per_batch < 4: + # for each batch, at least 4 small patches are required to + # recreate a large square patch from merging them and later padding is applied + # 3 x (8x8) patches becomes 1 x ( 8x8 ) patch (extra patch ignored, no padding) + # 4 x (8x8) patches becomes 1 x (16x16) patch (padding later) + # 5 x (8x8) patches becomes 1 x (16x16) patch (extra patch ignored, padding later) + # 9 x (8x8) patches becomes 1 x (24x24) patch (padding later) + # thus the following code only rearranges the patches and removes extra ones + padding = 0 + + # make sure padding is not large enough to remove more than half of the patch + padding = min(out_size // 4, padding) + + if padding == 0: + # faster when 
no padding is required + merged = patches.reshape(n_patches_per_batch, batch_size, hidden_size, out_size, out_size) + merged = merged.permute(1, 2, 0, 3, 4) + merged = merged[:, :, : sqrt_n_patches_per_batch**2, :, :] + merged = merged.reshape( + batch_size, hidden_size, sqrt_n_patches_per_batch, sqrt_n_patches_per_batch, out_size, out_size + ) + merged = merged.permute(0, 1, 2, 4, 3, 5) + merged = merged.reshape(batch_size, hidden_size, new_out_size, new_out_size) + else: + # padding example: + # let out_size = 8, new_out_size = 32, padding = 2 + # each patch is separated by "|" + # and padding is applied to the merging edges of each patch + # 00 01 02 03 04 05 06 07 | 08 09 10 11 12 13 14 15 | 16 17 18 19 20 21 22 23 | 24 25 26 27 28 29 30 31 + # 00 01 02 03 04 05 -- -- | -- -- 10 11 12 13 -- -- | -- -- 18 19 20 21 -- -- | -- -- 26 27 28 29 30 31 + i = 0 + boxes = [] + for h in range(sqrt_n_patches_per_batch): + boxes_in_row = [] + for w in range(sqrt_n_patches_per_batch): + box = patches[batch_size * i : batch_size * (i + 1)] + + # collect paddings + paddings = [0, 0, 0, 0] + if h != 0: + # remove pad from height if box is not at top border + paddings[0] = padding + if w != 0: + # remove pad from width if box is not at left border + paddings[2] = padding + if h != sqrt_n_patches_per_batch - 1: + # remove pad from height if box is not at bottom border + paddings[1] = padding + if w != sqrt_n_patches_per_batch - 1: + # remove pad from width if box is not at right border + paddings[3] = padding + + # remove paddings + _, _, box_h, box_w = box.shape + pad_top, pad_bottom, pad_left, pad_right = paddings + box = box[:, :, pad_top : box_h - pad_bottom, pad_left : box_w - pad_right] + + boxes_in_row.append(box) + i += 1 + boxes_in_row = torch.cat(boxes_in_row, dim=-1) + boxes.append(boxes_in_row) + merged = torch.cat(boxes, dim=-2) + + return merged + + +def reconstruct_feature_maps( + hidden_state: torch.Tensor, batch_size: int, padding: int, output_size: tuple[float, float] +) -> torch.Tensor: + """ + Reconstructs feature maps from the hidden state produced by any of the encoder. Converts the hidden state of shape + `(n_patches_per_batch * batch_size, seq_len, hidden_size)` to feature maps of shape + `(batch_size, hidden_size, output_size[0], output_size[1])`. + + Args: + hidden_state (torch.Tensor): Input tensor of shape `(n_patches_per_batch * batch_size, seq_len, hidden_size)` + representing the encoded patches. + batch_size (int): The number of samples in a batch. + padding (int): The amount of padding to be removed when merging patches. + output_size (tuple[float, float]): The desired output size for the feature maps, specified as `(height, width)`. + + Returns: + torch.Tensor: Reconstructed feature maps of shape `(batch_size, hidden_size, output_size[0], output_size[1])`. 
+ """ + # reshape back to image like + features = reshape_features(hidden_state) + + # merge all patches in a batch to create one large patch per batch + features = merge_patches( + features, + batch_size=batch_size, + padding=padding, + ) + + # interpolate patches to base size + features = F.interpolate( + features, + size=output_size, + mode="bilinear", + align_corners=False, + ) + + return features + + +class DepthProPatchEncoder(nn.Module): + def __init__(self, config: DepthProConfig): + super().__init__() + self.config = config + + self.intermediate_hook_ids = config.intermediate_hook_ids + self.intermediate_feature_dims = config.intermediate_feature_dims + self.scaled_images_ratios = config.scaled_images_ratios + self.scaled_images_overlap_ratios = config.scaled_images_overlap_ratios + self.scaled_images_feature_dims = config.scaled_images_feature_dims + self.merge_padding_value = config.merge_padding_value + + self.n_scaled_images = len(config.scaled_images_ratios) + self.n_intermediate_hooks = len(config.intermediate_hook_ids) + self.out_size = config.image_model_config.image_size // config.image_model_config.patch_size + + self.model = AutoModel.from_config(config.patch_model_config) + + def forward( + self, + pixel_values: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + ) -> list[torch.Tensor]: + batch_size, num_channels, height, width = pixel_values.shape + + if min(self.scaled_images_ratios) * min(height, width) < self.config.patch_size: + raise ValueError( + f"Image size {height}x{width} is too small to be scaled " + f"with scaled_images_ratios={self.scaled_images_ratios} " + f"when patch_size={self.config.patch_size}." + ) + + # STEP 1: create 3-level image + + scaled_images = [] + for ratio in self.scaled_images_ratios: + scaled_images.append( + F.interpolate( + pixel_values, + scale_factor=ratio, + mode="bilinear", + align_corners=False, + ) + ) + + # STEP 2: create patches + + for i in range(self.n_scaled_images): + scaled_images[i] = split_to_patches( + scaled_images[i], + patch_size=self.config.patch_size, + overlap_ratio=self.scaled_images_overlap_ratios[i], + ) + n_patches_per_scaled_image = [len(i) for i in scaled_images] + patches = torch.cat(scaled_images[::-1], dim=0) # -1 as patch encoder expects high res patches first + + # STEP 3: apply patch encoder + + encodings = self.model( + # each patch is processed as a separate batch + patches, + head_mask=head_mask, + # required for intermediate features + output_hidden_states=self.n_intermediate_hooks > 0, + ) + + scaled_images_last_hidden_state = torch.split_with_sizes(encodings[0], n_patches_per_scaled_image[::-1]) + # -1 (reverse list) as patch encoder returns high res patches first, we need low res first + scaled_images_last_hidden_state = scaled_images_last_hidden_state[::-1] + + # calculate base height and width + # base height and width are the dimensions of the lowest resolution features + exponent_value = torch_int(math.log2(width / self.out_size)) + base_height = height // 2**exponent_value + base_width = width // 2**exponent_value + + # STEP 4: get patch features (high_res, med_res, low_res) - (3-5) in diagram + + scaled_images_features = [] + for i in range(self.n_scaled_images): + hidden_state = scaled_images_last_hidden_state[i] + padding = torch_int(self.merge_padding_value * (1 / self.scaled_images_ratios[i])) + output_height = base_height * 2**i + output_width = base_width * 2**i + features = reconstruct_feature_maps( + hidden_state, + batch_size=batch_size, + padding=padding, + 
output_size=(output_height, output_width), + ) + scaled_images_features.append(features) + + # STEP 5: get intermediate features - (1-2) in diagram + + intermediate_features = [] + for i in range(self.n_intermediate_hooks): + # +1 to correct index position as hidden_states contain embedding output as well + hidden_state = encodings[2][self.intermediate_hook_ids[i] + 1] + padding = torch_int(self.merge_padding_value * (1 / self.scaled_images_ratios[-1])) + output_height = base_height * 2 ** (self.n_scaled_images - 1) + output_width = base_width * 2 ** (self.n_scaled_images - 1) + features = reconstruct_feature_maps( + hidden_state, + batch_size=batch_size, + padding=padding, + output_size=(output_height, output_width), + ) + intermediate_features.append(features) + + # STEP 7: combine all features + features = [*scaled_images_features, *intermediate_features] + + return features + + +class DepthProImageEncoder(nn.Module): + def __init__(self, config: DepthProConfig): + super().__init__() + self.config = config + self.out_size = config.image_model_config.image_size // config.image_model_config.patch_size + + self.model = AutoModel.from_config(config.image_model_config) + + def forward( + self, + pixel_values: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ) -> Union[tuple, DepthProOutput]: + batch_size, num_channels, height, width = pixel_values.shape + + # scale the image for image_encoder + size = self.config.image_model_config.image_size + pixel_values = F.interpolate( + pixel_values, + size=(size, size), + mode="bilinear", + align_corners=False, + ) + encodings = self.model( + pixel_values=pixel_values, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + + # calculate base height and width + # base height and width are the dimensions of the lowest resolution features + exponent_value = torch_int(math.log2(width / self.out_size)) + base_height = height // 2**exponent_value + base_width = width // 2**exponent_value + + features = reconstruct_feature_maps( + encodings[0], + batch_size=batch_size, + padding=0, + output_size=(base_height, base_width), + ) + + if not return_dict: + return (encodings[0], features) + encodings[2:] # ignore last_hidden_state and poooler output + + return DepthProOutput( + last_hidden_state=encodings.last_hidden_state, + features=features, + hidden_states=encodings.hidden_states, + attentions=encodings.attentions, + ) + + +class DepthProEncoder(nn.Module): + def __init__(self, config: DepthProConfig): + super().__init__() + self.config = config + self.intermediate_hook_ids = config.intermediate_hook_ids + self.intermediate_feature_dims = config.intermediate_feature_dims + self.scaled_images_ratios = config.scaled_images_ratios + self.scaled_images_overlap_ratios = config.scaled_images_overlap_ratios + self.scaled_images_feature_dims = config.scaled_images_feature_dims + self.merge_padding_value = config.merge_padding_value + + self.n_scaled_images = len(self.scaled_images_ratios) + self.n_intermediate_hooks = len(self.intermediate_hook_ids) + + self.patch_encoder = DepthProPatchEncoder(config) + self.image_encoder = DepthProImageEncoder(config) + + def forward( + self, + pixel_values: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ) -> Union[tuple, DepthProOutput]: + batch_size, 
num_channels, height, width = pixel_values.shape + + patch_features = self.patch_encoder( + pixel_values, + head_mask=head_mask, + ) + image_encodings = self.image_encoder( + pixel_values, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + image_features = image_encodings[1] # index 1 contains features + + features = [image_features, *patch_features] + + if not return_dict: + return (image_encodings[0], features) + image_encodings[2:] + + return DepthProOutput( + last_hidden_state=image_encodings.last_hidden_state, + features=features, + hidden_states=image_encodings.hidden_states, + attentions=image_encodings.attentions, + ) + + +class DepthProFeatureUpsampleBlock(nn.Module): + def __init__( + self, + config: DepthProConfig, + input_dims: int, + intermediate_dims: int, + output_dims: int, + n_upsample_layers: int, + use_proj: bool = True, + bias: bool = False, + ): + super().__init__() + self.config = config + self.layers = nn.ModuleList() + + # create first projection layer + if use_proj: + proj = nn.Conv2d( + in_channels=input_dims, + out_channels=intermediate_dims, + kernel_size=1, + stride=1, + padding=0, + bias=bias, + ) + self.layers.append(proj) + + # create following upsample layers + for i in range(n_upsample_layers): + in_channels = intermediate_dims if i == 0 else output_dims + layer = nn.ConvTranspose2d( + in_channels=in_channels, + out_channels=output_dims, + kernel_size=2, + stride=2, + padding=0, + bias=bias, + ) + self.layers.append(layer) + + def forward(self, features: torch.Tensor) -> torch.Tensor: + for layer in self.layers: + features = layer(features) + return features + + +class DepthProFeatureUpsample(nn.Module): + def __init__(self, config: DepthProConfig): + super().__init__() + self.config = config + self.n_scaled_images = len(self.config.scaled_images_ratios) + self.n_intermediate_hooks = len(self.config.intermediate_hook_ids) + + # for image_features + self.image_block = DepthProFeatureUpsampleBlock( + config=config, + input_dims=config.image_model_config.hidden_size, + intermediate_dims=config.image_model_config.hidden_size, + output_dims=config.scaled_images_feature_dims[0], + n_upsample_layers=1, + use_proj=False, + bias=True, + ) + + # for scaled_images_features + self.scaled_images = nn.ModuleList() + for i, feature_dims in enumerate(config.scaled_images_feature_dims): + block = DepthProFeatureUpsampleBlock( + config=config, + input_dims=config.patch_model_config.hidden_size, + intermediate_dims=feature_dims, + output_dims=feature_dims, + n_upsample_layers=1, + ) + self.scaled_images.append(block) + + # for intermediate_features + self.intermediate = nn.ModuleList() + for i, feature_dims in enumerate(config.intermediate_feature_dims): + intermediate_dims = config.fusion_hidden_size if i == 0 else feature_dims + block = DepthProFeatureUpsampleBlock( + config=config, + input_dims=config.patch_model_config.hidden_size, + intermediate_dims=intermediate_dims, + output_dims=feature_dims, + n_upsample_layers=2 + i, + ) + self.intermediate.append(block) + + def forward(self, features: list[torch.Tensor]) -> list[torch.Tensor]: + features[0] = self.image_block(features[0]) + + for i in range(self.n_scaled_images): + features[i + 1] = self.scaled_images[i](features[i + 1]) + + for i in range(self.n_intermediate_hooks): + features[self.n_scaled_images + i + 1] = self.intermediate[i](features[self.n_scaled_images + i + 1]) + + return features + + +class 
DepthProFeatureProjection(nn.Module): + def __init__(self, config: DepthProConfig): + super().__init__() + self.config = config + + combined_feature_dims = config.scaled_images_feature_dims + config.intermediate_feature_dims + self.projections = nn.ModuleList() + for i, in_channels in enumerate(combined_feature_dims): + if i == len(combined_feature_dims) - 1 and in_channels == config.fusion_hidden_size: + # projection for last layer can be ignored if input and output channels already match + self.projections.append(nn.Identity()) + else: + self.projections.append( + nn.Conv2d( + in_channels=in_channels, + out_channels=config.fusion_hidden_size, + kernel_size=3, + stride=1, + padding=1, + bias=False, + ) + ) + + def forward(self, features: list[torch.Tensor]) -> list[torch.Tensor]: + projected_features = [] + for i, projection in enumerate(self.projections): + upsampled_feature = projection(features[i]) + projected_features.append(upsampled_feature) + return projected_features + + +class DepthProNeck(nn.Module): + def __init__(self, config: DepthProConfig): + super().__init__() + self.config = config + + self.feature_upsample = DepthProFeatureUpsample(config) + self.fuse_image_with_low_res = nn.Conv2d( + in_channels=config.scaled_images_feature_dims[0] * 2, + out_channels=config.scaled_images_feature_dims[0], + kernel_size=1, + stride=1, + padding=0, + bias=True, + ) + self.feature_projection = DepthProFeatureProjection(config) + + def forward(self, features: list[torch.Tensor]) -> list[torch.Tensor]: + features = self.feature_upsample(features) + # global features = low res features + image features + global_features = torch.cat((features[1], features[0]), dim=1) + global_features = self.fuse_image_with_low_res(global_features) + features = [global_features, *features[2:]] + features = self.feature_projection(features) + return features + + +# General docstring + + +@auto_docstring +class DepthProPreTrainedModel(PreTrainedModel): + config: DepthProConfig + base_model_prefix = "depth_pro" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + _supports_sdpa = True + _no_split_modules = ["DepthProPreActResidualLayer"] + _keys_to_ignore_on_load_unexpected = ["fov_model.*"] + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, (nn.Conv2d, nn.ConvTranspose2d)): + nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu") + if module.bias is not None: + module.bias.data.zero_() + + +@auto_docstring +class DepthProModel(DepthProPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.config = config + self.encoder = DepthProEncoder(config) + self.neck = DepthProNeck(config) + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.encoder.image_encoder.model.get_input_embeddings() + + @auto_docstring + def forward( + self, + pixel_values: torch.FloatTensor, + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, 
+ ) -> Union[tuple, DepthProOutput]: + r""" + Examples: + + ```python + >>> import torch + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, DepthProModel + + >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> checkpoint = "apple/DepthPro-hf" + >>> processor = AutoProcessor.from_pretrained(checkpoint) + >>> model = DepthProModel.from_pretrained(checkpoint) + + >>> # prepare image for the model + >>> inputs = processor(images=image, return_tensors="pt") + + >>> with torch.no_grad(): + ... output = model(**inputs) + + >>> output.last_hidden_state.shape + torch.Size([1, 35, 577, 1024]) + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + encodings = self.encoder( + pixel_values, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + features = encodings[1] # index 1 contains features + features = self.neck(features) + + if not return_dict: + return (encodings[0], features) + encodings[2:] + + return DepthProOutput( + last_hidden_state=encodings.last_hidden_state, + features=features, + hidden_states=encodings.hidden_states, + attentions=encodings.attentions, + ) + + +# Copied from transformers.models.dpt.modeling_dpt.DPTPreActResidualLayer DPT->DepthPro +class DepthProPreActResidualLayer(nn.Module): + """ + ResidualConvUnit, pre-activate residual unit. + + Args: + config (`[DepthProConfig]`): + Model configuration class defining the model architecture. 
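+
+    The forward pass computes `hidden_state + conv2(act2(conv1(act1(hidden_state))))`, with an optional
+    batch normalization after each convolution (controlled by `use_batch_norm_in_fusion_residual`).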
+ """ + + def __init__(self, config: DepthProConfig): + super().__init__() + + self.use_batch_norm = config.use_batch_norm_in_fusion_residual + use_bias_in_fusion_residual = ( + config.use_bias_in_fusion_residual + if config.use_bias_in_fusion_residual is not None + else not self.use_batch_norm + ) + + self.activation1 = nn.ReLU() + self.convolution1 = nn.Conv2d( + config.fusion_hidden_size, + config.fusion_hidden_size, + kernel_size=3, + stride=1, + padding=1, + bias=use_bias_in_fusion_residual, + ) + + self.activation2 = nn.ReLU() + self.convolution2 = nn.Conv2d( + config.fusion_hidden_size, + config.fusion_hidden_size, + kernel_size=3, + stride=1, + padding=1, + bias=use_bias_in_fusion_residual, + ) + + if self.use_batch_norm: + self.batch_norm1 = nn.BatchNorm2d(config.fusion_hidden_size) + self.batch_norm2 = nn.BatchNorm2d(config.fusion_hidden_size) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + residual = hidden_state + hidden_state = self.activation1(hidden_state) + + hidden_state = self.convolution1(hidden_state) + + if self.use_batch_norm: + hidden_state = self.batch_norm1(hidden_state) + + hidden_state = self.activation2(hidden_state) + hidden_state = self.convolution2(hidden_state) + + if self.use_batch_norm: + hidden_state = self.batch_norm2(hidden_state) + + return hidden_state + residual + + +# Modified from transformers.models.dpt.modeling_dpt.DPTFeatureFusionLayer +# except it uses deconv and skip_add and needs no interpolation +class DepthProFeatureFusionLayer(nn.Module): + def __init__(self, config: DepthProConfig, use_deconv: bool = True): + super().__init__() + self.config = config + self.use_deconv = use_deconv + + self.residual_layer1 = DepthProPreActResidualLayer(config) + self.residual_layer2 = DepthProPreActResidualLayer(config) + + if self.use_deconv: + self.deconv = nn.ConvTranspose2d( + in_channels=config.fusion_hidden_size, + out_channels=config.fusion_hidden_size, + kernel_size=2, + stride=2, + padding=0, + bias=False, + ) + + self.projection = nn.Conv2d(config.fusion_hidden_size, config.fusion_hidden_size, kernel_size=1, bias=True) + + def forward(self, hidden_state: torch.Tensor, residual: Optional[torch.Tensor] = None) -> torch.Tensor: + if residual is not None: + residual = self.residual_layer1(residual) + hidden_state = hidden_state + residual + + hidden_state = self.residual_layer2(hidden_state) + if self.use_deconv: + hidden_state = self.deconv(hidden_state) + hidden_state = self.projection(hidden_state) + + return hidden_state + + +# Modified from transformers.models.dpt.modeling_dpt.DPTFeatureFusionStage with DPT->DepthPro +# with deconv and reversed layers +class DepthProFeatureFusionStage(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + + self.num_layers = len(config.intermediate_hook_ids) + len(config.scaled_images_ratios) + self.intermediate = nn.ModuleList() + for _ in range(self.num_layers - 1): + self.intermediate.append(DepthProFeatureFusionLayer(config)) + + # final layer does not require deconvolution + self.final = DepthProFeatureFusionLayer(config, use_deconv=False) + + def forward(self, hidden_states: list[torch.Tensor]) -> list[torch.Tensor]: + if self.num_layers != len(hidden_states): + raise ValueError( + f"num_layers={self.num_layers} in DepthProFeatureFusionStage" + f"does not match len(hidden_states)={len(hidden_states)}" + ) + + fused_hidden_states = [] + fused_hidden_state = None + for hidden_state, layer in zip(hidden_states[:-1], self.intermediate): + if 
fused_hidden_state is None: + # first layer only uses the last hidden_state + fused_hidden_state = layer(hidden_state) + else: + fused_hidden_state = layer(fused_hidden_state, hidden_state) + fused_hidden_states.append(fused_hidden_state) + + hidden_state = hidden_states[-1] + fused_hidden_state = self.final(fused_hidden_state, hidden_state) + fused_hidden_states.append(fused_hidden_state) + + return fused_hidden_states + + +class DepthProFovEncoder(nn.Module): + def __init__(self, config: DepthProConfig): + super().__init__() + self.config = config + self.out_size = config.image_model_config.image_size // config.image_model_config.patch_size + + self.model = AutoModel.from_config(config.fov_model_config) + self.neck = nn.Linear(config.fov_model_config.hidden_size, config.fusion_hidden_size // 2) + + def forward( + self, + pixel_values: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + batch_size, num_channels, height, width = pixel_values.shape + + # scale the image for fov_encoder + size = self.config.fov_model_config.image_size + pixel_values = F.interpolate( + pixel_values, + size=(size, size), + mode="bilinear", + align_corners=False, + ) + encodings = self.model( + pixel_values=pixel_values, + head_mask=head_mask, + ) + hidden_state = encodings[0] + hidden_state = self.neck(hidden_state) + + # calculate base height and width + # base height and width are the dimensions of the lowest resolution features + exponent_value = torch_int(math.log2(width / self.out_size)) + base_height = height // 2**exponent_value + base_width = width // 2**exponent_value + + features = reconstruct_feature_maps( + hidden_state, + batch_size=batch_size, + padding=0, + output_size=(base_height, base_width), + ) + + return features + + +class DepthProFovHead(nn.Module): + def __init__(self, config: DepthProConfig): + super().__init__() + self.config = config + self.fusion_hidden_size = config.fusion_hidden_size + self.out_size = config.image_model_config.image_size // config.image_model_config.patch_size + + # create initial head layers + self.layers = nn.ModuleList() + for i in range(config.num_fov_head_layers): + self.layers.append( + nn.Conv2d( + math.ceil(self.fusion_hidden_size / 2 ** (i + 1)), + math.ceil(self.fusion_hidden_size / 2 ** (i + 2)), + kernel_size=3, + stride=2, + padding=1, + ) + ) + self.layers.append(nn.ReLU(True)) + # calculate expected shapes to finally generate a scalar output from final head layer + final_in_channels = math.ceil(self.fusion_hidden_size / 2 ** (config.num_fov_head_layers + 1)) + final_kernel_size = torch_int((self.out_size - 1) / 2**config.num_fov_head_layers + 1) + self.layers.append( + nn.Conv2d( + in_channels=final_in_channels, out_channels=1, kernel_size=final_kernel_size, stride=1, padding=0 + ) + ) + + def forward(self, features: torch.Tensor) -> torch.Tensor: + features = F.interpolate( + features, + size=(self.out_size, self.out_size), + mode="bilinear", + align_corners=False, + ) + for layer in self.layers: + features = layer(features) + return features + + +class DepthProFovModel(nn.Module): + def __init__(self, config: DepthProConfig): + super().__init__() + self.config = config + self.fusion_hidden_size = config.fusion_hidden_size + + self.fov_encoder = DepthProFovEncoder(config) + self.conv = nn.Conv2d( + self.fusion_hidden_size, self.fusion_hidden_size // 2, kernel_size=3, stride=2, padding=1 + ) + self.activation = nn.ReLU(inplace=True) + self.head = DepthProFovHead(config) + + def forward( + self, + pixel_values: 
torch.Tensor, + global_features: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + fov_features = self.fov_encoder(pixel_values, head_mask) + + global_features = self.conv(global_features) + global_features = self.activation(global_features) + + fov_features = fov_features + global_features + fov_output = self.head(fov_features) + fov_output = fov_output.flatten() + + return fov_output + + +class DepthProDepthEstimationHead(nn.Module): + """ + The DepthProDepthEstimationHead module serves as the output head for depth estimation tasks. + This module comprises a sequence of convolutional and transposed convolutional layers + that process the feature map from the fusion to produce a single-channel depth map. + Key operations include dimensionality reduction and upsampling to match the input resolution. + """ + + def __init__(self, config): + super().__init__() + self.config = config + + features = config.fusion_hidden_size + self.layers = nn.ModuleList( + [ + nn.Conv2d(features, features // 2, kernel_size=3, stride=1, padding=1), + nn.ConvTranspose2d( + in_channels=features // 2, + out_channels=features // 2, + kernel_size=2, + stride=2, + padding=0, + bias=True, + ), + nn.Conv2d(features // 2, 32, kernel_size=3, stride=1, padding=1), + nn.ReLU(True), + nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0), + nn.ReLU(), + ] + ) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + for layer in self.layers: + hidden_states = layer(hidden_states) + + predicted_depth = hidden_states.squeeze(dim=1) + return predicted_depth + + +@auto_docstring( + custom_intro=""" + DepthPro Model with a depth estimation head on top (consisting of 3 convolutional layers). + """ +) +class DepthProForDepthEstimation(DepthProPreTrainedModel): + def __init__(self, config, use_fov_model=None): + r""" + use_fov_model (bool, *optional*): + Whether to use the field of view model. + """ + super().__init__(config) + self.config = config + self.use_fov_model = use_fov_model if use_fov_model is not None else self.config.use_fov_model + + # dinov2 (vit) like encoders + self.depth_pro = DepthProModel(config) + + # dpt (vit) like fusion stage + self.fusion_stage = DepthProFeatureFusionStage(config) + + # depth estimation head + self.head = DepthProDepthEstimationHead(config) + + # dinov2 (vit) like encoder + self.fov_model = DepthProFovModel(config) if self.use_fov_model else None + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + pixel_values: torch.FloatTensor, + head_mask: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], DepthProDepthEstimatorOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*): + Ground truth depth estimation maps for computing the loss. 
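+            Note that loss computation is not implemented yet, so passing `labels` currently raises a
+            `NotImplementedError`.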
+ + Examples: + + ```python + >>> from transformers import AutoImageProcessor, DepthProForDepthEstimation + >>> import torch + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> checkpoint = "apple/DepthPro-hf" + >>> processor = AutoImageProcessor.from_pretrained(checkpoint) + >>> model = DepthProForDepthEstimation.from_pretrained(checkpoint) + + >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + >>> model.to(device) + + >>> # prepare image for the model + >>> inputs = processor(images=image, return_tensors="pt").to(device) + + >>> with torch.no_grad(): + ... outputs = model(**inputs) + + >>> # interpolate to original size + >>> post_processed_output = processor.post_process_depth_estimation( + ... outputs, target_sizes=[(image.height, image.width)], + ... ) + + >>> # get the field of view (fov) predictions + >>> field_of_view = post_processed_output[0]["field_of_view"] + >>> focal_length = post_processed_output[0]["focal_length"] + + >>> # visualize the prediction + >>> predicted_depth = post_processed_output[0]["predicted_depth"] + >>> depth = predicted_depth * 255 / predicted_depth.max() + >>> depth = depth.detach().cpu().numpy() + >>> depth = Image.fromarray(depth.astype("uint8")) + ```""" + loss = None + if labels is not None: + raise NotImplementedError("Training is not implemented yet") + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + + depth_pro_outputs = self.depth_pro( + pixel_values=pixel_values, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + ) + features = depth_pro_outputs.features + fused_hidden_states = self.fusion_stage(features) + predicted_depth = self.head(fused_hidden_states[-1]) + + if self.use_fov_model: + # frozen features from encoder are used + features_for_fov = features[0].detach() + fov = self.fov_model( + pixel_values=pixel_values, + global_features=features_for_fov, + head_mask=head_mask, + ) + else: + fov = None + + if not return_dict: + outputs = [loss, predicted_depth, fov, depth_pro_outputs.hidden_states, depth_pro_outputs.attentions] + return tuple(v for v in outputs if v is not None) + + return DepthProDepthEstimatorOutput( + loss=loss, + predicted_depth=predicted_depth, + field_of_view=fov, + hidden_states=depth_pro_outputs.hidden_states, + attentions=depth_pro_outputs.attentions, + ) + + +__all__ = ["DepthProPreTrainedModel", "DepthProModel", "DepthProForDepthEstimation"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dinov3_convnext/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dinov3_convnext/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8839dc7cec78158801a1b623c27dd46281aadb0d --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dinov3_convnext/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_dinov3_convnext import * + from .modeling_dinov3_convnext import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dinov3_convnext/configuration_dinov3_convnext.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dinov3_convnext/configuration_dinov3_convnext.py new file mode 100644 index 0000000000000000000000000000000000000000..fa593e10ec1a685091234ee7966873570716276e --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dinov3_convnext/configuration_dinov3_convnext.py @@ -0,0 +1,103 @@ +# coding=utf-8 +# Copyright 2025 Meta Platforms, Inc. and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""ConvNeXT model configuration""" + +from typing import Optional + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class DINOv3ConvNextConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`DINOv3ConvNextModel`]. It is used to instantiate an + DINOv3ConvNext model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the DINOv3ConvNext + [facebook/dinov3-convnext-tiny-pretrain-lvd1689m](https://huggingface.co/facebook/dinov3-convnext-tiny-pretrain-lvd1689m) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + hidden_sizes (`list[int]`, *optional*, defaults to [96, 192, 384, 768]): + Dimensionality (hidden size) at each stage. + depths (`list[int]`, *optional*, defaults to [3, 3, 9, 3]): + The number of layers for each stage. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in each block. If string, `"gelu"`, `"relu"`, + `"selu"` and `"gelu_new"` are supported. 
+ initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the layer normalization layers. + layer_scale_init_value (`float`, *optional*, defaults to 1e-06): + The initial value for the layer scale. + drop_path_rate (`float`, *optional*, defaults to 0.0): + The drop rate for stochastic depth. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of input images. + + Example: + ```python + >>> from transformers import DINOv3ConvNextConfig, DINOv3ConvNextModel + + >>> # Initializing a DINOv3ConvNext (tiny variant) style configuration + >>> config = DINOv3ConvNextConfig() + + >>> # Initializing a model (with random weights) + >>> model = DINOv3ConvNextModel(config) + + >>> # Accessing the model config + >>> config = model.config + ```""" + + model_type = "dinov3_convnext" + + def __init__( + self, + num_channels: int = 3, + hidden_sizes: Optional[list[int]] = None, + depths: Optional[list[int]] = None, + hidden_act: str = "gelu", + initializer_range: float = 0.02, + layer_norm_eps: float = 1e-6, + layer_scale_init_value: float = 1e-6, + drop_path_rate: float = 0.0, + image_size: int = 224, + **kwargs, + ): + super().__init__(**kwargs) + + self.num_channels = num_channels + self.hidden_sizes = [96, 192, 384, 768] if hidden_sizes is None else hidden_sizes + self.depths = [3, 3, 9, 3] if depths is None else depths + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.layer_scale_init_value = layer_scale_init_value + self.drop_path_rate = drop_path_rate + self.image_size = image_size + + @property + def num_stages(self) -> int: + return len(self.hidden_sizes) + + +__all__ = ["DINOv3ConvNextConfig"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dinov3_convnext/modeling_dinov3_convnext.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dinov3_convnext/modeling_dinov3_convnext.py new file mode 100644 index 0000000000000000000000000000000000000000..df2ef491192c297fae0cad62ce5f6c626b48b51c --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dinov3_convnext/modeling_dinov3_convnext.py @@ -0,0 +1,260 @@ +# coding=utf-8 +# Copyright 2025 Meta Platforms, Inc. and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""PyTorch ConvNext model.""" + +from typing import Optional + +import numpy as np +import torch +from torch import nn + +from ...activations import ACT2FN +from ...modeling_outputs import ( + BaseModelOutputWithPoolingAndNoAttention, +) +from ...modeling_utils import PreTrainedModel +from ...utils import auto_docstring, logging +from ...utils.generic import can_return_tuple +from .configuration_dinov3_convnext import DINOv3ConvNextConfig + + +logger = logging.get_logger(__name__) + + +# Copied from transformers.models.beit.modeling_beit.drop_path +def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor: + """ + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks, + however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the + layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the + argument. + """ + if drop_prob == 0.0 or not training: + return input + keep_prob = 1 - drop_prob + shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device) + random_tensor.floor_() # binarize + output = input.div(keep_prob) * random_tensor + return output + + +# Copied from transformers.models.convnext.modeling_convnext.ConvNextDropPath with ConvNext->DINOv3ConvNext +class DINOv3ConvNextDropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob: Optional[float] = None) -> None: + super().__init__() + self.drop_prob = drop_prob + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + return drop_path(hidden_states, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return f"p={self.drop_prob}" + + +class DINOv3ConvNextLayerNorm(nn.LayerNorm): + r"""LayerNorm that supports two data formats: channels_last (default) or channels_first. + The ordering of the dimensions in the inputs. channels_last corresponds to inputs with shape (batch_size, height, + width, channels) while channels_first corresponds to inputs with shape (batch_size, channels, height, width). + """ + + def __init__(self, *args, data_format="channels_last", **kwargs): + super().__init__(*args, **kwargs) + if data_format not in ["channels_last", "channels_first"]: + raise NotImplementedError(f"Unsupported data format: {data_format}") + self.data_format = data_format + + def forward(self, features: torch.Tensor) -> torch.Tensor: + """ + Args: + features: Tensor of shape (batch_size, channels, height, width) OR (batch_size, height, width, channels) + """ + if self.data_format == "channels_first": + features = features.permute(0, 2, 3, 1) + features = super().forward(features) + features = features.permute(0, 3, 1, 2) + else: + features = super().forward(features) + return features + + +class DINOv3ConvNextLayer(nn.Module): + """This corresponds to the `Block` class in the original implementation. 
+ + There are two equivalent implementations: + 1) DwConv, LayerNorm (channels_first), Conv, GELU, Conv (all in (N, C, H, W) format) + 2) DwConv, Permute, LayerNorm (channels_last), Linear, GELU, Linear, Permute + + The authors used (2) as they find it slightly faster in PyTorch. + + Args: + config ([`DINOv3ConvNextConfig`]): + Model config. + channels (`int`): + Number of input (and output) channels. + drop_path (`float`): + Drop path rate. Default: 0.0. + """ + + def __init__(self, config: DINOv3ConvNextConfig, channels: int, drop_path: float = 0.0): + super().__init__() + self.depthwise_conv = nn.Conv2d(channels, channels, kernel_size=7, padding=3, groups=channels) + self.layer_norm = DINOv3ConvNextLayerNorm(channels, eps=config.layer_norm_eps) + self.pointwise_conv1 = nn.Linear(channels, 4 * channels) # can be seen as a 1x1 conv + self.activation_fn = ACT2FN[config.hidden_act] + self.pointwise_conv2 = nn.Linear(4 * channels, channels) # can be seen as a 1x1 conv + self.gamma = nn.Parameter(torch.full((channels,), config.layer_scale_init_value), requires_grad=True) + self.drop_path = DINOv3ConvNextDropPath(drop_path) if drop_path > 0.0 else nn.Identity() + + def forward(self, features: torch.Tensor) -> torch.Tensor: + """ + Args: + features: Tensor of shape (batch_size, channels, height, width) + """ + residual = features + features = self.depthwise_conv(features) + features = features.permute(0, 2, 3, 1) # to channels last + features = self.layer_norm(features) + features = self.pointwise_conv1(features) + features = self.activation_fn(features) + features = self.pointwise_conv2(features) + features = features * self.gamma + features = features.permute(0, 3, 1, 2) # back to channels first + features = residual + self.drop_path(features) + return features + + +class DINOv3ConvNextStage(nn.Module): + """ """ + + def __init__(self, config: DINOv3ConvNextConfig, stage_idx: int): + super().__init__() + + in_channels = config.hidden_sizes[stage_idx - 1] if stage_idx > 0 else config.num_channels + out_channels = config.hidden_sizes[stage_idx] + + if stage_idx == 0: + self.downsample_layers = nn.ModuleList( + [ + nn.Conv2d(config.num_channels, out_channels, kernel_size=4, stride=4), + DINOv3ConvNextLayerNorm(out_channels, eps=config.layer_norm_eps, data_format="channels_first"), + ] + ) + else: + self.downsample_layers = nn.ModuleList( + [ + DINOv3ConvNextLayerNorm(in_channels, eps=config.layer_norm_eps, data_format="channels_first"), + nn.Conv2d(in_channels, out_channels, kernel_size=2, stride=2), + ] + ) + + num_stage_layers = config.depths[stage_idx] + num_previous_layers = sum(config.depths[:stage_idx]) + num_total_layers = sum(config.depths) + drop_path_rates = np.linspace(0, config.drop_path_rate, num_total_layers).tolist() + + self.layers = nn.ModuleList( + [ + DINOv3ConvNextLayer(config, channels=out_channels, drop_path=drop_path_rates[i]) + for i in range(num_previous_layers, num_previous_layers + num_stage_layers) + ] + ) + + def forward(self, features: torch.Tensor) -> torch.Tensor: + """ + Args: + features: Tensor of shape (batch_size, channels, height, width) + """ + for layer in self.downsample_layers: + features = layer(features) + for layer in self.layers: + features = layer(features) + return features + + +@auto_docstring +class DINOv3ConvNextPreTrainedModel(PreTrainedModel): + config: DINOv3ConvNextConfig + base_model_prefix = "dinov3_convnext" + main_input_name = "pixel_values" + _no_split_modules = ["DINOv3ConvNextLayer"] + + def _init_weights(self, module): + """Initialize 
the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, (nn.LayerNorm, DINOv3ConvNextLayerNorm)): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, DINOv3ConvNextLayer): + if module.gamma is not None: + module.gamma.data.fill_(self.config.layer_scale_init_value) + + +@auto_docstring +class DINOv3ConvNextModel(DINOv3ConvNextPreTrainedModel): + def __init__(self, config: DINOv3ConvNextConfig): + super().__init__(config) + self.config = config + self.stages = nn.ModuleList([DINOv3ConvNextStage(config, stage_idx) for stage_idx in range(config.num_stages)]) + self.layer_norm = nn.LayerNorm(config.hidden_sizes[-1], eps=config.layer_norm_eps) # final norm layer + self.pool = nn.AdaptiveAvgPool2d(1) + self.post_init() + + @can_return_tuple + @auto_docstring + def forward( + self, pixel_values: torch.FloatTensor, output_hidden_states: Optional[bool] = None + ) -> BaseModelOutputWithPoolingAndNoAttention: + hidden_states = pixel_values + + output_hidden_states = output_hidden_states or self.config.output_hidden_states + all_hidden_states = [hidden_states] if output_hidden_states else [] + + for stage in self.stages: + hidden_states = stage(hidden_states) + + # store intermediate stage outputs + if output_hidden_states: + all_hidden_states.append(hidden_states) + + # make global representation, a.k.a [CLS] token + pooled_output = self.pool(hidden_states) + + # (batch_size, channels, height, width) -> (batch_size, height * width, channels) + pooled_output = pooled_output.flatten(2).transpose(1, 2) + hidden_states = hidden_states.flatten(2).transpose(1, 2) + + # concat "cls" and "patch tokens" as (batch_size, 1 + height * width, channels) + hidden_states = torch.cat([pooled_output, hidden_states], dim=1) + hidden_states = self.layer_norm(hidden_states) + + return BaseModelOutputWithPoolingAndNoAttention( + last_hidden_state=hidden_states, + pooler_output=hidden_states[:, 0], + hidden_states=tuple(all_hidden_states) if output_hidden_states else None, + ) + + +__all__ = ["DINOv3ConvNextModel", "DINOv3ConvNextPreTrainedModel"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dinov3_vit/modeling_dinov3_vit.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dinov3_vit/modeling_dinov3_vit.py new file mode 100644 index 0000000000000000000000000000000000000000..bca25dcc1c2b9e3ffa271e13f60d2a27bf8ad409 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dinov3_vit/modeling_dinov3_vit.py @@ -0,0 +1,538 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/dinov3_vit/modular_dinov3_vit.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_dinov3_vit.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2025 Meta AI and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from typing import Callable, Optional + +import numpy as np +import torch +from torch import nn + +from ...activations import ACT2FN +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import BaseModelOutputWithPooling +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...processing_utils import Unpack +from ...pytorch_utils import compile_compatible_method_lru_cache +from ...utils import TransformersKwargs, auto_docstring +from ...utils.generic import check_model_inputs +from .configuration_dinov3_vit import DINOv3ViTConfig + + +class DINOv3ViTEmbeddings(nn.Module): + """ + Construct the CLS token, mask token, position and patch embeddings. + """ + + def __init__(self, config: DINOv3ViTConfig): + super().__init__() + self.config = config + self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size)) + self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) + self.register_tokens = nn.Parameter(torch.empty(1, config.num_register_tokens, config.hidden_size)) + self.patch_embeddings = nn.Conv2d( + config.num_channels, config.hidden_size, kernel_size=config.patch_size, stride=config.patch_size + ) + + def forward(self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.Tensor] = None) -> torch.Tensor: + batch_size = pixel_values.shape[0] + target_dtype = self.patch_embeddings.weight.dtype + + # (batch_size, num_channels, height, width) -> (batch_size, num_patches, hidden_size) + patch_embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype)) + patch_embeddings = patch_embeddings.flatten(2).transpose(1, 2) + + if bool_masked_pos is not None: + mask_token = self.mask_token.to(patch_embeddings.dtype) + patch_embeddings = torch.where(bool_masked_pos.unsqueeze(-1), mask_token, patch_embeddings) + + # Add CLS and register tokens + cls_token = self.cls_token.expand(batch_size, -1, -1) + register_tokens = self.register_tokens.expand(batch_size, -1, -1) + embeddings = torch.cat([cls_token, register_tokens, patch_embeddings], dim=1) + + return embeddings + + +@compile_compatible_method_lru_cache(maxsize=32) +def get_patches_center_coordinates( + num_patches_h: int, num_patches_w: int, dtype: torch.dtype, device: torch.device +) -> torch.Tensor: + """ + Computes the 2D coordinates of the centers of image patches, normalized to the range [-1, +1]. + The center of each patch is exactly halfway between its top-left and bottom-right corners. + + Args: + num_patches_h (int): Number of patches along the vertical (height) axis. + num_patches_w (int): Number of patches along the horizontal (width) axis. + dtype (torch.dtype): The desired data type of the returned tensor. + + Returns: + torch.Tensor: A tensor of shape (height * width, 2), where each row contains the (y, x) + coordinates of a patch center, normalized to [-1, +1]. 
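+
+    Worked example: for `num_patches_h=2` and `num_patches_w=2`, the patch centers along each axis are
+    0.5 and 1.5 (in patch units), which normalize to 0.25 and 0.75 and rescale to -0.5 and +0.5, giving
+    the coordinates (-0.5, -0.5), (-0.5, +0.5), (+0.5, -0.5) and (+0.5, +0.5).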
+ """ + coords_h = torch.arange(0.5, num_patches_h, dtype=dtype, device=device) + coords_w = torch.arange(0.5, num_patches_w, dtype=dtype, device=device) + coords_h = coords_h / num_patches_h + coords_w = coords_w / num_patches_w + # (height, width, 2) -> (height * width, 2) + coords = torch.stack(torch.meshgrid(coords_h, coords_w, indexing="ij"), dim=-1) + coords = coords.flatten(0, 1) + # Shift range [0, 1] to [-1, +1] + coords = 2.0 * coords - 1.0 + return coords + + +def augment_patches_center_coordinates( + coords: torch.Tensor, + shift: Optional[float] = None, + jitter: Optional[float] = None, + rescale: Optional[float] = None, +) -> torch.Tensor: + # Shift coords by adding a uniform value in [-shift, shift] + if shift is not None: + shift_hw = torch.empty((1, 2), device=coords.device, dtype=coords.dtype) + shift_hw = shift_hw.uniform_(-shift, shift) + coords = coords + shift_hw + + # Jitter coords by multiplying the range [-1, 1] by a log-uniform value in [1/jitter, jitter] + if jitter is not None: + jitter_range = np.log(jitter) + jitter_hw = torch.empty((1, 2), device=coords.device, dtype=coords.dtype) + jitter_hw = jitter_hw.uniform_(-jitter_range, jitter_range).exp() + coords = coords * jitter_hw + + # Rescale coords by multiplying the range [-1, 1] by a log-uniform value in [1/rescale, rescale] + if rescale is not None: + rescale_range = np.log(rescale) + rescale_hw = torch.empty(1, device=coords.device, dtype=coords.dtype) + rescale_hw = rescale_hw.uniform_(-rescale_range, rescale_range).exp() + coords = coords * rescale_hw + + return coords + + +class DINOv3ViTRopePositionEmbedding(nn.Module): + inv_freq: torch.Tensor + + def __init__(self, config: DINOv3ViTConfig): + super().__init__() + + self.config = config + self.base = config.rope_theta + self.head_dim = config.hidden_size // config.num_attention_heads + self.num_patches_h = config.image_size // config.patch_size + self.num_patches_w = config.image_size // config.patch_size + + inv_freq = 1 / self.base ** torch.arange(0, 1, 4 / self.head_dim, dtype=torch.float32) # (head_dim / 4,) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + def forward(self, pixel_values: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + _, _, height, width = pixel_values.shape + num_patches_h = height // self.config.patch_size + num_patches_w = width // self.config.patch_size + + device = pixel_values.device + device_type = device.type if isinstance(device.type, str) and device.type != "mps" else "cpu" + + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + # Although we could precompute static patch_coords from image_size and patch_size in the config, + # the model was trained with random_scale, so it can process images of varying sizes. + # Therefore, it's better to compute patch_coords dynamically (with lru_cache). 
+ patch_coords = get_patches_center_coordinates( + num_patches_h, num_patches_w, dtype=torch.float32, device=device + ) + if self.training: + patch_coords = augment_patches_center_coordinates( + patch_coords, + shift=self.config.pos_embed_shift, + jitter=self.config.pos_embed_jitter, + rescale=self.config.pos_embed_rescale, + ) + + # (height * width, 2, head_dim / 4) -> (height * width, head_dim / 2) -> (height * width, head_dim) + angles = 2 * math.pi * patch_coords[:, :, None] * self.inv_freq[None, None, :] + angles = angles.flatten(1, 2) + angles = angles.tile(2) + + cos = torch.cos(angles) + sin = torch.sin(angles) + + dtype = pixel_values.dtype + return cos.to(dtype=dtype), sin.to(dtype=dtype) + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + dropout: float = 0.0, + **kwargs, +): + # Take the dot product between "query" and "key" to get the raw attention scores. + attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling + + # Normalize the attention scores to probabilities. + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + + # Mask heads if we want to + if attention_mask is not None: + attn_weights = attn_weights * attention_mask + + attn_output = torch.matmul(attn_weights, value) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +def apply_rotary_pos_emb( + q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor, **kwargs +) -> tuple[torch.Tensor, torch.Tensor]: + """Applies Rotary Position Embedding to the query and key tensors, but only to the patch tokens, + ignoring the prefix tokens (cls token and register tokens). + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + + num_tokens = q.shape[-2] + num_patches = sin.shape[-2] + num_prefix_tokens = num_tokens - num_patches # cls token + register tokens + + q_prefix_tokens, q_patches = q.split((num_prefix_tokens, num_patches), dim=-2) + k_prefix_tokens, k_patches = k.split((num_prefix_tokens, num_patches), dim=-2) + + # apply rope only to patch tokens + q_patches = (q_patches * cos) + (rotate_half(q_patches) * sin) + k_patches = (k_patches * cos) + (rotate_half(k_patches) * sin) + + q = torch.cat((q_prefix_tokens, q_patches), dim=-2) + k = torch.cat((k_prefix_tokens, k_patches), dim=-2) + + return q, k + + +class DINOv3ViTAttention(nn.Module): + """ + Multi-headed attention compatible with ALL_ATTENTION_FUNCTIONS. 
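+
+    Rotary position embeddings (`position_embeddings`, a `(cos, sin)` tuple) are applied to the query and
+    key projections of the patch tokens only; the class and register tokens are left unrotated
+    (see `apply_rotary_pos_emb`).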
+ """ + + def __init__(self, config: DINOv3ViTConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + self.is_causal = False + + self.scaling = self.head_dim**-0.5 + self.is_causal = False + + self.dropout = config.attention_dropout + self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.key_bias) + self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.value_bias) + + self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.query_bias) + self.o_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.proj_bias) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + """Input shape: Batch x Time x Channel""" + + batch_size, patches, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(batch_size, patches, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(batch_size, patches, self.num_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(batch_size, patches, self.num_heads, self.head_dim).transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=0.0 if not self.training else self.dropout, + scaling=self.scaling, + **kwargs, + ) + + attn_output = attn_output.reshape(batch_size, patches, -1).contiguous() + attn_output = self.o_proj(attn_output) + + return attn_output, attn_weights + + +class DINOv3ViTLayerScale(nn.Module): + def __init__(self, config) -> None: + super().__init__() + self.lambda1 = nn.Parameter(config.layerscale_value * torch.ones(config.hidden_size)) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + return hidden_state * self.lambda1 + + +def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor: + """ + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks, + however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the + layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the + argument. 
+ """ + if drop_prob == 0.0 or not training: + return input + keep_prob = 1 - drop_prob + shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device) + random_tensor.floor_() # binarize + output = input.div(keep_prob) * random_tensor + return output + + +class DINOv3ViTDropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob: Optional[float] = None) -> None: + super().__init__() + self.drop_prob = drop_prob + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + return drop_path(hidden_states, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return f"p={self.drop_prob}" + + +class DINOv3ViTMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + return self.down_proj(self.act_fn(self.up_proj(x))) + + +class DINOv3ViTGatedMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + return down_proj + + +class DINOv3ViTLayer(GradientCheckpointingLayer): + """This corresponds to the Block class in the original implementation.""" + + def __init__(self, config: DINOv3ViTConfig): + super().__init__() + + self.norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.attention = DINOv3ViTAttention(config) + self.layer_scale1 = DINOv3ViTLayerScale(config) + self.drop_path = DINOv3ViTDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity() + + self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + if config.use_gated_mlp: + self.mlp = DINOv3ViTGatedMLP(config) + else: + self.mlp = DINOv3ViTMLP(config) + self.layer_scale2 = DINOv3ViTLayerScale(config) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, + ) -> torch.Tensor: + # Attention with residual connection + residual = hidden_states + hidden_states = self.norm1(hidden_states) + hidden_states, _ = self.attention( + hidden_states, + attention_mask=attention_mask, + position_embeddings=position_embeddings, + ) + hidden_states = self.layer_scale1(hidden_states) + hidden_states = self.drop_path(hidden_states) + residual + + # MLP with residual connection + residual = hidden_states + hidden_states = self.norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = self.layer_scale2(hidden_states) + hidden_states = self.drop_path(hidden_states) + residual + + return hidden_states + + 
+@auto_docstring +class DINOv3ViTPreTrainedModel(PreTrainedModel): + config: DINOv3ViTConfig + base_model_prefix = "dinov3_vit" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + _no_split_modules = ["DINOv3ViTLayer"] + _supports_sdpa = True + _supports_flash_attn = True + _supports_flex_attn = True + _supports_attention_backend = True + _can_record_outputs = { + "hidden_states": DINOv3ViTLayer, + "attentions": DINOv3ViTAttention, + } + + def _init_weights(self, module) -> None: + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d)): + # Upcast the input in `fp32` and cast it back to desired `dtype` to avoid + # `trunc_normal_cpu` not implemented in `half` issues + module.weight.data = nn.init.trunc_normal_( + module.weight.data.to(torch.float32), + mean=0.0, + std=self.config.initializer_range, + ).to(module.weight.dtype) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, DINOv3ViTEmbeddings): + module.cls_token.data = nn.init.trunc_normal_( + module.cls_token.data.to(torch.float32), + mean=0.0, + std=self.config.initializer_range, + ).to(module.cls_token.dtype) + if module.config.num_register_tokens > 0: + module.register_tokens.data = nn.init.trunc_normal_( + module.register_tokens.data.to(torch.float32), + mean=0.0, + std=self.config.initializer_range, + ).to(module.register_tokens.dtype) + module.mask_token.data.zero_() + elif isinstance(module, DINOv3ViTLayerScale): + module.lambda1.data.fill_(self.config.layerscale_value) + + +@auto_docstring +class DINOv3ViTModel(DINOv3ViTPreTrainedModel): + def __init__(self, config: DINOv3ViTConfig): + super().__init__(config) + self.config = config + self.embeddings = DINOv3ViTEmbeddings(config) + self.rope_embeddings = DINOv3ViTRopePositionEmbedding(config) + self.layer = nn.ModuleList([DINOv3ViTLayer(config) for _ in range(config.num_hidden_layers)]) + self.norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.patch_embeddings + + @check_model_inputs(tie_last_hidden_states=False) + @auto_docstring + def forward( + self, + pixel_values: torch.Tensor, + bool_masked_pos: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> BaseModelOutputWithPooling: + r""" + bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`): + Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). Only relevant for + pre-training. 
+ """ + + pixel_values = pixel_values.to(self.embeddings.patch_embeddings.weight.dtype) + hidden_states = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos) + position_embeddings = self.rope_embeddings(pixel_values) + + for i, layer_module in enumerate(self.layer): + layer_head_mask = head_mask[i] if head_mask is not None else None + hidden_states = layer_module( + hidden_states, + attention_mask=layer_head_mask, + position_embeddings=position_embeddings, + ) + + sequence_output = self.norm(hidden_states) + pooled_output = sequence_output[:, 0, :] + + return BaseModelOutputWithPooling( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + ) + + +__all__ = ["DINOv3ViTModel", "DINOv3ViTPreTrainedModel"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dinov3_vit/modular_dinov3_vit.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dinov3_vit/modular_dinov3_vit.py new file mode 100644 index 0000000000000000000000000000000000000000..88597336fb191be44ff2825d152f9a3551b47b28 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dinov3_vit/modular_dinov3_vit.py @@ -0,0 +1,428 @@ +# coding=utf-8 +# Copyright 2025 Meta AI and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch DINOv3 model.""" + +import math +from typing import Callable, Optional + +import numpy as np +import torch +from torch import nn + +from transformers.models.arcee.modeling_arcee import ArceeMLP +from transformers.models.dinov2.modeling_dinov2 import ( + Dinov2DropPath, + Dinov2LayerScale, + Dinov2PreTrainedModel, + eager_attention_forward, +) +from transformers.models.llama.modeling_llama import LlamaMLP +from transformers.models.pixtral.modeling_pixtral import PixtralAttention, rotate_half + +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import BaseModelOutputWithPooling +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS +from ...processing_utils import Unpack +from ...pytorch_utils import compile_compatible_method_lru_cache +from ...utils import TransformersKwargs, auto_docstring, logging +from ...utils.generic import check_model_inputs +from .configuration_dinov3_vit import DINOv3ViTConfig + + +logger = logging.get_logger(__name__) + + +class DINOv3ViTEmbeddings(nn.Module): + """ + Construct the CLS token, mask token, position and patch embeddings. 
+ """ + + def __init__(self, config: DINOv3ViTConfig): + super().__init__() + self.config = config + self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size)) + self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) + self.register_tokens = nn.Parameter(torch.empty(1, config.num_register_tokens, config.hidden_size)) + self.patch_embeddings = nn.Conv2d( + config.num_channels, config.hidden_size, kernel_size=config.patch_size, stride=config.patch_size + ) + + def forward(self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.Tensor] = None) -> torch.Tensor: + batch_size = pixel_values.shape[0] + target_dtype = self.patch_embeddings.weight.dtype + + # (batch_size, num_channels, height, width) -> (batch_size, num_patches, hidden_size) + patch_embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype)) + patch_embeddings = patch_embeddings.flatten(2).transpose(1, 2) + + if bool_masked_pos is not None: + mask_token = self.mask_token.to(patch_embeddings.dtype) + patch_embeddings = torch.where(bool_masked_pos.unsqueeze(-1), mask_token, patch_embeddings) + + # Add CLS and register tokens + cls_token = self.cls_token.expand(batch_size, -1, -1) + register_tokens = self.register_tokens.expand(batch_size, -1, -1) + embeddings = torch.cat([cls_token, register_tokens, patch_embeddings], dim=1) + + return embeddings + + +@compile_compatible_method_lru_cache(maxsize=32) +def get_patches_center_coordinates( + num_patches_h: int, num_patches_w: int, dtype: torch.dtype, device: torch.device +) -> torch.Tensor: + """ + Computes the 2D coordinates of the centers of image patches, normalized to the range [-1, +1]. + The center of each patch is exactly halfway between its top-left and bottom-right corners. + + Args: + num_patches_h (int): Number of patches along the vertical (height) axis. + num_patches_w (int): Number of patches along the horizontal (width) axis. + dtype (torch.dtype): The desired data type of the returned tensor. + + Returns: + torch.Tensor: A tensor of shape (height * width, 2), where each row contains the (y, x) + coordinates of a patch center, normalized to [-1, +1]. 
+ """ + coords_h = torch.arange(0.5, num_patches_h, dtype=dtype, device=device) + coords_w = torch.arange(0.5, num_patches_w, dtype=dtype, device=device) + coords_h = coords_h / num_patches_h + coords_w = coords_w / num_patches_w + # (height, width, 2) -> (height * width, 2) + coords = torch.stack(torch.meshgrid(coords_h, coords_w, indexing="ij"), dim=-1) + coords = coords.flatten(0, 1) + # Shift range [0, 1] to [-1, +1] + coords = 2.0 * coords - 1.0 + return coords + + +def augment_patches_center_coordinates( + coords: torch.Tensor, + shift: Optional[float] = None, + jitter: Optional[float] = None, + rescale: Optional[float] = None, +) -> torch.Tensor: + # Shift coords by adding a uniform value in [-shift, shift] + if shift is not None: + shift_hw = torch.empty((1, 2), device=coords.device, dtype=coords.dtype) + shift_hw = shift_hw.uniform_(-shift, shift) + coords = coords + shift_hw + + # Jitter coords by multiplying the range [-1, 1] by a log-uniform value in [1/jitter, jitter] + if jitter is not None: + jitter_range = np.log(jitter) + jitter_hw = torch.empty((1, 2), device=coords.device, dtype=coords.dtype) + jitter_hw = jitter_hw.uniform_(-jitter_range, jitter_range).exp() + coords = coords * jitter_hw + + # Rescale coords by multiplying the range [-1, 1] by a log-uniform value in [1/rescale, rescale] + if rescale is not None: + rescale_range = np.log(rescale) + rescale_hw = torch.empty(1, device=coords.device, dtype=coords.dtype) + rescale_hw = rescale_hw.uniform_(-rescale_range, rescale_range).exp() + coords = coords * rescale_hw + + return coords + + +class DINOv3ViTRopePositionEmbedding(nn.Module): + inv_freq: torch.Tensor + + def __init__(self, config: DINOv3ViTConfig): + super().__init__() + + self.config = config + self.base = config.rope_theta + self.head_dim = config.hidden_size // config.num_attention_heads + self.num_patches_h = config.image_size // config.patch_size + self.num_patches_w = config.image_size // config.patch_size + + inv_freq = 1 / self.base ** torch.arange(0, 1, 4 / self.head_dim, dtype=torch.float32) # (head_dim / 4,) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + def forward(self, pixel_values: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + _, _, height, width = pixel_values.shape + num_patches_h = height // self.config.patch_size + num_patches_w = width // self.config.patch_size + + device = pixel_values.device + device_type = device.type if isinstance(device.type, str) and device.type != "mps" else "cpu" + + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + # Although we could precompute static patch_coords from image_size and patch_size in the config, + # the model was trained with random_scale, so it can process images of varying sizes. + # Therefore, it's better to compute patch_coords dynamically (with lru_cache). 
+ patch_coords = get_patches_center_coordinates( + num_patches_h, num_patches_w, dtype=torch.float32, device=device + ) + if self.training: + patch_coords = augment_patches_center_coordinates( + patch_coords, + shift=self.config.pos_embed_shift, + jitter=self.config.pos_embed_jitter, + rescale=self.config.pos_embed_rescale, + ) + + # (height * width, 2, head_dim / 4) -> (height * width, head_dim / 2) -> (height * width, head_dim) + angles = 2 * math.pi * patch_coords[:, :, None] * self.inv_freq[None, None, :] + angles = angles.flatten(1, 2) + angles = angles.tile(2) + + cos = torch.cos(angles) + sin = torch.sin(angles) + + dtype = pixel_values.dtype + return cos.to(dtype=dtype), sin.to(dtype=dtype) + + +def apply_rotary_pos_emb( + q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor, **kwargs +) -> tuple[torch.Tensor, torch.Tensor]: + """Applies Rotary Position Embedding to the query and key tensors, but only to the patch tokens, + ignoring the prefix tokens (cls token and register tokens). + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + + num_tokens = q.shape[-2] + num_patches = sin.shape[-2] + num_prefix_tokens = num_tokens - num_patches # cls token + register tokens + + q_prefix_tokens, q_patches = q.split((num_prefix_tokens, num_patches), dim=-2) + k_prefix_tokens, k_patches = k.split((num_prefix_tokens, num_patches), dim=-2) + + # apply rope only to patch tokens + q_patches = (q_patches * cos) + (rotate_half(q_patches) * sin) + k_patches = (k_patches * cos) + (rotate_half(k_patches) * sin) + + q = torch.cat((q_prefix_tokens, q_patches), dim=-2) + k = torch.cat((k_prefix_tokens, k_patches), dim=-2) + + return q, k + + +class DINOv3ViTAttention(PixtralAttention): + def __init__(self, config: DINOv3ViTConfig): + super().__init__(config) + + self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.query_bias) + self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.key_bias) + self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.value_bias) + self.o_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.proj_bias) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + """Input shape: Batch x Time x Channel""" + + batch_size, patches, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(batch_size, patches, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(batch_size, patches, self.num_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(batch_size, patches, self.num_heads, self.head_dim).transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, 
attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=0.0 if not self.training else self.dropout, + scaling=self.scaling, + **kwargs, + ) + + attn_output = attn_output.reshape(batch_size, patches, -1).contiguous() + attn_output = self.o_proj(attn_output) + + return attn_output, attn_weights + + +class DINOv3ViTLayerScale(Dinov2LayerScale): + pass + + +class DINOv3ViTDropPath(Dinov2DropPath): + pass + + +class DINOv3ViTMLP(ArceeMLP): + pass + + +class DINOv3ViTGatedMLP(LlamaMLP): + pass + + +class DINOv3ViTLayer(GradientCheckpointingLayer): + """This corresponds to the Block class in the original implementation.""" + + def __init__(self, config: DINOv3ViTConfig): + super().__init__() + + self.norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.attention = DINOv3ViTAttention(config) + self.layer_scale1 = DINOv3ViTLayerScale(config) + self.drop_path = DINOv3ViTDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity() + + self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + if config.use_gated_mlp: + self.mlp = DINOv3ViTGatedMLP(config) + else: + self.mlp = DINOv3ViTMLP(config) + self.layer_scale2 = DINOv3ViTLayerScale(config) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, + ) -> torch.Tensor: + # Attention with residual connection + residual = hidden_states + hidden_states = self.norm1(hidden_states) + hidden_states, _ = self.attention( + hidden_states, + attention_mask=attention_mask, + position_embeddings=position_embeddings, + ) + hidden_states = self.layer_scale1(hidden_states) + hidden_states = self.drop_path(hidden_states) + residual + + # MLP with residual connection + residual = hidden_states + hidden_states = self.norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = self.layer_scale2(hidden_states) + hidden_states = self.drop_path(hidden_states) + residual + + return hidden_states + + +@auto_docstring +class DINOv3ViTPreTrainedModel(Dinov2PreTrainedModel): + _can_record_outputs = { + "hidden_states": DINOv3ViTLayer, + "attentions": DINOv3ViTAttention, + } + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d)): + # Upcast the input in `fp32` and cast it back to desired `dtype` to avoid + # `trunc_normal_cpu` not implemented in `half` issues + module.weight.data = nn.init.trunc_normal_( + module.weight.data.to(torch.float32), + mean=0.0, + std=self.config.initializer_range, + ).to(module.weight.dtype) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, DINOv3ViTEmbeddings): + module.cls_token.data = nn.init.trunc_normal_( + module.cls_token.data.to(torch.float32), + mean=0.0, + std=self.config.initializer_range, + ).to(module.cls_token.dtype) + if module.config.num_register_tokens > 0: + module.register_tokens.data = nn.init.trunc_normal_( + module.register_tokens.data.to(torch.float32), + mean=0.0, + std=self.config.initializer_range, + ).to(module.register_tokens.dtype) + module.mask_token.data.zero_() + elif isinstance(module, DINOv3ViTLayerScale): + module.lambda1.data.fill_(self.config.layerscale_value) + + +@auto_docstring +class DINOv3ViTModel(DINOv3ViTPreTrainedModel): + def __init__(self, config: 
DINOv3ViTConfig): + super().__init__(config) + self.config = config + self.embeddings = DINOv3ViTEmbeddings(config) + self.rope_embeddings = DINOv3ViTRopePositionEmbedding(config) + self.layer = nn.ModuleList([DINOv3ViTLayer(config) for _ in range(config.num_hidden_layers)]) + self.norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.patch_embeddings + + @check_model_inputs(tie_last_hidden_states=False) + @auto_docstring + def forward( + self, + pixel_values: torch.Tensor, + bool_masked_pos: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> BaseModelOutputWithPooling: + r""" + bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`): + Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). Only relevant for + pre-training. + """ + + pixel_values = pixel_values.to(self.embeddings.patch_embeddings.weight.dtype) + hidden_states = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos) + position_embeddings = self.rope_embeddings(pixel_values) + + for i, layer_module in enumerate(self.layer): + layer_head_mask = head_mask[i] if head_mask is not None else None + hidden_states = layer_module( + hidden_states, + attention_mask=layer_head_mask, + position_embeddings=position_embeddings, + ) + + sequence_output = self.norm(hidden_states) + pooled_output = sequence_output[:, 0, :] + + return BaseModelOutputWithPooling( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + ) + + +__all__ = ["DINOv3ViTModel", "DINOv3ViTPreTrainedModel"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/distilbert/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/distilbert/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4d6fae2e0236e7619988f0cfa3502ed49d0f90b0 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/distilbert/__init__.py @@ -0,0 +1,31 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
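+# The import machinery below follows the lazy-loading pattern used across transformers model
+# packages: under `TYPE_CHECKING` the submodules are imported eagerly so static analyzers see
+# the real symbols, while at runtime the module is swapped for a `_LazyModule` that only
+# imports a submodule when one of its attributes is first accessed.
+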
+from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_distilbert import * + from .modeling_distilbert import * + from .modeling_flax_distilbert import * + from .modeling_tf_distilbert import * + from .tokenization_distilbert import * + from .tokenization_distilbert_fast import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/distilbert/configuration_distilbert.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/distilbert/configuration_distilbert.py new file mode 100644 index 0000000000000000000000000000000000000000..0aa6d2dfd7c589c891204b8301d7677b097e29d8 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/distilbert/configuration_distilbert.py @@ -0,0 +1,141 @@ +# coding=utf-8 +# Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""DistilBERT model configuration""" + +from collections import OrderedDict +from collections.abc import Mapping + +from ...configuration_utils import PretrainedConfig +from ...onnx import OnnxConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class DistilBertConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`DistilBertModel`] or a [`TFDistilBertModel`]. It + is used to instantiate a DistilBERT model according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar configuration to that of the DistilBERT + [distilbert-base-uncased](https://huggingface.co/distilbert-base-uncased) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the DistilBERT model. Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling [`DistilBertModel`] or [`TFDistilBertModel`]. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + sinusoidal_pos_embds (`boolean`, *optional*, defaults to `False`): + Whether to use sinusoidal positional embeddings. + n_layers (`int`, *optional*, defaults to 6): + Number of hidden layers in the Transformer encoder. + n_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. 
+ dim (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + hidden_dim (`int`, *optional*, defaults to 3072): + The size of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + activation (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + qa_dropout (`float`, *optional*, defaults to 0.1): + The dropout probabilities used in the question answering model [`DistilBertForQuestionAnswering`]. + seq_classif_dropout (`float`, *optional*, defaults to 0.2): + The dropout probabilities used in the sequence classification and the multiple choice model + [`DistilBertForSequenceClassification`]. + + Examples: + + ```python + >>> from transformers import DistilBertConfig, DistilBertModel + + >>> # Initializing a DistilBERT configuration + >>> configuration = DistilBertConfig() + + >>> # Initializing a model (with random weights) from the configuration + >>> model = DistilBertModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "distilbert" + attribute_map = { + "hidden_size": "dim", + "num_attention_heads": "n_heads", + "num_hidden_layers": "n_layers", + } + + def __init__( + self, + vocab_size=30522, + max_position_embeddings=512, + sinusoidal_pos_embds=False, + n_layers=6, + n_heads=12, + dim=768, + hidden_dim=4 * 768, + dropout=0.1, + attention_dropout=0.1, + activation="gelu", + initializer_range=0.02, + qa_dropout=0.1, + seq_classif_dropout=0.2, + pad_token_id=0, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.sinusoidal_pos_embds = sinusoidal_pos_embds + self.n_layers = n_layers + self.n_heads = n_heads + self.dim = dim + self.hidden_dim = hidden_dim + self.dropout = dropout + self.attention_dropout = attention_dropout + self.activation = activation + self.initializer_range = initializer_range + self.qa_dropout = qa_dropout + self.seq_classif_dropout = seq_classif_dropout + super().__init__(**kwargs, pad_token_id=pad_token_id) + + +class DistilBertOnnxConfig(OnnxConfig): + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + if self.task == "multiple-choice": + dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"} + else: + dynamic_axis = {0: "batch", 1: "sequence"} + return OrderedDict( + [ + ("input_ids", dynamic_axis), + ("attention_mask", dynamic_axis), + ] + ) + + +__all__ = ["DistilBertConfig", "DistilBertOnnxConfig"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/distilbert/modeling_distilbert.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/distilbert/modeling_distilbert.py new file mode 100644 index 0000000000000000000000000000000000000000..bc116b231af102fead8ed826e42c30c900c59524 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/distilbert/modeling_distilbert.py @@ -0,0 +1,1287 @@ +# coding=utf-8 +# 
Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +PyTorch DistilBERT model adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM) and in +part from HuggingFace PyTorch version of Google AI Bert model (https://github.com/google-research/bert) +""" + +import math +from typing import Optional, Union + +import numpy as np +import torch +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ...activations import get_activation +from ...configuration_utils import PretrainedConfig +from ...integrations.deepspeed import is_deepspeed_zero3_enabled +from ...modeling_attn_mask_utils import _prepare_4d_attention_mask_for_sdpa +from ...modeling_flash_attention_utils import flash_attn_supports_top_left_mask, is_flash_attn_available +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import ( + BaseModelOutput, + MaskedLMOutput, + MultipleChoiceModelOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import ( + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + prune_linear_layer, +) +from ...utils import ( + auto_docstring, + logging, +) +from .configuration_distilbert import DistilBertConfig + + +if is_flash_attn_available(): + from ...modeling_flash_attention_utils import _flash_attention_forward + + +logger = logging.get_logger(__name__) + + +# UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE # + + +def create_sinusoidal_embeddings(n_pos: int, dim: int, out: torch.Tensor): + if is_deepspeed_zero3_enabled(): + import deepspeed + + with deepspeed.zero.GatheredParameters(out, modifier_rank=0): + if torch.distributed.get_rank() == 0: + _create_sinusoidal_embeddings(n_pos=n_pos, dim=dim, out=out) + else: + _create_sinusoidal_embeddings(n_pos=n_pos, dim=dim, out=out) + + +def _create_sinusoidal_embeddings(n_pos: int, dim: int, out: torch.Tensor): + position_enc = np.array([[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)]) + out.requires_grad = False + out[:, 0::2] = torch.FloatTensor(np.sin(position_enc[:, 0::2])) + out[:, 1::2] = torch.FloatTensor(np.cos(position_enc[:, 1::2])) + out.detach_() + + +class Embeddings(nn.Module): + def __init__(self, config: PretrainedConfig): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.dim, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.dim) + + self.LayerNorm = nn.LayerNorm(config.dim, eps=1e-12) + self.dropout = nn.Dropout(config.dropout) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) + + def forward(self, input_ids: torch.Tensor, input_embeds: Optional[torch.Tensor] = None) -> torch.Tensor: + """ + Parameters: + input_ids 
(torch.Tensor): + torch.tensor(bs, max_seq_length) The token ids to embed. + input_embeds (*optional*, torch.Tensor): + The pre-computed word embeddings. Can only be passed if the input ids are `None`. + + + Returns: torch.tensor(bs, max_seq_length, dim) The embedded tokens (plus position embeddings, no token_type + embeddings) + """ + if input_ids is not None: + input_embeds = self.word_embeddings(input_ids) # (bs, max_seq_length, dim) + + seq_length = input_embeds.size(1) + + # Setting the position-ids to the registered buffer in constructor, it helps + # when tracing the model without passing position-ids, solves + # issues similar to issue #5664 + if hasattr(self, "position_ids"): + position_ids = self.position_ids[:, :seq_length] + else: + position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device) # (max_seq_length) + position_ids = position_ids.unsqueeze(0).expand_as(input_ids) # (bs, max_seq_length) + + position_embeddings = self.position_embeddings(position_ids) # (bs, max_seq_length, dim) + + embeddings = input_embeds + position_embeddings # (bs, max_seq_length, dim) + embeddings = self.LayerNorm(embeddings) # (bs, max_seq_length, dim) + embeddings = self.dropout(embeddings) # (bs, max_seq_length, dim) + return embeddings + + +class MultiHeadSelfAttention(nn.Module): + def __init__(self, config: PretrainedConfig): + super().__init__() + self.config = config + + self.n_heads = config.n_heads + self.dim = config.dim + self.dropout = nn.Dropout(p=config.attention_dropout) + self.is_causal = False + + # Have an even number of multi heads that divide the dimensions + if self.dim % self.n_heads != 0: + # Raise value errors for even multi-head attention nodes + raise ValueError(f"self.n_heads: {self.n_heads} must divide self.dim: {self.dim} evenly") + + self.q_lin = nn.Linear(in_features=config.dim, out_features=config.dim) + self.k_lin = nn.Linear(in_features=config.dim, out_features=config.dim) + self.v_lin = nn.Linear(in_features=config.dim, out_features=config.dim) + self.out_lin = nn.Linear(in_features=config.dim, out_features=config.dim) + + self.pruned_heads: set[int] = set() + self.attention_head_size = self.dim // self.n_heads + + def prune_heads(self, heads: list[int]): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.n_heads, self.attention_head_size, self.pruned_heads + ) + # Prune linear layers + self.q_lin = prune_linear_layer(self.q_lin, index) + self.k_lin = prune_linear_layer(self.k_lin, index) + self.v_lin = prune_linear_layer(self.v_lin, index) + self.out_lin = prune_linear_layer(self.out_lin, index, dim=1) + # Update hyper params + self.n_heads = self.n_heads - len(heads) + self.dim = self.attention_head_size * self.n_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + mask: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> tuple[torch.Tensor, ...]: + """ + Parameters: + query: torch.tensor(bs, seq_length, dim) + key: torch.tensor(bs, seq_length, dim) + value: torch.tensor(bs, seq_length, dim) + mask: torch.tensor(bs, seq_length) + + Returns: + weights: torch.tensor(bs, n_heads, seq_length, seq_length) Attention weights context: torch.tensor(bs, + seq_length, dim) Contextualized layer. 
Optional: only if `output_attentions=True` + """ + bs, q_length, dim = query.size() + k_length = key.size(1) + # assert dim == self.dim, f'Dimensions do not match: {dim} input vs {self.dim} configured' + # assert key.size() == value.size() + + dim_per_head = self.dim // self.n_heads + + mask_reshp = (bs, 1, 1, k_length) + + def shape(x: torch.Tensor) -> torch.Tensor: + """separate heads""" + return x.view(bs, -1, self.n_heads, dim_per_head).transpose(1, 2) + + def unshape(x: torch.Tensor) -> torch.Tensor: + """group heads""" + return x.transpose(1, 2).contiguous().view(bs, -1, self.n_heads * dim_per_head) + + q = shape(self.q_lin(query)) # (bs, n_heads, q_length, dim_per_head) + k = shape(self.k_lin(key)) # (bs, n_heads, k_length, dim_per_head) + v = shape(self.v_lin(value)) # (bs, n_heads, k_length, dim_per_head) + + q = q / math.sqrt(dim_per_head) # (bs, n_heads, q_length, dim_per_head) + scores = torch.matmul(q, k.transpose(2, 3)) # (bs, n_heads, q_length, k_length) + mask = (mask == 0).view(mask_reshp).expand_as(scores) # (bs, n_heads, q_length, k_length) + scores = scores.masked_fill( + mask, torch.tensor(torch.finfo(scores.dtype).min) + ) # (bs, n_heads, q_length, k_length) + + weights = nn.functional.softmax(scores, dim=-1) # (bs, n_heads, q_length, k_length) + weights = self.dropout(weights) # (bs, n_heads, q_length, k_length) + + # Mask heads if we want to + if head_mask is not None: + weights = weights * head_mask + + context = torch.matmul(weights, v) # (bs, n_heads, q_length, dim_per_head) + context = unshape(context) # (bs, q_length, dim) + context = self.out_lin(context) # (bs, q_length, dim) + + if output_attentions: + return (context, weights) + else: + return (context,) + + +class DistilBertFlashAttention2(MultiHeadSelfAttention): + """ + DistilBert flash attention module. This module inherits from `MultiHeadSelfAttention` as the weights of the module + stays untouched. The only required change would be on the forward pass where it needs to correctly call the public + API of flash attention and deal with padding tokens in case the input contains any of them. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). + self._flash_attn_uses_top_left_mask = flash_attn_supports_top_left_mask() + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + mask: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> tuple[torch.Tensor, ...]: + """ + Parameters: + query: torch.tensor(bs, seq_length, dim) + key: torch.tensor(bs, seq_length, dim) + value: torch.tensor(bs, seq_length, dim) + mask: torch.tensor(bs, seq_length) + + Returns: + weights: torch.tensor(bs, n_heads, seq_length, seq_length) Attention weights context: torch.tensor(bs, + seq_length, dim) Contextualized layer. 
Optional: only if `output_attentions=True` + """ + batch_size, q_length, dim = query.size() + + dim_per_head = self.dim // self.n_heads + + def reshape(x: torch.Tensor) -> torch.Tensor: + """separate heads""" + return x.view(batch_size, -1, self.n_heads, dim_per_head) + + # Flash attention requires the input to have the shape + # batch_size x seq_length x head_dim x hidden_dim + query_states = reshape(self.q_lin(query)) + key_states = reshape(self.k_lin(key)) + value_states = reshape(self.v_lin(value)) + + attn_dropout = self.config.attention_dropout if self.training else 0.0 + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in the correct dtype just to be sure everything works as expected. + # This might slowdown training & inference so it is recommended to not cast the LayerNorms + # in fp32. (LlamaRMSNorm handles it correctly) + + device_type = query_states.device.type if query_states.device.type != "mps" else "cpu" + if query_states.dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = ( + torch.get_autocast_dtype(device_type) + if hasattr(torch, "get_autocast_dtype") + else torch.get_autocast_gpu_dtype() + ) + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_lin.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." + ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + attn_weights = _flash_attention_forward( + query_states, + key_states, + value_states, + mask, + q_length, + dropout=attn_dropout, + use_top_left_mask=self._flash_attn_uses_top_left_mask, + is_causal=self.is_causal, + ) + + attn_weights_reshaped = attn_weights.reshape(batch_size, q_length, self.n_heads * dim_per_head) + attn_output = self.out_lin(attn_weights_reshaped) + + if output_attentions: + return (attn_output, attn_weights) + else: + return (attn_output,) + + +class DistilBertSdpaAttention(MultiHeadSelfAttention): + def __init__(self, config: PretrainedConfig): + super().__init__(config=config) + self.dropout_prob = config.attention_dropout + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + mask: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> tuple[torch.Tensor, ...]: + """ + Parameters: + query: torch.tensor(bs, seq_length, dim) + key: torch.tensor(bs, seq_length, dim) + value: torch.tensor(bs, seq_length, dim) + mask: torch.tensor(bs, seq_length) + + Returns: + weights: torch.tensor(bs, n_heads, seq_length, seq_length) Attention weights context: torch.tensor(bs, + seq_length, dim) Contextualized layer. Optional: only if `output_attentions=True` + """ + if output_attentions or head_mask is not None: + logger.warning_once( + "DistilBertSdpaAttention is used but `torch.nn.functional.scaled_dot_product_attention` does not support" + " `output_attentions=True` or `head_mask`. Falling back to the manual attention implementation, but specifying" + " the manual implementation will be required from Transformers version v5.0.0 onwards. 
This warning can be" + ' removed using the argument `attn_implementation="eager"` when loading the model.' + ) + return super().forward( + query, + key, + value, + mask, + head_mask, + output_attentions, + ) + + batch_size, _, _ = query.size() + dim_per_head = self.dim // self.n_heads + + def shape(x: torch.Tensor) -> torch.Tensor: + """separate heads""" + return x.view(batch_size, -1, self.n_heads, dim_per_head).transpose(1, 2) + + def unshape(x: torch.Tensor) -> torch.Tensor: + """group heads""" + return x.transpose(1, 2).contiguous().view(batch_size, -1, self.n_heads * dim_per_head) + + q = shape(self.q_lin(query)) # (bs, n_heads, q_length, dim_per_head) + k = shape(self.k_lin(key)) # (bs, n_heads, k_length, dim_per_head) + v = shape(self.v_lin(value)) # (bs, n_heads, k_length, dim_per_head) + + attn_output = torch.nn.functional.scaled_dot_product_attention( + q, + k, + v, + attn_mask=mask, + dropout_p=self.dropout_prob if self.training else 0.0, + is_causal=False, + ) + + attn_output = unshape(attn_output) + attn_output = self.out_lin(attn_output) + + return (attn_output,) + + +class FFN(nn.Module): + def __init__(self, config: PretrainedConfig): + super().__init__() + self.dropout = nn.Dropout(p=config.dropout) + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.lin1 = nn.Linear(in_features=config.dim, out_features=config.hidden_dim) + self.lin2 = nn.Linear(in_features=config.hidden_dim, out_features=config.dim) + self.activation = get_activation(config.activation) + + def forward(self, input: torch.Tensor) -> torch.Tensor: + return apply_chunking_to_forward(self.ff_chunk, self.chunk_size_feed_forward, self.seq_len_dim, input) + + def ff_chunk(self, input: torch.Tensor) -> torch.Tensor: + x = self.lin1(input) + x = self.activation(x) + x = self.lin2(x) + x = self.dropout(x) + return x + + +DISTILBERT_ATTENTION_CLASSES = { + "eager": MultiHeadSelfAttention, + "flash_attention_2": DistilBertFlashAttention2, + "sdpa": DistilBertSdpaAttention, +} + + +class TransformerBlock(GradientCheckpointingLayer): + def __init__(self, config: PretrainedConfig): + super().__init__() + + # Have an even number of Configure multi-heads + if config.dim % config.n_heads != 0: + raise ValueError(f"config.n_heads {config.n_heads} must divide config.dim {config.dim} evenly") + + self.attention = DISTILBERT_ATTENTION_CLASSES[config._attn_implementation](config) + self.sa_layer_norm = nn.LayerNorm(normalized_shape=config.dim, eps=1e-12) + + self.ffn = FFN(config) + self.output_layer_norm = nn.LayerNorm(normalized_shape=config.dim, eps=1e-12) + + def forward( + self, + x: torch.Tensor, + attn_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> tuple[torch.Tensor, ...]: + """ + Parameters: + x: torch.tensor(bs, seq_length, dim) + attn_mask: torch.tensor(bs, seq_length) + + Returns: + sa_weights: torch.tensor(bs, n_heads, seq_length, seq_length) The attention weights ffn_output: + torch.tensor(bs, seq_length, dim) The output of the transformer block contextualization. 
+ """ + # Self-Attention + sa_output = self.attention( + query=x, + key=x, + value=x, + mask=attn_mask, + head_mask=head_mask, + output_attentions=output_attentions, + ) + if output_attentions: + sa_output, sa_weights = sa_output # (bs, seq_length, dim), (bs, n_heads, seq_length, seq_length) + else: # To handle these `output_attentions` or `output_hidden_states` cases returning tuples + if type(sa_output) is not tuple: + raise TypeError(f"sa_output must be a tuple but it is {type(sa_output)} type") + + sa_output = sa_output[0] + sa_output = self.sa_layer_norm(sa_output + x) # (bs, seq_length, dim) + + # Feed Forward Network + ffn_output = self.ffn(sa_output) # (bs, seq_length, dim) + ffn_output: torch.Tensor = self.output_layer_norm(ffn_output + sa_output) # (bs, seq_length, dim) + + output = (ffn_output,) + if output_attentions: + output = (sa_weights,) + output + return output + + +class Transformer(nn.Module): + def __init__(self, config: PretrainedConfig): + super().__init__() + self.n_layers = config.n_layers + self.layer = nn.ModuleList([TransformerBlock(config) for _ in range(config.n_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + x: torch.Tensor, + attn_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: Optional[bool] = None, + ) -> Union[BaseModelOutput, tuple[torch.Tensor, ...]]: # docstyle-ignore + """ + Parameters: + x: torch.tensor(bs, seq_length, dim) Input sequence embedded. + attn_mask: torch.tensor(bs, seq_length) Attention mask on the sequence. + + Returns: + hidden_state: torch.tensor(bs, seq_length, dim) Sequence of hidden states in the last (top) + layer all_hidden_states: tuple[torch.tensor(bs, seq_length, dim)] + Tuple of length n_layers with the hidden states from each layer. 
+ Optional: only if output_hidden_states=True + all_attentions: tuple[torch.tensor(bs, n_heads, seq_length, seq_length)] + Tuple of length n_layers with the attention weights from each layer + Optional: only if output_attentions=True + """ + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + hidden_state = x + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_state,) + + layer_outputs = layer_module( + hidden_state, + attn_mask, + head_mask[i], + output_attentions, + ) + + hidden_state = layer_outputs[-1] + + if output_attentions: + if len(layer_outputs) != 2: + raise ValueError(f"The length of the layer_outputs should be 2, but it is {len(layer_outputs)}") + + attentions = layer_outputs[0] + all_attentions = all_attentions + (attentions,) + else: + if len(layer_outputs) != 1: + raise ValueError(f"The length of the layer_outputs should be 1, but it is {len(layer_outputs)}") + + # Add last layer + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_state,) + + if not return_dict: + return tuple(v for v in [hidden_state, all_hidden_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_state, hidden_states=all_hidden_states, attentions=all_attentions + ) + + +# INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL # +@auto_docstring +class DistilBertPreTrainedModel(PreTrainedModel): + config: DistilBertConfig + load_tf_weights = None + base_model_prefix = "distilbert" + supports_gradient_checkpointing = True + _supports_flash_attn = True + _supports_sdpa = True + + def _init_weights(self, module: nn.Module): + """Initialize the weights.""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, Embeddings) and self.config.sinusoidal_pos_embds: + create_sinusoidal_embeddings( + self.config.max_position_embeddings, self.config.dim, module.position_embeddings.weight + ) + + +@auto_docstring +class DistilBertModel(DistilBertPreTrainedModel): + def __init__(self, config: PretrainedConfig): + super().__init__(config) + + self.embeddings = Embeddings(config) # Embeddings + self.transformer = Transformer(config) # Encoder + + # Initialize weights and apply final processing + self.post_init() + + def get_position_embeddings(self) -> nn.Embedding: + """ + Returns the position embeddings + """ + return self.embeddings.position_embeddings + + def resize_position_embeddings(self, new_num_position_embeddings: int): + """ + Resizes position embeddings of the model if `new_num_position_embeddings != config.max_position_embeddings`. + + Arguments: + new_num_position_embeddings (`int`): + The number of new position embedding matrix. If position embeddings are learned, increasing the size + will add newly initialized vectors at the end, whereas reducing the size will remove vectors from the + end. 
If position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the + size will add correct vectors at the end following the position encoding algorithm, whereas reducing + the size will remove vectors from the end. + """ + num_position_embeds_diff = new_num_position_embeddings - self.config.max_position_embeddings + + # no resizing needs to be done if the length stays the same + if num_position_embeds_diff == 0: + return + + logger.info(f"Setting `config.max_position_embeddings={new_num_position_embeddings}`...") + self.config.max_position_embeddings = new_num_position_embeddings + + old_position_embeddings_weight = self.embeddings.position_embeddings.weight.clone() + + self.embeddings.position_embeddings = nn.Embedding(self.config.max_position_embeddings, self.config.dim) + + if self.config.sinusoidal_pos_embds: + create_sinusoidal_embeddings( + n_pos=self.config.max_position_embeddings, dim=self.config.dim, out=self.position_embeddings.weight + ) + else: + with torch.no_grad(): + if num_position_embeds_diff > 0: + self.embeddings.position_embeddings.weight[:-num_position_embeds_diff] = nn.Parameter( + old_position_embeddings_weight + ) + else: + self.embeddings.position_embeddings.weight = nn.Parameter( + old_position_embeddings_weight[:num_position_embeds_diff] + ) + # move position_embeddings to correct device + self.embeddings.position_embeddings.to(self.device) + + def get_input_embeddings(self) -> nn.Embedding: + return self.embeddings.word_embeddings + + def set_input_embeddings(self, new_embeddings: nn.Embedding): + self.embeddings.word_embeddings = new_embeddings + + def _prune_heads(self, heads_to_prune: dict[int, list[list[int]]]): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.transformer.layer[layer].attention.prune_heads(heads) + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[BaseModelOutput, tuple[torch.Tensor, ...]]: + r""" + input_ids (`torch.LongTensor` of shape `(batch_size, num_choices)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. 
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + device = input_ids.device if input_ids is not None else inputs_embeds.device + + head_mask_is_none = head_mask is None + # Prepare head mask if needed + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embeddings = self.embeddings(input_ids, inputs_embeds) # (bs, seq_length, dim) + + if self.config._attn_implementation == "flash_attention_2": + attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None + else: + if attention_mask is None: + attention_mask = torch.ones(input_shape, device=device) # (bs, seq_length) + + if self.config._attn_implementation == "sdpa" and head_mask_is_none and not output_attentions: + attention_mask = _prepare_4d_attention_mask_for_sdpa( + attention_mask, embeddings.dtype, tgt_len=input_shape[1] + ) + + return self.transformer( + x=embeddings, + attn_mask=attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + +@auto_docstring( + custom_intro=""" + DistilBert Model with a `masked language modeling` head on top. + """ +) +class DistilBertForMaskedLM(DistilBertPreTrainedModel): + _tied_weights_keys = ["vocab_projector.weight"] + + def __init__(self, config: PretrainedConfig): + super().__init__(config) + + self.activation = get_activation(config.activation) + + self.distilbert = DistilBertModel(config) + self.vocab_transform = nn.Linear(config.dim, config.dim) + self.vocab_layer_norm = nn.LayerNorm(config.dim, eps=1e-12) + self.vocab_projector = nn.Linear(config.dim, config.vocab_size) + + # Initialize weights and apply final processing + self.post_init() + + self.mlm_loss_fct = nn.CrossEntropyLoss() + + def get_position_embeddings(self) -> nn.Embedding: + """ + Returns the position embeddings + """ + return self.distilbert.get_position_embeddings() + + def resize_position_embeddings(self, new_num_position_embeddings: int): + """ + Resizes position embeddings of the model if `new_num_position_embeddings != config.max_position_embeddings`. + + Arguments: + new_num_position_embeddings (`int`): + The number of new position embedding matrix. If position embeddings are learned, increasing the size + will add newly initialized vectors at the end, whereas reducing the size will remove vectors from the + end. If position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the + size will add correct vectors at the end following the position encoding algorithm, whereas reducing + the size will remove vectors from the end. 
+ """ + self.distilbert.resize_position_embeddings(new_num_position_embeddings) + + def get_output_embeddings(self) -> nn.Module: + return self.vocab_projector + + def set_output_embeddings(self, new_embeddings: nn.Module): + self.vocab_projector = new_embeddings + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[MaskedLMOutput, tuple[torch.Tensor, ...]]: + r""" + input_ids (`torch.LongTensor` of shape `(batch_size, num_choices)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + dlbrt_output = self.distilbert( + input_ids=input_ids, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = dlbrt_output[0] # (bs, seq_length, dim) + prediction_logits = self.vocab_transform(hidden_states) # (bs, seq_length, dim) + prediction_logits = self.activation(prediction_logits) # (bs, seq_length, dim) + prediction_logits = self.vocab_layer_norm(prediction_logits) # (bs, seq_length, dim) + prediction_logits = self.vocab_projector(prediction_logits) # (bs, seq_length, vocab_size) + + mlm_loss = None + if labels is not None: + mlm_loss = self.mlm_loss_fct(prediction_logits.view(-1, prediction_logits.size(-1)), labels.view(-1)) + + if not return_dict: + output = (prediction_logits,) + dlbrt_output[1:] + return ((mlm_loss,) + output) if mlm_loss is not None else output + + return MaskedLMOutput( + loss=mlm_loss, + logits=prediction_logits, + hidden_states=dlbrt_output.hidden_states, + attentions=dlbrt_output.attentions, + ) + + +@auto_docstring( + custom_intro=""" + DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the + pooled output) e.g. for GLUE tasks. 
+ """ +) +class DistilBertForSequenceClassification(DistilBertPreTrainedModel): + def __init__(self, config: PretrainedConfig): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + + self.distilbert = DistilBertModel(config) + self.pre_classifier = nn.Linear(config.dim, config.dim) + self.classifier = nn.Linear(config.dim, config.num_labels) + self.dropout = nn.Dropout(config.seq_classif_dropout) + + # Initialize weights and apply final processing + self.post_init() + + def get_position_embeddings(self) -> nn.Embedding: + """ + Returns the position embeddings + """ + return self.distilbert.get_position_embeddings() + + def resize_position_embeddings(self, new_num_position_embeddings: int): + """ + Resizes position embeddings of the model if `new_num_position_embeddings != config.max_position_embeddings`. + + Arguments: + new_num_position_embeddings (`int`): + The number of new position embedding matrix. If position embeddings are learned, increasing the size + will add newly initialized vectors at the end, whereas reducing the size will remove vectors from the + end. If position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the + size will add correct vectors at the end following the position encoding algorithm, whereas reducing + the size will remove vectors from the end. + """ + self.distilbert.resize_position_embeddings(new_num_position_embeddings) + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[SequenceClassifierOutput, tuple[torch.Tensor, ...]]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + distilbert_output = self.distilbert( + input_ids=input_ids, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_state = distilbert_output[0] # (bs, seq_len, dim) + pooled_output = hidden_state[:, 0] # (bs, dim) + pooled_output = self.pre_classifier(pooled_output) # (bs, dim) + pooled_output = nn.ReLU()(pooled_output) # (bs, dim) + pooled_output = self.dropout(pooled_output) # (bs, dim) + logits = self.classifier(pooled_output) # (bs, num_labels) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + distilbert_output[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=distilbert_output.hidden_states, + attentions=distilbert_output.attentions, + ) + + +@auto_docstring +class DistilBertForQuestionAnswering(DistilBertPreTrainedModel): + def __init__(self, config: PretrainedConfig): + super().__init__(config) + + self.distilbert = DistilBertModel(config) + self.qa_outputs = nn.Linear(config.dim, config.num_labels) + if config.num_labels != 2: + raise ValueError(f"config.num_labels should be 2, but it is {config.num_labels}") + + self.dropout = nn.Dropout(config.qa_dropout) + + # Initialize weights and apply final processing + self.post_init() + + def get_position_embeddings(self) -> nn.Embedding: + """ + Returns the position embeddings + """ + return self.distilbert.get_position_embeddings() + + def resize_position_embeddings(self, new_num_position_embeddings: int): + """ + Resizes position embeddings of the model if `new_num_position_embeddings != config.max_position_embeddings`. + + Arguments: + new_num_position_embeddings (`int`): + The number of new position embedding matrix. If position embeddings are learned, increasing the size + will add newly initialized vectors at the end, whereas reducing the size will remove vectors from the + end. If position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the + size will add correct vectors at the end following the position encoding algorithm, whereas reducing + the size will remove vectors from the end. 
+ """ + self.distilbert.resize_position_embeddings(new_num_position_embeddings) + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + start_positions: Optional[torch.Tensor] = None, + end_positions: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[QuestionAnsweringModelOutput, tuple[torch.Tensor, ...]]: + r""" + input_ids (`torch.LongTensor` of shape `(batch_size, num_choices)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + distilbert_output = self.distilbert( + input_ids=input_ids, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = distilbert_output[0] # (bs, max_query_len, dim) + + hidden_states = self.dropout(hidden_states) # (bs, max_query_len, dim) + logits = self.qa_outputs(hidden_states) # (bs, max_query_len, 2) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() # (bs, max_query_len) + end_logits = end_logits.squeeze(-1).contiguous() # (bs, max_query_len) + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + distilbert_output[1:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=distilbert_output.hidden_states, + attentions=distilbert_output.attentions, + ) + + +@auto_docstring +class DistilBertForTokenClassification(DistilBertPreTrainedModel): + def __init__(self, config: PretrainedConfig): + super().__init__(config) + self.num_labels = config.num_labels + + self.distilbert = DistilBertModel(config) + self.dropout = nn.Dropout(config.dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # 
Initialize weights and apply final processing + self.post_init() + + def get_position_embeddings(self) -> nn.Embedding: + """ + Returns the position embeddings + """ + return self.distilbert.get_position_embeddings() + + def resize_position_embeddings(self, new_num_position_embeddings: int): + """ + Resizes position embeddings of the model if `new_num_position_embeddings != config.max_position_embeddings`. + + Arguments: + new_num_position_embeddings (`int`): + The number of new position embedding matrix. If position embeddings are learned, increasing the size + will add newly initialized vectors at the end, whereas reducing the size will remove vectors from the + end. If position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the + size will add correct vectors at the end following the position encoding algorithm, whereas reducing + the size will remove vectors from the end. + """ + self.distilbert.resize_position_embeddings(new_num_position_embeddings) + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[TokenClassifierOutput, tuple[torch.Tensor, ...]]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.distilbert( + input_ids, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@auto_docstring +class DistilBertForMultipleChoice(DistilBertPreTrainedModel): + def __init__(self, config: PretrainedConfig): + super().__init__(config) + + self.distilbert = DistilBertModel(config) + self.pre_classifier = nn.Linear(config.dim, config.dim) + self.classifier = nn.Linear(config.dim, 1) + self.dropout = nn.Dropout(config.seq_classif_dropout) + + # Initialize weights and apply final processing + self.post_init() + + def get_position_embeddings(self) -> nn.Embedding: + """ + Returns the position embeddings + """ + return self.distilbert.get_position_embeddings() + + def resize_position_embeddings(self, new_num_position_embeddings: int): + """ + Resizes position embeddings of the model if `new_num_position_embeddings != config.max_position_embeddings`. + + Arguments: + new_num_position_embeddings (`int`) + The number of new position embeddings. 
If position embeddings are learned, increasing the size will add + newly initialized vectors at the end, whereas reducing the size will remove vectors from the end. If + position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the size will + add correct vectors at the end following the position encoding algorithm, whereas reducing the size + will remove vectors from the end. + """ + self.distilbert.resize_position_embeddings(new_num_position_embeddings) + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[MultipleChoiceModelOutput, tuple[torch.Tensor, ...]]: + r""" + input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., + num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See + `input_ids` above) + + Examples: + + ```python + >>> from transformers import AutoTokenizer, DistilBertForMultipleChoice + >>> import torch + + >>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased") + >>> model = DistilBertForMultipleChoice.from_pretrained("distilbert-base-cased") + + >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." + >>> choice0 = "It is eaten with a fork and a knife." + >>> choice1 = "It is eaten while held in the hand." 
+ >>> labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1 + + >>> encoding = tokenizer([[prompt, choice0], [prompt, choice1]], return_tensors="pt", padding=True) + >>> outputs = model(**{k: v.unsqueeze(0) for k, v in encoding.items()}, labels=labels) # batch size is 1 + + >>> # the linear classifier still needs to be trained + >>> loss = outputs.loss + >>> logits = outputs.logits + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] + + input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None + attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) + if inputs_embeds is not None + else None + ) + + outputs = self.distilbert( + input_ids, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_state = outputs[0] # (bs * num_choices, seq_len, dim) + pooled_output = hidden_state[:, 0] # (bs * num_choices, dim) + pooled_output = self.pre_classifier(pooled_output) # (bs * num_choices, dim) + pooled_output = nn.ReLU()(pooled_output) # (bs * num_choices, dim) + pooled_output = self.dropout(pooled_output) # (bs * num_choices, dim) + logits = self.classifier(pooled_output) # (bs * num_choices, 1) + + reshaped_logits = logits.view(-1, num_choices) # (bs, num_choices) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + + if not return_dict: + output = (reshaped_logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +__all__ = [ + "DistilBertForMaskedLM", + "DistilBertForMultipleChoice", + "DistilBertForQuestionAnswering", + "DistilBertForSequenceClassification", + "DistilBertForTokenClassification", + "DistilBertModel", + "DistilBertPreTrainedModel", +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/distilbert/modeling_flax_distilbert.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/distilbert/modeling_flax_distilbert.py new file mode 100644 index 0000000000000000000000000000000000000000..fba3dfd9d332d5f34dfea23a20cec7a7871de9e4 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/distilbert/modeling_flax_distilbert.py @@ -0,0 +1,906 @@ +# coding=utf-8 +# Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
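+#
+# Flax/JAX implementation of DistilBERT: embeddings, multi-head self-attention,
+# feed-forward blocks, the transformer encoder, and the task-specific heads.
+#
+# A minimal usage sketch, assuming the `distilbert-base-uncased` checkpoint
+# (the documentation checkpoint referenced below) is reachable and JAX/Flax are
+# installed:
+#
+#     from transformers import AutoTokenizer, FlaxDistilBertModel
+#
+#     tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
+#     model = FlaxDistilBertModel.from_pretrained("distilbert-base-uncased")
+#     inputs = tokenizer("Hello, my dog is cute", return_tensors="np")
+#     outputs = model(**inputs)                      # deterministic forward pass
+#     last_hidden_state = outputs.last_hidden_state  # (batch, seq_len, dim)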
+ +import math +from typing import Callable, Optional + +import flax.linen as nn +import jax +import jax.numpy as jnp +import numpy as np +from flax.core.frozen_dict import FrozenDict, freeze, unfreeze +from flax.traverse_util import flatten_dict, unflatten_dict +from jax import lax + +from ...modeling_flax_outputs import ( + FlaxBaseModelOutput, + FlaxMaskedLMOutput, + FlaxMultipleChoiceModelOutput, + FlaxQuestionAnsweringModelOutput, + FlaxSequenceClassifierOutput, + FlaxTokenClassifierOutput, +) +from ...modeling_flax_utils import ACT2FN, FlaxPreTrainedModel, append_call_sample_docstring, overwrite_call_docstring +from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging +from .configuration_distilbert import DistilBertConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "distilbert-base-uncased" +_CONFIG_FOR_DOC = "DistilBertConfig" + + +FLAX_DISTILBERT_START_DOCSTRING = r""" + + This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading, saving and converting weights from PyTorch models) + + This model is also a + [flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html) subclass. Use it as + a regular Flax linen Module and refer to the Flax documentation for all matter related to general usage and + behavior. + + Finally, this model supports inherent JAX features such as: + + - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit) + - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation) + - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap) + - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap) + + Parameters: + config ([`DistilBertConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +DISTILBERT_INPUTS_DOCSTRING = r""" + Args: + input_ids (`numpy.ndarray` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`numpy.ndarray` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
+""" + + +def get_angles(pos, i, d_model): + angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model)) + return pos * angle_rates + + +def positional_encoding(position, d_model): + # create the sinusoidal pattern for the positional encoding + angle_rads = get_angles(np.arange(position)[:, np.newaxis], np.arange(d_model)[np.newaxis, :], d_model) + + # apply sin to even indices in the array; 2i + angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2]) + + # apply cos to odd indices in the array; 2i+1 + angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2]) + + pos_encoding = angle_rads[np.newaxis, ...] + + return jnp.array(pos_encoding) + + +class FlaxEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + config: DistilBertConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.word_embeddings = nn.Embed( + self.config.vocab_size, + self.config.dim, + embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range), + ) + if not self.config.sinusoidal_pos_embds: + self.position_embeddings = nn.Embed( + self.config.max_position_embeddings, + self.config.dim, + embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range), + ) + else: + self.pos_encoding = positional_encoding(self.config.max_position_embeddings, self.config.dim) + self.LayerNorm = nn.LayerNorm(epsilon=1e-12, dtype=self.dtype) + self.dropout = nn.Dropout(rate=self.config.dropout) + + def __call__(self, input_ids, deterministic: bool = True): + # Embed + batch_size, seq_length = input_ids.shape + inputs_embeds = self.word_embeddings(input_ids.astype("i4")) + if not self.config.sinusoidal_pos_embds: + position_ids = jnp.arange(seq_length).astype("i4") + position_ids = jnp.broadcast_to(position_ids, shape=(batch_size, seq_length)) + position_embeds = self.position_embeddings(position_ids.astype("i4")) + else: + position_embeds = self.pos_encoding[:, :seq_length, :] + # explicitly cast the positions here, since self.embed_positions are not registered as parameters + position_embeds = position_embeds.astype(inputs_embeds.dtype) + + # Sum all embeddings + hidden_states = inputs_embeds + position_embeds + + # Layer Norm + hidden_states = self.LayerNorm(hidden_states) + hidden_states = self.dropout(hidden_states, deterministic=deterministic) + return hidden_states + + +class FlaxMultiHeadSelfAttention(nn.Module): + config: DistilBertConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.n_heads = self.config.n_heads + self.dim = self.config.dim + self.dropout = nn.Dropout(rate=self.config.attention_dropout) + + if not (self.dim % self.n_heads == 0): + raise ValueError(f"Hidden size {self.dim} not dividable by number of heads {self.n_heads}") + + self.q_lin = nn.Dense( + self.dim, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range), + ) + self.k_lin = nn.Dense( + self.dim, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range), + ) + self.v_lin = nn.Dense( + self.dim, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range), + ) + self.out_lin = nn.Dense( + self.dim, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range), + ) + + def __call__( + self, + query, + key, + value, + mask, + deterministic: bool = True, + output_attentions: bool = False, + ): + bs, q_len, dim = query.shape + k_len 
= key.shape[1] + # assert dim == self.dim, f'Dimensions do not match: {dim} input vs {self.dim} configured' + # assert key.size() == value.size() + + dim_per_head = self.dim // self.n_heads + + mask_reshp = (bs, 1, 1, k_len) + + def shape(x): + """separate heads""" + return x.reshape(bs, -1, self.n_heads, dim_per_head).transpose(0, 2, 1, 3) + + def unshape(x): + """group heads""" + return x.transpose(0, 2, 1, 3).reshape(bs, -1, self.n_heads * dim_per_head) + + q = shape(self.q_lin(query)) # (bs, n_heads, q_len, dim_per_head) + k = shape(self.k_lin(key)) # (bs, n_heads, k_len, dim_per_head) + v = shape(self.v_lin(value)) # (bs, n_heads, k_len, dim_per_head) + + q = q / math.sqrt(dim_per_head) # (bs, n_heads, q_len, dim_per_head) + scores = jnp.matmul(q, k.transpose(0, 1, 3, 2)) # (bs, n_heads, q_len, k_len) + mask = jnp.reshape(mask, mask_reshp) + + mask = mask.astype(scores.dtype) + scores = scores - 1e30 * (1.0 - mask) + + weights = nn.softmax(scores, axis=-1) # (bs, n_heads, q_len, k_len) + weights = self.dropout(weights, deterministic=deterministic) + + context = jnp.matmul(weights, v) # (bs, n_heads, q_len, dim_per_head) + context = unshape(context) # (bs, q_len, dim) + context = self.out_lin(context) # (bs, q_len, dim) + + if output_attentions: + return (context, weights) + else: + return (context,) + + +class FlaxFFN(nn.Module): + config: DistilBertConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.dropout = nn.Dropout(rate=self.config.dropout) + self.chunk_size_feed_forward = self.config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.lin1 = nn.Dense( + self.config.hidden_dim, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range), + ) + self.lin2 = nn.Dense( + self.config.dim, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range), + ) + + self.activation = ACT2FN[self.config.activation] + + def __call__(self, hidden_states, deterministic: bool = True): + hidden_states = self.lin1(hidden_states) + hidden_states = self.activation(hidden_states) + hidden_states = self.lin2(hidden_states) + hidden_states = self.dropout(hidden_states, deterministic=deterministic) + return hidden_states + + +class FlaxTransformerBlock(nn.Module): + config: DistilBertConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + assert self.config.dim % self.config.n_heads == 0, ( + f"Hidden size {self.config.dim} not dividable by number of heads {self.config.n_heads}" + ) + + self.attention = FlaxMultiHeadSelfAttention(self.config, dtype=self.dtype) + self.sa_layer_norm = nn.LayerNorm(epsilon=1e-12, dtype=self.dtype) + + self.ffn = FlaxFFN(self.config, dtype=self.dtype) + self.output_layer_norm = nn.LayerNorm(epsilon=1e-12, dtype=self.dtype) + + def __call__( + self, + hidden_states, + attn_mask, + output_attentions: bool = False, + deterministic: bool = True, + ): + # Self-Attention + sa_output = self.attention( + query=hidden_states, + key=hidden_states, + value=hidden_states, + mask=attn_mask, + output_attentions=output_attentions, + deterministic=deterministic, + ) + if output_attentions: + sa_output, sa_weights = sa_output + else: + assert type(sa_output) is tuple + sa_output = sa_output[0] + sa_output = self.sa_layer_norm(sa_output + hidden_states) + + # Feed Forward Network + ffn_output = self.ffn(sa_output, deterministic=deterministic) + ffn_output = self.output_layer_norm(ffn_output + sa_output) + output = (ffn_output,) + if 
output_attentions: + output = (sa_weights,) + output + return output + + +class FlaxTransformer(nn.Module): + config: DistilBertConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.layers = [ + FlaxTransformerBlock(self.config, name=str(i), dtype=self.dtype) for i in range(self.config.n_layers) + ] + + def __call__( + self, + hidden_states, + attention_mask, + output_attentions: bool = False, + output_hidden_states: bool = False, + deterministic: bool = True, + return_dict: bool = False, + ): + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + for layer_module in self.layers: + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_outputs = layer_module( + hidden_states=hidden_states, + attn_mask=attention_mask, + output_attentions=output_attentions, + deterministic=deterministic, + ) + hidden_states = layer_outputs[-1] + + if output_attentions: + assert len(layer_outputs) == 2 + attentions = layer_outputs[0] + all_attentions = all_attentions + (attentions,) + else: + assert len(layer_outputs) == 1 + + # Add last layer + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_attentions, all_hidden_states] if v is not None) + return FlaxBaseModelOutput( + last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions + ) + + +class FlaxTransformerEncoder(nn.Module): + config: DistilBertConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.layer = FlaxTransformer(self.config, dtype=self.dtype) + + def __call__( + self, + hidden_states, + attention_mask, + output_attentions: bool = False, + output_hidden_states: bool = False, + deterministic: bool = True, + return_dict: bool = False, + ): + return self.layer( + hidden_states=hidden_states, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + deterministic=deterministic, + return_dict=return_dict, + ) + + +class FlaxDistilBertLMDecoder(nn.Module): + config: DistilBertConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + bias_init: Callable[..., np.ndarray] = jax.nn.initializers.zeros + + def setup(self): + self.bias = self.param("bias", self.bias_init, (self.config.vocab_size,)) + + def __call__(self, inputs, kernel): + inputs = jnp.asarray(inputs, self.dtype) + kernel = jnp.asarray(kernel, self.dtype) + y = lax.dot_general(inputs, kernel, (((inputs.ndim - 1,), (0,)), ((), ()))) + bias = jnp.asarray(self.bias, self.dtype) + y = y + bias + return y + + +class FlaxDistilBertPreTrainedModel(FlaxPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = DistilBertConfig + base_model_prefix = "distilbert" + module_class: nn.Module = None + + def __init__( + self, + config: DistilBertConfig, + input_shape: tuple = (1, 1), + seed: int = 0, + dtype: jnp.dtype = jnp.float32, + _do_init: bool = True, + **kwargs, + ): + module = self.module_class(config=config, dtype=dtype, **kwargs) + super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) + + def init_weights(self, rng: jax.random.PRNGKey, input_shape: tuple, params: FrozenDict = None) -> FrozenDict: + # init input tensors + input_ids = jnp.zeros(input_shape, dtype="i4") + attention_mask = jnp.ones_like(input_ids) + + params_rng, dropout_rng = jax.random.split(rng) + rngs = {"params": params_rng, "dropout": dropout_rng} + + random_params = self.module.init(rngs, input_ids, attention_mask, return_dict=False)["params"] + + if params is not None: + random_params = flatten_dict(unfreeze(random_params)) + params = flatten_dict(unfreeze(params)) + for missing_key in self._missing_keys: + params[missing_key] = random_params[missing_key] + self._missing_keys = set() + return freeze(unflatten_dict(params)) + else: + return random_params + + @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + def __call__( + self, + input_ids, + attention_mask=None, + head_mask=None, + params: Optional[dict] = None, + dropout_rng: jax.random.PRNGKey = None, + train: bool = False, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + if attention_mask is None: + attention_mask = jnp.ones_like(input_ids) + + # Handle any PRNG if needed + rngs = {} + if dropout_rng is not None: + rngs["dropout"] = dropout_rng + + return self.module.apply( + {"params": params or self.params}, + jnp.array(input_ids, dtype="i4"), + jnp.array(attention_mask, dtype="i4"), + not train, + output_attentions, + output_hidden_states, + return_dict, + rngs=rngs, + ) + + +class FlaxDistilBertModule(nn.Module): + config: DistilBertConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.embeddings = FlaxEmbeddings(self.config, dtype=self.dtype) + self.transformer = FlaxTransformerEncoder(self.config, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + input_embeds = self.embeddings(input_ids, deterministic=deterministic) + return self.transformer( + hidden_states=input_embeds, + attention_mask=attention_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + +@add_start_docstrings( + "The bare DistilBert Model 
transformer outputting raw hidden-states without any specific head on top.", + FLAX_DISTILBERT_START_DOCSTRING, +) +class FlaxDistilBertModel(FlaxDistilBertPreTrainedModel): + module_class = FlaxDistilBertModule + + +append_call_sample_docstring(FlaxDistilBertModel, _CHECKPOINT_FOR_DOC, None, _CONFIG_FOR_DOC) + + +class FlaxDistilBertForMaskedLMModule(nn.Module): + config: DistilBertConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.distilbert = FlaxDistilBertModule(self.config, dtype=self.dtype) + self.vocab_transform = nn.Dense( + self.config.dim, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range), + ) + self.vocab_layer_norm = nn.LayerNorm(epsilon=1e-12, dtype=self.dtype) + if self.config.tie_word_embeddings: + self.vocab_projector = FlaxDistilBertLMDecoder( + self.config, + dtype=self.dtype, + ) + else: + self.vocab_projector = nn.Dense( + self.config.vocab_size, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range), + ) + + def __call__( + self, + input_ids, + attention_mask, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + dlbrt_output = self.distilbert( + input_ids=input_ids, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + deterministic=deterministic, + return_dict=return_dict, + ) + hidden_states = dlbrt_output[0] + prediction_logits = self.vocab_transform(hidden_states) + prediction_logits = ACT2FN[self.config.activation](prediction_logits) + prediction_logits = self.vocab_layer_norm(prediction_logits) + + if self.config.tie_word_embeddings: + shared_embedding = self.distilbert.variables["params"]["embeddings"]["word_embeddings"]["embedding"] + prediction_logits = self.vocab_projector(prediction_logits, shared_embedding.T) + else: + prediction_logits = self.vocab_projector(prediction_logits) + + if not return_dict: + output = (prediction_logits,) + dlbrt_output[1:] + return output + + return FlaxMaskedLMOutput( + logits=prediction_logits, + hidden_states=dlbrt_output.hidden_states, + attentions=dlbrt_output.attentions, + ) + + +@add_start_docstrings("""DistilBert Model with a `language modeling` head on top.""", FLAX_DISTILBERT_START_DOCSTRING) +class FlaxDistilBertForMaskedLM(FlaxDistilBertPreTrainedModel): + module_class = FlaxDistilBertForMaskedLMModule + + +append_call_sample_docstring(FlaxDistilBertForMaskedLM, _CHECKPOINT_FOR_DOC, FlaxMaskedLMOutput, _CONFIG_FOR_DOC) + + +class FlaxDistilBertForSequenceClassificationModule(nn.Module): + config: DistilBertConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.distilbert = FlaxDistilBertModule(config=self.config, dtype=self.dtype) + self.pre_classifier = nn.Dense( + self.config.dim, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range), + ) + self.dropout = nn.Dropout(rate=self.config.seq_classif_dropout) + self.classifier = nn.Dense( + self.config.num_labels, + dtype=self.dtype, + ) + + def __call__( + self, + input_ids, + attention_mask, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + # Model + 
distilbert_output = self.distilbert( + input_ids, + attention_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_state = distilbert_output[0] # (bs, seq_len, dim) + pooled_output = hidden_state[:, 0] # (bs, dim) + pooled_output = self.pre_classifier(pooled_output) # (bs, dim) + pooled_output = ACT2FN["relu"](pooled_output) + pooled_output = self.dropout(pooled_output, deterministic=deterministic) + logits = self.classifier(pooled_output) # (bs, dim) + + if not return_dict: + return (logits,) + distilbert_output[1:] + + return FlaxSequenceClassifierOutput( + logits=logits, + hidden_states=distilbert_output.hidden_states, + attentions=distilbert_output.attentions, + ) + + +@add_start_docstrings( + """ + DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the + pooled output) e.g. for GLUE tasks. + """, + FLAX_DISTILBERT_START_DOCSTRING, +) +class FlaxDistilBertForSequenceClassification(FlaxDistilBertPreTrainedModel): + module_class = FlaxDistilBertForSequenceClassificationModule + + +append_call_sample_docstring( + FlaxDistilBertForSequenceClassification, + _CHECKPOINT_FOR_DOC, + FlaxSequenceClassifierOutput, + _CONFIG_FOR_DOC, +) + + +class FlaxDistilBertForMultipleChoiceModule(nn.Module): + config: DistilBertConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.distilbert = FlaxDistilBertModule(config=self.config, dtype=self.dtype) + self.pre_classifier = nn.Dense( + self.config.dim, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range), + ) + self.dropout = nn.Dropout(rate=self.config.seq_classif_dropout) + self.classifier = nn.Dense( + 1, + dtype=self.dtype, + ) + + def __call__( + self, + input_ids, + attention_mask, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + num_choices = input_ids.shape[1] + input_ids = input_ids.reshape(-1, input_ids.shape[-1]) if input_ids is not None else None + attention_mask = attention_mask.reshape(-1, attention_mask.shape[-1]) if attention_mask is not None else None + + # Model + outputs = self.distilbert( + input_ids, + attention_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_state = outputs[0] + pooled_output = hidden_state[:, 0] + pooled_output = self.pre_classifier(pooled_output) + pooled_output = ACT2FN["relu"](pooled_output) + pooled_output = self.dropout(pooled_output, deterministic=deterministic) + logits = self.classifier(pooled_output) + + reshaped_logits = logits.reshape(-1, num_choices) + + if not return_dict: + return (reshaped_logits,) + outputs[2:] + + return FlaxMultipleChoiceModelOutput( + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + DistilBert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and + a softmax) e.g. for RocStories/SWAG tasks. 
+ """, + FLAX_DISTILBERT_START_DOCSTRING, +) +class FlaxDistilBertForMultipleChoice(FlaxDistilBertPreTrainedModel): + module_class = FlaxDistilBertForMultipleChoiceModule + + +overwrite_call_docstring( + FlaxDistilBertForMultipleChoice, DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") +) +append_call_sample_docstring( + FlaxDistilBertForMultipleChoice, + _CHECKPOINT_FOR_DOC, + FlaxMultipleChoiceModelOutput, + _CONFIG_FOR_DOC, +) + + +class FlaxDistilBertForTokenClassificationModule(nn.Module): + config: DistilBertConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.distilbert = FlaxDistilBertModule(config=self.config, dtype=self.dtype) + self.dropout = nn.Dropout(rate=self.config.dropout) + self.classifier = nn.Dense(self.config.num_labels, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + # Model + outputs = self.distilbert( + input_ids, + attention_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + hidden_states = self.dropout(hidden_states, deterministic=deterministic) + logits = self.classifier(hidden_states) + + if not return_dict: + return (logits,) + outputs[1:] + + return FlaxTokenClassifierOutput( + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + DistilBert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. + for Named-Entity-Recognition (NER) tasks. 
+ """, + FLAX_DISTILBERT_START_DOCSTRING, +) +class FlaxDistilBertForTokenClassification(FlaxDistilBertPreTrainedModel): + module_class = FlaxDistilBertForTokenClassificationModule + + +append_call_sample_docstring( + FlaxDistilBertForTokenClassification, + _CHECKPOINT_FOR_DOC, + FlaxTokenClassifierOutput, + _CONFIG_FOR_DOC, +) + + +class FlaxDistilBertForQuestionAnsweringModule(nn.Module): + config: DistilBertConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.distilbert = FlaxDistilBertModule(config=self.config, dtype=self.dtype) + self.qa_outputs = nn.Dense(self.config.num_labels, dtype=self.dtype) + assert self.config.num_labels == 2 + self.dropout = nn.Dropout(rate=self.config.qa_dropout) + + def __call__( + self, + input_ids, + attention_mask, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # Model + distilbert_output = self.distilbert( + input_ids, + attention_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = distilbert_output[0] + + hidden_states = self.dropout(hidden_states, deterministic=deterministic) + logits = self.qa_outputs(hidden_states) + start_logits, end_logits = jnp.split(logits, self.config.num_labels, axis=-1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) + + if not return_dict: + return (start_logits, end_logits) + distilbert_output[1:] + + return FlaxQuestionAnsweringModelOutput( + start_logits=start_logits, + end_logits=end_logits, + hidden_states=distilbert_output.hidden_states, + attentions=distilbert_output.attentions, + ) + + +@add_start_docstrings( + """ + DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a + linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + FLAX_DISTILBERT_START_DOCSTRING, +) +class FlaxDistilBertForQuestionAnswering(FlaxDistilBertPreTrainedModel): + module_class = FlaxDistilBertForQuestionAnsweringModule + + +append_call_sample_docstring( + FlaxDistilBertForQuestionAnswering, + _CHECKPOINT_FOR_DOC, + FlaxQuestionAnsweringModelOutput, + _CONFIG_FOR_DOC, +) + + +__all__ = [ + "FlaxDistilBertForMaskedLM", + "FlaxDistilBertForMultipleChoice", + "FlaxDistilBertForQuestionAnswering", + "FlaxDistilBertForSequenceClassification", + "FlaxDistilBertForTokenClassification", + "FlaxDistilBertModel", + "FlaxDistilBertPreTrainedModel", +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/distilbert/modeling_tf_distilbert.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/distilbert/modeling_tf_distilbert.py new file mode 100644 index 0000000000000000000000000000000000000000..a2efa1105c1cde9236e9c32627a5773cb38cc862 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/distilbert/modeling_tf_distilbert.py @@ -0,0 +1,1146 @@ +# coding=utf-8 +# Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +TF 2.0 DistilBERT model +""" + +from __future__ import annotations + +import warnings + +import numpy as np +import tensorflow as tf + +from ...activations_tf import get_tf_activation +from ...modeling_tf_outputs import ( + TFBaseModelOutput, + TFMaskedLMOutput, + TFMultipleChoiceModelOutput, + TFQuestionAnsweringModelOutput, + TFSequenceClassifierOutput, + TFTokenClassifierOutput, +) +from ...modeling_tf_utils import ( + TFMaskedLanguageModelingLoss, + TFModelInputType, + TFMultipleChoiceLoss, + TFPreTrainedModel, + TFQuestionAnsweringLoss, + TFSequenceClassificationLoss, + TFTokenClassificationLoss, + get_initializer, + keras, + keras_serializable, + unpack_inputs, +) +from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax +from ...utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, +) +from .configuration_distilbert import DistilBertConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "distilbert-base-uncased" +_CONFIG_FOR_DOC = "DistilBertConfig" + + +class TFEmbeddings(keras.layers.Layer): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + self.config = config + self.dim = config.dim + self.initializer_range = config.initializer_range + self.max_position_embeddings = config.max_position_embeddings + self.LayerNorm = keras.layers.LayerNormalization(epsilon=1e-12, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.dropout) + + def build(self, input_shape=None): + with tf.name_scope("word_embeddings"): + self.weight = self.add_weight( + name="weight", + shape=[self.config.vocab_size, self.dim], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + with tf.name_scope("position_embeddings"): + self.position_embeddings = self.add_weight( + name="embeddings", + shape=[self.max_position_embeddings, self.dim], + initializer=get_initializer(initializer_range=self.initializer_range), + ) + + if self.built: + return + self.built = True + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.dim]) + + def call(self, input_ids=None, position_ids=None, inputs_embeds=None, training=False): + """ + Applies embedding based on inputs tensor. + + Returns: + final_embeddings (`tf.Tensor`): output embedding tensor. 
+ """ + assert not (input_ids is None and inputs_embeds is None) + + if input_ids is not None: + check_embeddings_within_bounds(input_ids, self.config.vocab_size) + inputs_embeds = tf.gather(params=self.weight, indices=input_ids) + + input_shape = shape_list(inputs_embeds)[:-1] + + if position_ids is None: + position_ids = tf.expand_dims(tf.range(start=0, limit=input_shape[-1]), axis=0) + + position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids) + final_embeddings = inputs_embeds + position_embeds + final_embeddings = self.LayerNorm(inputs=final_embeddings) + final_embeddings = self.dropout(inputs=final_embeddings, training=training) + + return final_embeddings + + +class TFMultiHeadSelfAttention(keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + + self.n_heads = config.n_heads + self.dim = config.dim + self.dropout = keras.layers.Dropout(config.attention_dropout) + self.output_attentions = config.output_attentions + + assert self.dim % self.n_heads == 0, f"Hidden size {self.dim} not dividable by number of heads {self.n_heads}" + + self.q_lin = keras.layers.Dense( + config.dim, kernel_initializer=get_initializer(config.initializer_range), name="q_lin" + ) + self.k_lin = keras.layers.Dense( + config.dim, kernel_initializer=get_initializer(config.initializer_range), name="k_lin" + ) + self.v_lin = keras.layers.Dense( + config.dim, kernel_initializer=get_initializer(config.initializer_range), name="v_lin" + ) + self.out_lin = keras.layers.Dense( + config.dim, kernel_initializer=get_initializer(config.initializer_range), name="out_lin" + ) + + self.pruned_heads = set() + self.config = config + + def prune_heads(self, heads): + raise NotImplementedError + + def call(self, query, key, value, mask, head_mask, output_attentions, training=False): + """ + Parameters: + query: tf.Tensor(bs, seq_length, dim) + key: tf.Tensor(bs, seq_length, dim) + value: tf.Tensor(bs, seq_length, dim) + mask: tf.Tensor(bs, seq_length) + + Returns: + weights: tf.Tensor(bs, n_heads, seq_length, seq_length) Attention weights context: tf.Tensor(bs, + seq_length, dim) Contextualized layer. 
Optional: only if `output_attentions=True` + """ + bs, q_length, dim = shape_list(query) + k_length = shape_list(key)[1] + # assert dim == self.dim, f'Dimensions do not match: {dim} input vs {self.dim} configured' + # assert key.size() == value.size() + dim_per_head = int(self.dim / self.n_heads) + dim_per_head = tf.cast(dim_per_head, dtype=tf.int32) + mask_reshape = [bs, 1, 1, k_length] + + def shape(x): + """separate heads""" + return tf.transpose(tf.reshape(x, (bs, -1, self.n_heads, dim_per_head)), perm=(0, 2, 1, 3)) + + def unshape(x): + """group heads""" + return tf.reshape(tf.transpose(x, perm=(0, 2, 1, 3)), (bs, -1, self.n_heads * dim_per_head)) + + q = shape(self.q_lin(query)) # (bs, n_heads, q_length, dim_per_head) + k = shape(self.k_lin(key)) # (bs, n_heads, k_length, dim_per_head) + v = shape(self.v_lin(value)) # (bs, n_heads, k_length, dim_per_head) + q = tf.cast(q, dtype=tf.float32) + q = tf.multiply(q, tf.math.rsqrt(tf.cast(dim_per_head, dtype=tf.float32))) + k = tf.cast(k, dtype=q.dtype) + scores = tf.matmul(q, k, transpose_b=True) # (bs, n_heads, q_length, k_length) + mask = tf.reshape(mask, mask_reshape) # (bs, n_heads, qlen, klen) + # scores.masked_fill_(mask, -float('inf')) # (bs, n_heads, q_length, k_length) + + mask = tf.cast(mask, dtype=scores.dtype) + scores = scores - 1e30 * (1.0 - mask) + weights = stable_softmax(scores, axis=-1) # (bs, n_heads, qlen, klen) + weights = self.dropout(weights, training=training) # (bs, n_heads, qlen, klen) + + # Mask heads if we want to + if head_mask is not None: + weights = weights * head_mask + + context = tf.matmul(weights, v) # (bs, n_heads, qlen, dim_per_head) + context = unshape(context) # (bs, q_length, dim) + context = self.out_lin(context) # (bs, q_length, dim) + + if output_attentions: + return (context, weights) + else: + return (context,) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "q_lin", None) is not None: + with tf.name_scope(self.q_lin.name): + self.q_lin.build([None, None, self.config.dim]) + if getattr(self, "k_lin", None) is not None: + with tf.name_scope(self.k_lin.name): + self.k_lin.build([None, None, self.config.dim]) + if getattr(self, "v_lin", None) is not None: + with tf.name_scope(self.v_lin.name): + self.v_lin.build([None, None, self.config.dim]) + if getattr(self, "out_lin", None) is not None: + with tf.name_scope(self.out_lin.name): + self.out_lin.build([None, None, self.config.dim]) + + +class TFFFN(keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + self.dropout = keras.layers.Dropout(config.dropout) + self.lin1 = keras.layers.Dense( + config.hidden_dim, kernel_initializer=get_initializer(config.initializer_range), name="lin1" + ) + self.lin2 = keras.layers.Dense( + config.dim, kernel_initializer=get_initializer(config.initializer_range), name="lin2" + ) + self.activation = get_tf_activation(config.activation) + self.config = config + + def call(self, input, training=False): + x = self.lin1(input) + x = self.activation(x) + x = self.lin2(x) + x = self.dropout(x, training=training) + return x + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "lin1", None) is not None: + with tf.name_scope(self.lin1.name): + self.lin1.build([None, None, self.config.dim]) + if getattr(self, "lin2", None) is not None: + with tf.name_scope(self.lin2.name): + self.lin2.build([None, None, self.config.hidden_dim]) + + +class TFTransformerBlock(keras.layers.Layer): + def 
__init__(self, config, **kwargs): + super().__init__(**kwargs) + + self.n_heads = config.n_heads + self.dim = config.dim + self.hidden_dim = config.hidden_dim + self.dropout = keras.layers.Dropout(config.dropout) + self.activation = config.activation + self.output_attentions = config.output_attentions + + assert config.dim % config.n_heads == 0, ( + f"Hidden size {config.dim} not dividable by number of heads {config.n_heads}" + ) + + self.attention = TFMultiHeadSelfAttention(config, name="attention") + self.sa_layer_norm = keras.layers.LayerNormalization(epsilon=1e-12, name="sa_layer_norm") + + self.ffn = TFFFN(config, name="ffn") + self.output_layer_norm = keras.layers.LayerNormalization(epsilon=1e-12, name="output_layer_norm") + self.config = config + + def call(self, x, attn_mask, head_mask, output_attentions, training=False): # removed: src_enc=None, src_len=None + """ + Parameters: + x: tf.Tensor(bs, seq_length, dim) + attn_mask: tf.Tensor(bs, seq_length) + + Outputs: sa_weights: tf.Tensor(bs, n_heads, seq_length, seq_length) The attention weights ffn_output: + tf.Tensor(bs, seq_length, dim) The output of the transformer block contextualization. + """ + # Self-Attention + sa_output = self.attention(x, x, x, attn_mask, head_mask, output_attentions, training=training) + if output_attentions: + sa_output, sa_weights = sa_output # (bs, seq_length, dim), (bs, n_heads, seq_length, seq_length) + else: # To handle these `output_attentions` or `output_hidden_states` cases returning tuples + # assert type(sa_output) == tuple + sa_output = sa_output[0] + sa_output = self.sa_layer_norm(sa_output + x) # (bs, seq_length, dim) + + # Feed Forward Network + ffn_output = self.ffn(sa_output, training=training) # (bs, seq_length, dim) + ffn_output = self.output_layer_norm(ffn_output + sa_output) # (bs, seq_length, dim) + + output = (ffn_output,) + if output_attentions: + output = (sa_weights,) + output + return output + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "attention", None) is not None: + with tf.name_scope(self.attention.name): + self.attention.build(None) + if getattr(self, "sa_layer_norm", None) is not None: + with tf.name_scope(self.sa_layer_norm.name): + self.sa_layer_norm.build([None, None, self.config.dim]) + if getattr(self, "ffn", None) is not None: + with tf.name_scope(self.ffn.name): + self.ffn.build(None) + if getattr(self, "output_layer_norm", None) is not None: + with tf.name_scope(self.output_layer_norm.name): + self.output_layer_norm.build([None, None, self.config.dim]) + + +class TFTransformer(keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + self.n_layers = config.n_layers + self.output_hidden_states = config.output_hidden_states + self.output_attentions = config.output_attentions + + self.layer = [TFTransformerBlock(config, name=f"layer_._{i}") for i in range(config.n_layers)] + + def call(self, x, attn_mask, head_mask, output_attentions, output_hidden_states, return_dict, training=False): + # docstyle-ignore + """ + Parameters: + x: tf.Tensor(bs, seq_length, dim) Input sequence embedded. + attn_mask: tf.Tensor(bs, seq_length) Attention mask on the sequence. + + Returns: + hidden_state: tf.Tensor(bs, seq_length, dim) + Sequence of hidden states in the last (top) layer + all_hidden_states: tuple[tf.Tensor(bs, seq_length, dim)] + Tuple of length n_layers with the hidden states from each layer. 
+ Optional: only if output_hidden_states=True + all_attentions: tuple[tf.Tensor(bs, n_heads, seq_length, seq_length)] + Tuple of length n_layers with the attention weights from each layer + Optional: only if output_attentions=True + """ + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + hidden_state = x + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_state,) + + layer_outputs = layer_module(hidden_state, attn_mask, head_mask[i], output_attentions, training=training) + hidden_state = layer_outputs[-1] + + if output_attentions: + assert len(layer_outputs) == 2 + attentions = layer_outputs[0] + all_attentions = all_attentions + (attentions,) + else: + assert len(layer_outputs) == 1, f"Incorrect number of outputs {len(layer_outputs)} instead of 1" + + # Add last layer + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_state,) + + if not return_dict: + return tuple(v for v in [hidden_state, all_hidden_states, all_attentions] if v is not None) + return TFBaseModelOutput( + last_hidden_state=hidden_state, hidden_states=all_hidden_states, attentions=all_attentions + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layer", None) is not None: + for layer in self.layer: + with tf.name_scope(layer.name): + layer.build(None) + + +@keras_serializable +class TFDistilBertMainLayer(keras.layers.Layer): + config_class = DistilBertConfig + + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + + self.config = config + self.num_hidden_layers = config.num_hidden_layers + self.output_attentions = config.output_attentions + self.output_hidden_states = config.output_hidden_states + self.return_dict = config.use_return_dict + + self.embeddings = TFEmbeddings(config, name="embeddings") # Embeddings + self.transformer = TFTransformer(config, name="transformer") # Encoder + + def get_input_embeddings(self): + return self.embeddings + + def set_input_embeddings(self, value): + self.embeddings.weight = value + self.embeddings.vocab_size = value.shape[0] + + def _prune_heads(self, heads_to_prune): + raise NotImplementedError + + @unpack_inputs + def call( + self, + input_ids=None, + attention_mask=None, + head_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + training=False, + ): + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = shape_list(input_ids) + elif inputs_embeds is not None: + input_shape = shape_list(inputs_embeds)[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if attention_mask is None: + attention_mask = tf.ones(input_shape) # (bs, seq_length) + + attention_mask = tf.cast(attention_mask, dtype=tf.float32) + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + if head_mask is not None: + raise NotImplementedError + else: + head_mask = [None] * self.num_hidden_layers + + embedding_output = self.embeddings(input_ids, inputs_embeds=inputs_embeds) # (bs, seq_length, dim) 
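The (bs, seq_length) attention mask prepared above is consumed by TFMultiHeadSelfAttention earlier in this file, which reshapes it to (bs, 1, 1, k_length) and subtracts a large constant from the raw scores at masked positions so the softmax gives them essentially zero weight. A minimal standalone sketch of that additive-masking trick, with plain tf.nn.softmax standing in for the library's stable_softmax:

import tensorflow as tf

scores = tf.random.normal((1, 1, 3, 3))            # (bs, n_heads, q_length, k_length)
mask = tf.constant([[1.0, 1.0, 0.0]])              # 1 = attend, 0 = padding
mask = tf.reshape(mask, (1, 1, 1, 3))              # broadcastable against the scores
masked_scores = scores - 1e30 * (1.0 - mask)
weights = tf.nn.softmax(masked_scores, axis=-1)    # the padded key column is ~0 in every row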
+ tfmr_output = self.transformer( + embedding_output, + attention_mask, + head_mask, + output_attentions, + output_hidden_states, + return_dict, + training=training, + ) + + return tfmr_output # last-layer hidden-state, (all hidden_states), (all attentions) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embeddings", None) is not None: + with tf.name_scope(self.embeddings.name): + self.embeddings.build(None) + if getattr(self, "transformer", None) is not None: + with tf.name_scope(self.transformer.name): + self.transformer.build(None) + + +# INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL # +class TFDistilBertPreTrainedModel(TFPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = DistilBertConfig + base_model_prefix = "distilbert" + + +DISTILBERT_START_DOCSTRING = r""" + + This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and + behavior. + + + + TensorFlow models and layers in `transformers` accept two formats as input: + + - having all inputs as keyword arguments (like PyTorch models), or + - having all inputs as a list, tuple or dict in the first positional argument. + + The reason the second format is supported is that Keras methods prefer this format when passing inputs to models + and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just + pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second + format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with + the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first + positional argument: + + - a single Tensor with `input_ids` only and nothing else: `model(input_ids)` + - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: + `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])` + - a dictionary with one or several input Tensors associated to the input names given in the docstring: + `model({"input_ids": input_ids, "token_type_ids": token_type_ids})` + + Note that when creating models and layers with + [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry + about any of this, as you can just pass inputs like you would to any other Python function! + + + + Parameters: + config ([`DistilBertConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +DISTILBERT_INPUTS_DOCSTRING = r""" + Args: + input_ids (`Numpy array` or `tf.Tensor` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. 
See [`PreTrainedTokenizer.__call__`] and + [`PreTrainedTokenizer.encode`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + head_mask (`Numpy array` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`tf.Tensor` of shape `({0}, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the + config will be used instead. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. This argument can be used only in eager mode, in graph mode the value in the config will be + used instead. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in + eager mode, in graph mode the value will always be set to True. + training (`bool`, *optional*, defaults to `False`): + Whether or not to use the model in training mode (some modules like dropout modules have different + behaviors between training and evaluation). 
+""" + + +@add_start_docstrings( + "The bare DistilBERT encoder/transformer outputting raw hidden-states without any specific head on top.", + DISTILBERT_START_DOCSTRING, +) +class TFDistilBertModel(TFDistilBertPreTrainedModel): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.distilbert = TFDistilBertMainLayer(config, name="distilbert") # Embeddings + + @unpack_inputs + @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFBaseModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool | None = False, + ) -> TFBaseModelOutput | tuple[tf.Tensor]: + outputs = self.distilbert( + input_ids=input_ids, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "distilbert", None) is not None: + with tf.name_scope(self.distilbert.name): + self.distilbert.build(None) + + +class TFDistilBertLMHead(keras.layers.Layer): + def __init__(self, config, input_embeddings, **kwargs): + super().__init__(**kwargs) + + self.config = config + self.dim = config.dim + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. 
+ self.input_embeddings = input_embeddings + + def build(self, input_shape): + self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias") + + super().build(input_shape) + + def get_output_embeddings(self): + return self.input_embeddings + + def set_output_embeddings(self, value): + self.input_embeddings.weight = value + self.input_embeddings.vocab_size = shape_list(value)[0] + + def get_bias(self): + return {"bias": self.bias} + + def set_bias(self, value): + self.bias = value["bias"] + self.config.vocab_size = shape_list(value["bias"])[0] + + def call(self, hidden_states): + seq_length = shape_list(tensor=hidden_states)[1] + hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.dim]) + hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True) + hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size]) + hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias) + + return hidden_states + + +@add_start_docstrings( + """DistilBert Model with a `masked language modeling` head on top.""", + DISTILBERT_START_DOCSTRING, +) +class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel, TFMaskedLanguageModelingLoss): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.config = config + + self.distilbert = TFDistilBertMainLayer(config, name="distilbert") + self.vocab_transform = keras.layers.Dense( + config.dim, kernel_initializer=get_initializer(config.initializer_range), name="vocab_transform" + ) + self.act = get_tf_activation(config.activation) + self.vocab_layer_norm = keras.layers.LayerNormalization(epsilon=1e-12, name="vocab_layer_norm") + self.vocab_projector = TFDistilBertLMHead(config, self.distilbert.embeddings, name="vocab_projector") + + def get_lm_head(self): + return self.vocab_projector + + def get_prefix_bias_name(self): + warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning) + return self.name + "/" + self.vocab_projector.name + + @unpack_inputs + @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFMaskedLMOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + labels: np.ndarray | tf.Tensor | None = None, + training: bool | None = False, + ) -> TFMaskedLMOutput | tuple[tf.Tensor]: + r""" + labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. 
Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + """ + distilbert_output = self.distilbert( + input_ids=input_ids, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + hidden_states = distilbert_output[0] # (bs, seq_length, dim) + prediction_logits = self.vocab_transform(hidden_states) # (bs, seq_length, dim) + prediction_logits = self.act(prediction_logits) # (bs, seq_length, dim) + prediction_logits = self.vocab_layer_norm(prediction_logits) # (bs, seq_length, dim) + prediction_logits = self.vocab_projector(prediction_logits) + + loss = None if labels is None else self.hf_compute_loss(labels, prediction_logits) + + if not return_dict: + output = (prediction_logits,) + distilbert_output[1:] + return ((loss,) + output) if loss is not None else output + + return TFMaskedLMOutput( + loss=loss, + logits=prediction_logits, + hidden_states=distilbert_output.hidden_states, + attentions=distilbert_output.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "distilbert", None) is not None: + with tf.name_scope(self.distilbert.name): + self.distilbert.build(None) + if getattr(self, "vocab_transform", None) is not None: + with tf.name_scope(self.vocab_transform.name): + self.vocab_transform.build([None, None, self.config.dim]) + if getattr(self, "vocab_layer_norm", None) is not None: + with tf.name_scope(self.vocab_layer_norm.name): + self.vocab_layer_norm.build([None, None, self.config.dim]) + if getattr(self, "vocab_projector", None) is not None: + with tf.name_scope(self.vocab_projector.name): + self.vocab_projector.build(None) + + +@add_start_docstrings( + """ + DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the + pooled output) e.g. for GLUE tasks. 
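A hedged sketch of running the TFDistilBertForMaskedLM defined above as a fill-mask predictor; for training, `labels` follow the convention in its docstring, with `-100` at every position that should not contribute to the loss. The checkpoint name is illustrative:

import tensorflow as tf
from transformers import AutoTokenizer, TFDistilBertForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = TFDistilBertForMaskedLM.from_pretrained("distilbert-base-uncased")

inputs = tokenizer("Paris is the capital of [MASK].", return_tensors="tf")
logits = model(**inputs).logits                                   # (1, seq_length, vocab_size)

mask_index = int(tf.where(inputs["input_ids"][0] == tokenizer.mask_token_id)[0][0])
predicted_id = int(tf.argmax(logits[0, mask_index]))
print(tokenizer.decode([predicted_id]))                           # most likely "france"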
+ """, + DISTILBERT_START_DOCSTRING, +) +class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel, TFSequenceClassificationLoss): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.num_labels = config.num_labels + + self.distilbert = TFDistilBertMainLayer(config, name="distilbert") + self.pre_classifier = keras.layers.Dense( + config.dim, + kernel_initializer=get_initializer(config.initializer_range), + activation="relu", + name="pre_classifier", + ) + self.classifier = keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) + self.dropout = keras.layers.Dropout(config.seq_classif_dropout) + self.config = config + + @unpack_inputs + @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFSequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + labels: np.ndarray | tf.Tensor | None = None, + training: bool | None = False, + ) -> TFSequenceClassifierOutput | tuple[tf.Tensor]: + r""" + labels (`tf.Tensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + distilbert_output = self.distilbert( + input_ids=input_ids, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + hidden_state = distilbert_output[0] # (bs, seq_len, dim) + pooled_output = hidden_state[:, 0] # (bs, dim) + pooled_output = self.pre_classifier(pooled_output) # (bs, dim) + pooled_output = self.dropout(pooled_output, training=training) # (bs, dim) + logits = self.classifier(pooled_output) # (bs, dim) + + loss = None if labels is None else self.hf_compute_loss(labels, logits) + + if not return_dict: + output = (logits,) + distilbert_output[1:] + return ((loss,) + output) if loss is not None else output + + return TFSequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=distilbert_output.hidden_states, + attentions=distilbert_output.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "distilbert", None) is not None: + with tf.name_scope(self.distilbert.name): + self.distilbert.build(None) + if getattr(self, "pre_classifier", None) is not None: + with tf.name_scope(self.pre_classifier.name): + self.pre_classifier.build([None, None, self.config.dim]) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build([None, None, self.config.dim]) + + +@add_start_docstrings( + """ + DistilBert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. + for Named-Entity-Recognition (NER) tasks. 
+ """, + DISTILBERT_START_DOCSTRING, +) +class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel, TFTokenClassificationLoss): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.num_labels = config.num_labels + + self.distilbert = TFDistilBertMainLayer(config, name="distilbert") + self.dropout = keras.layers.Dropout(config.dropout) + self.classifier = keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) + self.config = config + + @unpack_inputs + @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFTokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + labels: np.ndarray | tf.Tensor | None = None, + training: bool | None = False, + ) -> TFTokenClassifierOutput | tuple[tf.Tensor]: + r""" + labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. + """ + outputs = self.distilbert( + input_ids=input_ids, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + sequence_output = outputs[0] + sequence_output = self.dropout(sequence_output, training=training) + logits = self.classifier(sequence_output) + loss = None if labels is None else self.hf_compute_loss(labels, logits) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return TFTokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "distilbert", None) is not None: + with tf.name_scope(self.distilbert.name): + self.distilbert.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build([None, None, self.config.hidden_size]) + + +@add_start_docstrings( + """ + DistilBert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and + a softmax) e.g. for RocStories/SWAG tasks. 
+ """, + DISTILBERT_START_DOCSTRING, +) +class TFDistilBertForMultipleChoice(TFDistilBertPreTrainedModel, TFMultipleChoiceLoss): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.distilbert = TFDistilBertMainLayer(config, name="distilbert") + self.dropout = keras.layers.Dropout(config.seq_classif_dropout) + self.pre_classifier = keras.layers.Dense( + config.dim, + kernel_initializer=get_initializer(config.initializer_range), + activation="relu", + name="pre_classifier", + ) + self.classifier = keras.layers.Dense( + 1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) + self.config = config + + @unpack_inputs + @add_start_docstrings_to_model_forward( + DISTILBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") + ) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFMultipleChoiceModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + labels: np.ndarray | tf.Tensor | None = None, + training: bool | None = False, + ) -> TFMultipleChoiceModelOutput | tuple[tf.Tensor]: + r""" + labels (`tf.Tensor` of shape `(batch_size,)`, *optional*): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices]` + where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above) + """ + if input_ids is not None: + num_choices = shape_list(input_ids)[1] + seq_length = shape_list(input_ids)[2] + else: + num_choices = shape_list(inputs_embeds)[1] + seq_length = shape_list(inputs_embeds)[2] + + flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None + flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None + flat_inputs_embeds = ( + tf.reshape(inputs_embeds, (-1, seq_length, shape_list(inputs_embeds)[3])) + if inputs_embeds is not None + else None + ) + distilbert_output = self.distilbert( + flat_input_ids, + flat_attention_mask, + head_mask, + flat_inputs_embeds, + output_attentions, + output_hidden_states, + return_dict=return_dict, + training=training, + ) + hidden_state = distilbert_output[0] # (bs, seq_len, dim) + pooled_output = hidden_state[:, 0] # (bs, dim) + pooled_output = self.pre_classifier(pooled_output) # (bs, dim) + pooled_output = self.dropout(pooled_output, training=training) # (bs, dim) + logits = self.classifier(pooled_output) + reshaped_logits = tf.reshape(logits, (-1, num_choices)) + + loss = None if labels is None else self.hf_compute_loss(labels, reshaped_logits) + + if not return_dict: + output = (reshaped_logits,) + distilbert_output[1:] + return ((loss,) + output) if loss is not None else output + + return TFMultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=distilbert_output.hidden_states, + attentions=distilbert_output.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "distilbert", None) is not None: + with tf.name_scope(self.distilbert.name): + self.distilbert.build(None) + if getattr(self, "pre_classifier", None) is not None: + with 
tf.name_scope(self.pre_classifier.name): + self.pre_classifier.build([None, None, self.config.dim]) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build([None, None, self.config.dim]) + + +@add_start_docstrings( + """ + DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a + linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + DISTILBERT_START_DOCSTRING, +) +class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel, TFQuestionAnsweringLoss): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.distilbert = TFDistilBertMainLayer(config, name="distilbert") + self.qa_outputs = keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" + ) + assert config.num_labels == 2, f"Incorrect number of labels {config.num_labels} instead of 2" + self.dropout = keras.layers.Dropout(config.qa_dropout) + self.config = config + + @unpack_inputs + @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFQuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + start_positions: np.ndarray | tf.Tensor | None = None, + end_positions: np.ndarray | tf.Tensor | None = None, + training: bool | None = False, + ) -> TFQuestionAnsweringModelOutput | tuple[tf.Tensor]: + r""" + start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. 
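The forward pass below splits the `qa_outputs` projection into start and end logits; here is a hedged sketch of the simplest way such logits are decoded into an answer span (plain argmax; production pipelines additionally filter invalid spans and score n-best candidates). The SQuAD-distilled checkpoint name is illustrative:

import tensorflow as tf
from transformers import AutoTokenizer, TFDistilBertForQuestionAnswering

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-distilled-squad")
model = TFDistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased-distilled-squad")

inputs = tokenizer("Who wrote Hamlet?", "Hamlet was written by Shakespeare.", return_tensors="tf")
outputs = model(**inputs)

start = int(tf.argmax(outputs.start_logits, axis=-1)[0])
end = int(tf.argmax(outputs.end_logits, axis=-1)[0])
print(tokenizer.decode(inputs["input_ids"][0, start : end + 1]))   # expected to contain "shakespeare"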
+ """ + distilbert_output = self.distilbert( + input_ids=input_ids, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + hidden_states = distilbert_output[0] # (bs, max_query_len, dim) + hidden_states = self.dropout(hidden_states, training=training) # (bs, max_query_len, dim) + logits = self.qa_outputs(hidden_states) # (bs, max_query_len, 2) + start_logits, end_logits = tf.split(logits, 2, axis=-1) + start_logits = tf.squeeze(start_logits, axis=-1) + end_logits = tf.squeeze(end_logits, axis=-1) + + loss = None + if start_positions is not None and end_positions is not None: + labels = {"start_position": start_positions} + labels["end_position"] = end_positions + loss = self.hf_compute_loss(labels, (start_logits, end_logits)) + + if not return_dict: + output = (start_logits, end_logits) + distilbert_output[1:] + return ((loss,) + output) if loss is not None else output + + return TFQuestionAnsweringModelOutput( + loss=loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=distilbert_output.hidden_states, + attentions=distilbert_output.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "distilbert", None) is not None: + with tf.name_scope(self.distilbert.name): + self.distilbert.build(None) + if getattr(self, "qa_outputs", None) is not None: + with tf.name_scope(self.qa_outputs.name): + self.qa_outputs.build([None, None, self.config.dim]) + + +__all__ = [ + "TFDistilBertForMaskedLM", + "TFDistilBertForMultipleChoice", + "TFDistilBertForQuestionAnswering", + "TFDistilBertForSequenceClassification", + "TFDistilBertForTokenClassification", + "TFDistilBertMainLayer", + "TFDistilBertModel", + "TFDistilBertPreTrainedModel", +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/distilbert/tokenization_distilbert.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/distilbert/tokenization_distilbert.py new file mode 100644 index 0000000000000000000000000000000000000000..4e44468ab1d574666053b0ba772f2f0e098d5263 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/distilbert/tokenization_distilbert.py @@ -0,0 +1,492 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Tokenization classes for DistilBERT.""" + +import collections +import os +import unicodedata +from typing import Optional + +from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace +from ...utils import logging + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} + + +# Copied from transformers.models.bert.tokenization_bert.load_vocab +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + with open(vocab_file, "r", encoding="utf-8") as reader: + tokens = reader.readlines() + for index, token in enumerate(tokens): + token = token.rstrip("\n") + vocab[token] = index + return vocab + + +# Copied from transformers.models.bert.tokenization_bert.whitespace_tokenize +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a piece of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +class DistilBertTokenizer(PreTrainedTokenizer): + r""" + Construct a DistilBERT tokenizer. Based on WordPiece. + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + File containing the vocabulary. + do_lower_case (`bool`, *optional*, defaults to `True`): + Whether or not to lowercase the input when tokenizing. + do_basic_tokenize (`bool`, *optional*, defaults to `True`): + Whether or not to do basic tokenization before WordPiece. + never_split (`Iterable`, *optional*): + Collection of tokens which will never be split during tokenization. Only has an effect when + `do_basic_tokenize=True` + unk_token (`str`, *optional*, defaults to `"[UNK]"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + sep_token (`str`, *optional*, defaults to `"[SEP]"`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + pad_token (`str`, *optional*, defaults to `"[PAD]"`): + The token used for padding, for example when batching sequences of different lengths. + cls_token (`str`, *optional*, defaults to `"[CLS]"`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + mask_token (`str`, *optional*, defaults to `"[MASK]"`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + tokenize_chinese_chars (`bool`, *optional*, defaults to `True`): + Whether or not to tokenize Chinese characters. + + This should likely be deactivated for Japanese (see this + [issue](https://github.com/huggingface/transformers/issues/328)). + strip_accents (`bool`, *optional*): + Whether or not to strip all accents. If this option is not specified, then it will be determined by the + value for `lowercase` (as in the original BERT). 
+ clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`): + Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like + extra spaces. + """ + + vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "attention_mask"] + + def __init__( + self, + vocab_file, + do_lower_case=True, + do_basic_tokenize=True, + never_split=None, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + tokenize_chinese_chars=True, + strip_accents=None, + clean_up_tokenization_spaces=True, + **kwargs, + ): + if not os.path.isfile(vocab_file): + raise ValueError( + f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained" + " model use `tokenizer = DistilBertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`" + ) + self.vocab = load_vocab(vocab_file) + self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()]) + self.do_basic_tokenize = do_basic_tokenize + if do_basic_tokenize: + self.basic_tokenizer = BasicTokenizer( + do_lower_case=do_lower_case, + never_split=never_split, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + ) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token)) + + super().__init__( + do_lower_case=do_lower_case, + do_basic_tokenize=do_basic_tokenize, + never_split=never_split, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + **kwargs, + ) + + @property + # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.do_lower_case + def do_lower_case(self): + return self.basic_tokenizer.do_lower_case + + @property + # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.vocab_size + def vocab_size(self): + return len(self.vocab) + + # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.get_vocab + def get_vocab(self): + return dict(self.vocab, **self.added_tokens_encoder) + + # Copied from transformers.models.bert.tokenization_bert.BertTokenizer._tokenize + def _tokenize(self, text, split_special_tokens=False): + split_tokens = [] + if self.do_basic_tokenize: + for token in self.basic_tokenizer.tokenize( + text, never_split=self.all_special_tokens if not split_special_tokens else None + ): + # If the token is part of the never_split set + if token in self.basic_tokenizer.never_split: + split_tokens.append(token) + else: + split_tokens += self.wordpiece_tokenizer.tokenize(token) + else: + split_tokens = self.wordpiece_tokenizer.tokenize(text) + return split_tokens + + # Copied from transformers.models.bert.tokenization_bert.BertTokenizer._convert_token_to_id + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.vocab.get(token, self.vocab.get(self.unk_token)) + + # Copied from transformers.models.bert.tokenization_bert.BertTokenizer._convert_id_to_token + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.ids_to_tokens.get(index, self.unk_token) + + # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.convert_tokens_to_string + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single 
string.""" + out_string = " ".join(tokens).replace(" ##", "").strip() + return out_string + + # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.build_inputs_with_special_tokens + def build_inputs_with_special_tokens( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + token_ids_1 + sep + + # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.get_special_tokens_mask + def get_special_tokens_mask( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False + ) -> list[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + + # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.save_vocabulary + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + index = 0 + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + else: + vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory + with open(vocab_file, "w", encoding="utf-8") as writer: + for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning( + f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive." + " Please check that the vocabulary is not corrupted!" + ) + index = token_index + writer.write(token + "\n") + index += 1 + return (vocab_file,) + + +# Copied from transformers.models.bert.tokenization_bert.BasicTokenizer +class BasicTokenizer: + """ + Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.). + + Args: + do_lower_case (`bool`, *optional*, defaults to `True`): + Whether or not to lowercase the input when tokenizing. 
+ never_split (`Iterable`, *optional*): + Collection of tokens which will never be split during tokenization. Only has an effect when + `do_basic_tokenize=True` + tokenize_chinese_chars (`bool`, *optional*, defaults to `True`): + Whether or not to tokenize Chinese characters. + + This should likely be deactivated for Japanese (see this + [issue](https://github.com/huggingface/transformers/issues/328)). + strip_accents (`bool`, *optional*): + Whether or not to strip all accents. If this option is not specified, then it will be determined by the + value for `lowercase` (as in the original BERT). + do_split_on_punc (`bool`, *optional*, defaults to `True`): + In some instances we want to skip the basic punctuation splitting so that later tokenization can capture + the full context of the words, such as contractions. + """ + + def __init__( + self, + do_lower_case=True, + never_split=None, + tokenize_chinese_chars=True, + strip_accents=None, + do_split_on_punc=True, + ): + if never_split is None: + never_split = [] + self.do_lower_case = do_lower_case + self.never_split = set(never_split) + self.tokenize_chinese_chars = tokenize_chinese_chars + self.strip_accents = strip_accents + self.do_split_on_punc = do_split_on_punc + + def tokenize(self, text, never_split=None): + """ + Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer. + + Args: + never_split (`List[str]`, *optional*) + Kept for backward compatibility purposes. Now implemented directly at the base class level (see + [`PreTrainedTokenizer.tokenize`]) List of token not to split. + """ + # union() returns a new set by concatenating the two sets. + never_split = self.never_split.union(set(never_split)) if never_split else self.never_split + text = self._clean_text(text) + + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). 
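A rough illustration of what BasicTokenizer.tokenize below produces with its default settings; lowercasing, accent stripping, punctuation splitting and the whitespace padding around CJK characters all happen at this stage, before WordPiece ever runs:

from transformers.models.distilbert.tokenization_distilbert import BasicTokenizer

print(BasicTokenizer().tokenize("Héllo, 世界!"))   # ['hello', ',', '世', '界', '!']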
+ if self.tokenize_chinese_chars: + text = self._tokenize_chinese_chars(text) + # prevents treating the same character with different unicode codepoints as different characters + unicode_normalized_text = unicodedata.normalize("NFC", text) + orig_tokens = whitespace_tokenize(unicode_normalized_text) + split_tokens = [] + for token in orig_tokens: + if token not in never_split: + if self.do_lower_case: + token = token.lower() + if self.strip_accents is not False: + token = self._run_strip_accents(token) + elif self.strip_accents: + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token, never_split)) + + output_tokens = whitespace_tokenize(" ".join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text, never_split=None): + """Splits punctuation on a piece of text.""" + if not self.do_split_on_punc or (never_split is not None and text in never_split): + return [text] + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. + if ( + (cp >= 0x4E00 and cp <= 0x9FFF) + or (cp >= 0x3400 and cp <= 0x4DBF) + or (cp >= 0x20000 and cp <= 0x2A6DF) + or (cp >= 0x2A700 and cp <= 0x2B73F) + or (cp >= 0x2B740 and cp <= 0x2B81F) + or (cp >= 0x2B820 and cp <= 0x2CEAF) + or (cp >= 0xF900 and cp <= 0xFAFF) + or (cp >= 0x2F800 and cp <= 0x2FA1F) + ): + return True + + return False + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xFFFD or _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + + +# Copied from transformers.models.bert.tokenization_bert.WordpieceTokenizer +class WordpieceTokenizer: + """Runs WordPiece tokenization.""" + + def __init__(self, vocab, unk_token, max_input_chars_per_word=100): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, text): + """ + Tokenizes a piece of text into its word pieces. 
This uses a greedy longest-match-first algorithm to perform + tokenization using the given vocabulary. + + For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`. + + Args: + text: A single token or whitespace separated tokens. This should have + already been passed through *BasicTokenizer*. + + Returns: + A list of wordpiece tokens. + """ + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = "".join(chars[start:end]) + if start > 0: + substr = "##" + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens + + +__all__ = ["DistilBertTokenizer"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/distilbert/tokenization_distilbert_fast.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/distilbert/tokenization_distilbert_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..c174804dc530c863b14c3ab6e56c18117d3fa4c2 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/distilbert/tokenization_distilbert_fast.py @@ -0,0 +1,149 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for DistilBERT.""" + +import json +from typing import Optional + +from tokenizers import normalizers + +from ...tokenization_utils_fast import PreTrainedTokenizerFast +from ...utils import logging +from .tokenization_distilbert import DistilBertTokenizer + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"} + + +class DistilBertTokenizerFast(PreTrainedTokenizerFast): + r""" + Construct a "fast" DistilBERT tokenizer (backed by HuggingFace's *tokenizers* library). Based on WordPiece. + + This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should + refer to this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + File containing the vocabulary. + do_lower_case (`bool`, *optional*, defaults to `True`): + Whether or not to lowercase the input when tokenizing. + unk_token (`str`, *optional*, defaults to `"[UNK]"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + sep_token (`str`, *optional*, defaults to `"[SEP]"`): + The separator token, which is used when building a sequence from multiple sequences, e.g. 
two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + pad_token (`str`, *optional*, defaults to `"[PAD]"`): + The token used for padding, for example when batching sequences of different lengths. + cls_token (`str`, *optional*, defaults to `"[CLS]"`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + mask_token (`str`, *optional*, defaults to `"[MASK]"`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + clean_text (`bool`, *optional*, defaults to `True`): + Whether or not to clean the text before tokenization by removing any control characters and replacing all + whitespaces by the classic one. + tokenize_chinese_chars (`bool`, *optional*, defaults to `True`): + Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see [this + issue](https://github.com/huggingface/transformers/issues/328)). + strip_accents (`bool`, *optional*): + Whether or not to strip all accents. If this option is not specified, then it will be determined by the + value for `lowercase` (as in the original BERT). + wordpieces_prefix (`str`, *optional*, defaults to `"##"`): + The prefix for subwords. + """ + + vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "attention_mask"] + slow_tokenizer_class = DistilBertTokenizer + + def __init__( + self, + vocab_file=None, + tokenizer_file=None, + do_lower_case=True, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + tokenize_chinese_chars=True, + strip_accents=None, + **kwargs, + ): + super().__init__( + vocab_file, + tokenizer_file=tokenizer_file, + do_lower_case=do_lower_case, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + **kwargs, + ) + + normalizer_state = json.loads(self.backend_tokenizer.normalizer.__getstate__()) + if ( + normalizer_state.get("lowercase", do_lower_case) != do_lower_case + or normalizer_state.get("strip_accents", strip_accents) != strip_accents + or normalizer_state.get("handle_chinese_chars", tokenize_chinese_chars) != tokenize_chinese_chars + ): + normalizer_class = getattr(normalizers, normalizer_state.pop("type")) + normalizer_state["lowercase"] = do_lower_case + normalizer_state["strip_accents"] = strip_accents + normalizer_state["handle_chinese_chars"] = tokenize_chinese_chars + self.backend_tokenizer.normalizer = normalizer_class(**normalizer_state) + + self.do_lower_case = do_lower_case + + # Copied from transformers.models.bert.tokenization_bert_fast.BertTokenizerFast.build_inputs_with_special_tokens + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. 
+ token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + + if token_ids_1 is not None: + output += token_ids_1 + [self.sep_token_id] + + return output + + # Copied from transformers.models.bert.tokenization_bert_fast.BertTokenizerFast.save_vocabulary + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + files = self._tokenizer.model.save(save_directory, name=filename_prefix) + return tuple(files) + + +__all__ = ["DistilBertTokenizerFast"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/doge/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/doge/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..91aeca0340f162439b6b7485f325f5343802dd6e --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/doge/__init__.py @@ -0,0 +1,28 @@ +# coding=utf-8 +# Copyright 2025 Jingze Shi and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_doge import * + from .modeling_doge import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/doge/configuration_doge.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/doge/configuration_doge.py new file mode 100644 index 0000000000000000000000000000000000000000..f3a93fa198f2073ca0e34e613a7d9bed01f892d0 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/doge/configuration_doge.py @@ -0,0 +1,241 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/doge/modular_doge.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_doge.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2025 Jingze Shi and the HuggingFace Inc. team. All rights reserved. +# +# The Doge family of small language models is trained by SmallDoge Team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from ...configuration_utils import PretrainedConfig +from ...modeling_rope_utils import rope_config_validation + + +class DogeConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`DogeModel`]. It is used to instantiate an Doge + model according to the specified arguments, defining the model architecture like [SmallDoge/Doge-320M](https://huggingface.co/SmallDoge/Doge-320M). + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 32768): + Vocabulary size of the Doge2 model. Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling [`DogeModel`] + hidden_size (`int`, *optional*, defaults to 1024): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 2048): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer decoder. + hidden_dropout (`float`, *optional*, defaults to 0.0): + Dropout probability for each sequence transformation and state transformation module. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether the model's input and output word embeddings should be tied. + max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with. + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + rope_scaling (`Dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE embeddings. + NOTE: if you apply new rope type and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value accordingly. + Doge family of small models use `{ 'rope_type': 'dynamic', 'factor': 4.0, 'original_max_position_embeddings': 2048 }` as the default value. + Expected contents: + `rope_type` (`str`): + The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', 'llama3'], with 'default' being the original RoPE implementation. + `factor` (`float`, *optional*): + Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. 
+ In most scaling types, a `factor` of x will enable the model to handle sequences of length x * original maximum pre-trained length. + `original_max_position_embeddings` (`int`, *optional*): + Used with 'dynamic', 'longrope' and 'llama3'. + The original max position embeddings used during pretraining. + `attention_factor` (`float`, *optional*): + Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention + computation. + If unspecified, it defaults to value recommended by the implementation, using the `factor` field to infer the suggested value. + `beta_fast` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear + ramp function. If unspecified, it defaults to 32. + `beta_slow` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear + ramp function. If unspecified, it defaults to 1. + `short_factor` (`List[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to short contexts (<`original_max_position_embeddings`). + Must be a list of numbers with the same length as the hidden size divided by the number of attention heads divided by 2 + `long_factor` (`List[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to long contexts (<`original_max_position_embeddings`). + Must be a list of numbers with the same length as the hidden size divided by the number of attention heads divided by 2 + `low_freq_factor` (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE + `high_freq_factor` (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + num_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer decoder. + num_key_value_heads (`int`, *optional*): + This is the number of key_value heads that should be used to implement Grouped Query Attention. + If `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. + When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed by meanpooling all the original heads within that group. + For more details checkout [this paper](https://huggingface.co/papers/2305.13245). + If it is not specified, will default to `num_attention_heads`. + attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): + Whether to use a bias in the query, key, value and output projection layers during self-attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + mlp_bias (`bool`, *optional*, defaults to `False`): + Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers. + sliding_window (`int`, *optional*): + Sliding window attention window size. If not specified, will default to `None`. + keep_window_size (`int`, *optional*, defaults to 2048): + The window size of tokens that are not dynamically masked, and dynamic masking is only performed when the sequence length exceeds this value. + is_moe (`bool`, *optional*, defaults to `False`): + Whether to use the Cross Domain Mixture of Experts, if `True`, the MoE will inherit the MLP to initialize. 
+ num_experts (`int`, *optional*, defaults to 16384): + Number of routed experts in the model. This is only used when `is_moe=True`. + num_experts_per_tok (`int`, *optional*, defaults to 64): + Number of selected experts to route per-token. + norm_topk_prob (`bool`, *optional*, defaults to `False`): + Whether to normalize the topk probabilities. + output_router_logits (`bool`, *optional*, defaults to `False`): + Whether or not the router logits should be returned by the model. Enabling this will also + allow the model to output the auxiliary loss, including load balancing loss and router z-loss. + router_aux_loss_coef (`float`, *optional*, defaults to 0.001): + The aux loss factor for the total loss. + + ```python + >>> from transformers import DogeConfig, DogeModel + + >>> # Initializing a Doge-320M style configuration + >>> configuration = DogeConfig() + + >>> # Initializing a model from the Doge-320M style configuration + >>> model = DogeModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "doge" + keys_to_ignore_at_inference = ["past_key_values"] + # Default tensor parallel plan for base model `DogeModel` + base_model_tp_plan = { + "layers.*.self_attn.q_proj": "colwise", + "layers.*.self_attn.k_proj": "colwise", + "layers.*.self_attn.v_proj": "colwise", + "layers.*.self_attn.dt_proj": "rowwise", + "layers.*.self_attn.o_proj": "rowwise", + "layers.*.input_layernorm.weight": "sequence_parallel", + "layers.*.input_residual.weight": "sequence_parallel", + "layers.*.post_attention_layernorm.weight": "sequence_parallel", + "layers.*.post_attention_residual.weight": "sequence_parallel", + "norm.weight": "sequence_parallel", + "layers.*.mlp.gate_proj": "colwise", + "layers.*.mlp.up_proj": "colwise", + "layers.*.mlp.down_proj": "rowwise", + "layers.*.mlp.router_gate": "colwise_rep", + "layers.*.mlp.down_embed": "rowwise_rep", + "layers.*.mlp.up_embed": "rowwise_rep", + } + base_model_pp_plan = { + "embed_tokens": (["input_ids"], ["inputs_embeds"]), + "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), + "norm": (["hidden_states"], ["hidden_states"]), + } + + def __init__( + self, + vocab_size=32768, + hidden_size=1024, + intermediate_size=2048, + num_hidden_layers=32, + hidden_dropout=0.0, + hidden_act="silu", + initializer_range=0.02, + rms_norm_eps=1e-06, + use_cache=True, + tie_word_embeddings=False, + max_position_embeddings=2048, + rope_theta=10000.0, + rope_scaling=None, + num_attention_heads=8, + num_key_value_heads=None, + attention_bias=False, + attention_dropout=0.0, + mlp_bias=False, + sliding_window=None, + keep_window_size=2048, + is_moe=False, + num_experts=16384, + num_experts_per_tok=64, + norm_topk_prob=False, + output_router_logits=False, + router_aux_loss_coef=0.001, + **kwargs, + ): + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + + self.hidden_dropout = hidden_dropout + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + + self.max_position_embeddings = max_position_embeddings + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self.num_attention_heads = num_attention_heads + self.num_key_value_heads = num_key_value_heads + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + self.mlp_bias = mlp_bias + self.sliding_window = sliding_window + 
self.keep_window_size = keep_window_size + self.is_moe = is_moe + self.num_experts = num_experts + self.num_experts_per_tok = num_experts_per_tok + self.norm_topk_prob = norm_topk_prob + self.output_router_logits = output_router_logits + self.router_aux_loss_coef = router_aux_loss_coef + + # Validate the correctness of rotary position embeddings parameters + # BC: if there is a 'type' field, copy it it to 'rope_type'. + if self.rope_scaling is not None and "type" in self.rope_scaling: + self.rope_scaling["rope_type"] = self.rope_scaling["type"] + rope_config_validation(self) + + # for backward compatibility + if num_key_value_heads is None: + self.num_key_value_heads = num_attention_heads + + super().__init__( + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + +__all__ = ["DogeConfig"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/doge/modeling_doge.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/doge/modeling_doge.py new file mode 100644 index 0000000000000000000000000000000000000000..5822cad62017c8e390806afb3bb67101e4568837 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/doge/modeling_doge.py @@ -0,0 +1,812 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/doge/modular_doge.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_doge.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2025 Jingze Shi and the HuggingFace Inc. team. All rights reserved. +# +# The Doge family of small language models is trained by SmallDoge Team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
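The `DogeConfig` docstring above describes the dynamic-RoPE `rope_scaling` dictionary, grouped-query attention via `num_key_value_heads`, and the Cross Domain MoE switches, but its usage example only builds a default config. The following is a minimal sketch of wiring those options together; it assumes this vendored transformers build exposes `DogeConfig`/`DogeModel` at the top level (as the docstring example does), and every concrete value (sizes, expert counts) is illustrative rather than a recommended setting.

```python
# Hedged sketch, not part of the patched file: a small Doge config using the
# documented rope_scaling dict, GQA, and the CDMoE flags.
from transformers import DogeConfig, DogeModel  # assumed available in this install

config = DogeConfig(
    hidden_size=256,                # deliberately tiny for a smoke test
    intermediate_size=512,
    num_hidden_layers=4,
    num_attention_heads=8,
    num_key_value_heads=4,          # GQA: 8 query heads share 4 key/value heads
    max_position_embeddings=2048,
    rope_scaling={                  # the default dict the docstring mentions for Doge
        "rope_type": "dynamic",
        "factor": 4.0,
        "original_max_position_embeddings": 2048,
    },
    is_moe=True,                    # every decoder layer uses DogeCDMoE instead of DogeMLP
    num_experts=1024,               # num_keys = floor(sqrt(num_experts)) = 32
    num_experts_per_tok=8,
    output_router_logits=True,      # required to get the load-balancing aux loss
)
model = DogeModel(config)
print(config.rope_scaling, config.num_key_value_heads)
```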
+ +import math +from typing import Callable, Optional, Union + +import torch +import torch.nn.functional as F +from torch import nn + +from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache +from ...generation import GenerationMixin +from ...integrations import use_kernel_forward_from_hub +from ...integrations.flex_attention import compile_friendly_flex_attention +from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask +from ...modeling_layers import GenericForSequenceClassification, GradientCheckpointingLayer +from ...modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast +from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update +from ...modeling_utils import AttentionInterface, PreTrainedModel +from ...processing_utils import Unpack +from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torch_flex_attn_available +from ...utils.deprecation import deprecate_kwarg +from ...utils.generic import OutputRecorder, check_model_inputs +from .configuration_doge import DogeConfig + + +if is_torch_flex_attn_available(): + from torch.nn.attention.flex_attention import BlockMask + + +@use_kernel_forward_from_hub("RMSNorm") +class DogeRMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + DogeRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + def extra_repr(self): + return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" + + +class DogeRotaryEmbedding(nn.Module): + inv_freq: torch.Tensor # fix linting for `register_buffer` + + def __init__(self, config: DogeConfig, device=None): + super().__init__() + # BC: "rope_type" was originally "type" + if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): + self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) + else: + self.rope_type = "default" + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + + inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = self.inv_freq + + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) + def forward(self, x, position_ids): + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) + position_ids_expanded = position_ids[:, None, :].float() + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() * self.attention_scaling + sin = emb.sin() * self.attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). 
The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + dropout: float = 0.0, + **kwargs: Unpack[TransformersKwargs], +): + key_states = repeat_kv(key, module.num_key_value_groups) + value_states = repeat_kv(value, module.num_key_value_groups) + + attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling + if attention_mask is not None: + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + attn_output = torch.matmul(attn_weights, value_states) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +def flex_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Union[torch.Tensor, "BlockMask"], + scaling: Optional[float] = None, + softcap: Optional[float] = None, + head_mask: Optional[torch.Tensor] = None, + **kwargs, +) -> tuple[torch.Tensor, torch.Tensor]: + block_mask = None + causal_mask = None + if isinstance(attention_mask, BlockMask): + block_mask = attention_mask + else: + causal_mask = attention_mask + + if causal_mask is not None: + causal_mask = causal_mask[:, :, :, : key.shape[-2]] + + def score_mod(score, batch_idx, head_idx, q_idx, kv_idx): + if softcap is not None: + score = softcap * torch.tanh(score / softcap) + if causal_mask is not None: + score = score + causal_mask[batch_idx][head_idx][q_idx][kv_idx] + if head_mask is not None: + score = score + head_mask[batch_idx][head_idx][0][0] + return score + + attn_output, attention_weights = compile_friendly_flex_attention( + query, + key, + value, + score_mod=score_mod, + block_mask=block_mask, + enable_gqa=True, + scale=scaling, + # Last time checked on PyTorch == 2.5.1: Flex Attention always computes the lse regardless. + # For simplification, we thus always return it as no additional computations are introduced. 
+ return_lse=True, + ) + # lse is returned in float32 + attention_weights = attention_weights.to(value.dtype) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attention_weights + + +ALL_ATTENTION_FUNCTIONS = AttentionInterface() +ALL_ATTENTION_FUNCTIONS["doge_flex_attention"] = flex_attention_forward + + +class DogeAttention(nn.Module): + def __init__(self, config: DogeConfig, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) + self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads + self.scaling = self.head_dim**-0.5 + self.attention_dropout = config.attention_dropout + self.keep_window_size = config.keep_window_size + + self.q_proj = nn.Linear( + config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias + ) + self.k_proj = nn.Linear( + config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias + ) + self.v_proj = nn.Linear( + config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias + ) + # dynamic mask for the QK^T attention weights matrix + self.A = nn.Parameter(torch.zeros(config.num_key_value_heads)) + self.dt_proj = nn.Linear( + config.num_key_value_heads * self.head_dim, config.num_key_value_heads, bias=config.attention_bias + ) + self.o_proj = nn.Linear( + config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias + ) + self.q_norm = DogeRMSNorm(self.head_dim, eps=config.rms_norm_eps) + self.k_norm = DogeRMSNorm(self.head_dim, eps=config.rms_norm_eps) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, self.head_dim) + + query_states = self.q_norm(self.q_proj(hidden_states).view(hidden_shape)).transpose(1, 2) + key_states = self.k_norm(self.k_proj(hidden_states).view(hidden_shape)).transpose(1, 2) + value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_values is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # calculate dynamic mask from value_states + dt_states = self.dt_proj( + value_states.transpose(1, 2).reshape(value_states.shape[0], value_states.shape[-2], -1) + ) + dt_states = torch.exp(self.A * F.softplus(dt_states)).transpose(-1, -2) + attn_mask = self.prepare_dynamic_mask( + hidden_states=hidden_states, + dt_states=dt_states, + keep_window_size=self.keep_window_size, + attention_mask=attention_mask, + ) + attn_mask = repeat_kv(attn_mask, self.num_key_value_groups) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + 
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask=attn_mask, + dropout=0.0 if not self.training else self.attention_dropout, + scaling=self.scaling, + **kwargs, + ) + + attn_output = attn_output.reshape(*input_shape, -1).contiguous() + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights + + def prepare_dynamic_mask( + self, + hidden_states: torch.Tensor, + dt_states: torch.Tensor, + keep_window_size: int = 2048, + attention_mask: Optional[torch.Tensor] = None, + ): + """ + The core idea of DMA is to calculate the dynamic attention mask to mask the tokens that should be masked, so as to form sparse attention. + + Combine `dt_states` with `attention_mask` to generate the final `attn_mask`. + + Args: + hidden_states (`torch.Tensor`): The input hidden_states, used to determine the minimum value of the current input precision. + dt_states (`torch.Tensor`): dt_states of shape `(batch_size, num_heads, key_sequence_length)`. + keep_window_size (`int`): The window size of tokens that are not dynamically masked, and dynamic masking is only performed when the sequence length exceeds this value. + attention_mask (`torch.Tensor`, *optional*): attention mask of shape `(batch_size, 1, query_sequence_length, key_sequence_length)`. + """ + min_dtype = torch.finfo(hidden_states.dtype).min + dtype = hidden_states.dtype + attn_mask = dt_states[:, :, None, :].expand( + -1, -1, hidden_states.shape[1], -1 + ) # [batch_size, num_heads, query_len, key_len] + if attention_mask is not None and not isinstance(attention_mask, BlockMask): + if attention_mask.dtype == torch.bool: + dtype = hidden_states.dtype + attention_mask = torch.where( + attention_mask, torch.tensor(0.0, device=attention_mask.device, dtype=dtype), min_dtype + ) + attn_mask = attn_mask.masked_fill(attention_mask[:, :, :, : attn_mask.shape[-1]] != 0, min_dtype) + if attn_mask.shape[-1] > keep_window_size: + active_mask = torch.zeros_like(attn_mask, dtype=dtype, device=attn_mask.device) + topk_indices = torch.topk(attn_mask, keep_window_size, dim=-1, largest=True, sorted=False).indices + active_mask = active_mask.scatter(-1, topk_indices, 1.0) + attn_mask = attn_mask.masked_fill(active_mask == 0.0, min_dtype) + return attn_mask + + +class DogeMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + return down_proj + + +class DogeCDMoE(nn.Module): + def __init__(self, config: DogeConfig): + super().__init__() + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.act_fn = ACT2FN[config.hidden_act] + + self.num_experts = config.num_experts + self.num_keys = math.floor(math.sqrt(self.num_experts)) + self.top_k = config.num_experts_per_tok + self.norm_topk_prob = config.norm_topk_prob + + # shared expert + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) + 
self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias) + + # router gate for retrieval experts + self.router_gate = nn.Linear(self.hidden_size, self.num_keys * 2, bias=False) + + # routed experts + self.down_embed = nn.Embedding(self.num_experts, self.hidden_size) + self.up_embed = nn.Embedding(self.num_experts, self.hidden_size) + + def forward( + self, + hidden_states: torch.Tensor, + **kwargs, + ) -> torch.Tensor: + bsz, seq_len, _ = hidden_states.shape + + # get routing logits with router gate + router_logits = self.router_gate(hidden_states).view(2, bsz * seq_len, -1) + + # get experts with the highest routing logits + (scores_x, scores_y), (indices_x, indices_y) = router_logits.topk(self.num_keys, dim=-1) + all_scores = scores_x.unsqueeze(-1) + scores_y.unsqueeze(-2) + all_indices = indices_x.unsqueeze(-1) * self.num_keys + indices_y.unsqueeze(-2) + all_scores = all_scores.view(*all_scores.shape[:-2], -1) + all_indices = all_indices.view(*all_indices.shape[:-2], -1) + scores, position_indices = all_scores.topk(self.top_k, dim=-1) + indices = all_indices.gather(-1, position_indices) + routing_weights = F.softmax(scores, dim=-1) + if self.norm_topk_prob: + routing_weights /= routing_weights.sum(dim=-1, keepdim=True) + + # mix routed experts states with shared expert states + down_embed = self.down_embed(indices) + up_embed = self.up_embed(indices) + experts_weights = torch.matmul(down_embed, hidden_states.view(bsz * seq_len, -1, 1)).view(bsz * seq_len, -1) + experts_weights = self.act_fn(experts_weights) * routing_weights + experts_states = torch.matmul(experts_weights.view(bsz * seq_len, 1, -1), up_embed).view(bsz, seq_len, -1) + hidden_states = self.down_proj(self.act_fn(self.gate_proj(hidden_states)) * self.up_proj(hidden_states)) + hidden_states = hidden_states + experts_states + return hidden_states, router_logits + + +class DogeDecoderLayer(GradientCheckpointingLayer): + def __init__(self, config: DogeConfig, layer_idx: Optional[int] = None): + super().__init__() + self.hidden_dropout = config.hidden_dropout + + self.input_layernorm = DogeRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.self_attn = DogeAttention(config=config, layer_idx=layer_idx) + self.input_residual = nn.Parameter(torch.ones(config.hidden_size)) + + self.post_attention_layernorm = DogeRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.mlp = DogeMLP(config) if not config.is_moe else DogeCDMoE(config) + self.post_attention_residual = nn.Parameter(torch.ones(config.hidden_size)) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: + # sequence transformation + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + hidden_states, self_attn_weights = self.self_attn( + hidden_states=hidden_states, + position_embeddings=position_embeddings, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + 
use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + hidden_states = F.dropout(hidden_states, p=self.hidden_dropout, training=self.training) + hidden_states = self.input_residual * residual + hidden_states + + # state transformation + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = F.dropout(hidden_states, p=self.hidden_dropout, training=self.training) + hidden_states = self.post_attention_residual * residual + hidden_states + + return hidden_states + + +@auto_docstring +class DogePreTrainedModel(PreTrainedModel): + config: DogeConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["DogeDecoderLayer"] + _skip_keys_device_placement = ["past_key_values"] + _supports_flash_attn = False + _supports_sdpa = True + _supports_flex_attn = True + _can_compile_fullgraph = False + _supports_attention_backend = True + _can_record_outputs = { + "router_logits": OutputRecorder(DogeCDMoE, index=1), + "hidden_states": DogeDecoderLayer, + "attentions": DogeAttention, + } + + def _init_weights(self, module): + """Initialize the weights""" + super()._init_weights(module) + if isinstance(module, DogeAttention): + if hasattr(module, "A"): + module.A.data.zero_() + elif isinstance(module, DogeDecoderLayer): + if hasattr(module, "input_residual"): + module.input_residual.data.fill_(1.0) + if hasattr(module, "post_attention_residual"): + module.post_attention_residual.data.fill_(1.0) + + +@auto_docstring +class DogeModel(DogePreTrainedModel): + def __init__(self, config: DogeConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList( + [DogeDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.norm = DogeRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.rotary_emb = DogeRotaryEmbedding(config=config) + self.gradient_checkpointing = False + + # Initialize weights and apply final processing + self.post_init() + + @check_model_inputs + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> MoeModelOutputWithPast: + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if use_cache and past_key_values is None: + past_key_values = DynamicCache(config=self.config) + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if cache_position is None: + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + cache_position = torch.arange( + past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device + ) + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + mask_function = create_causal_mask if self.config.sliding_window is None else create_sliding_window_causal_mask + causal_mask = mask_function( + config=self.config, + input_embeds=inputs_embeds, + attention_mask=attention_mask, + 
cache_position=cache_position, + past_key_values=past_key_values, + position_ids=position_ids, + ) + + hidden_states = inputs_embeds + + # create position embeddings to be shared across the decoder layers + position_embeddings = self.rotary_emb(hidden_states, position_ids) + + for decoder_layer in self.layers[: self.config.num_hidden_layers]: + hidden_states = decoder_layer( + hidden_states, + position_embeddings=position_embeddings, + attention_mask=causal_mask, + position_ids=position_ids, + past_key_values=past_key_values, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = self.norm(hidden_states) + + return MoeModelOutputWithPast( # only diff with Mistral is the output type, we need MoE + last_hidden_state=hidden_states, + past_key_values=past_key_values, + ) + + +def load_balancing_loss_func( + gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None], + num_experts: Optional[int] = None, + num_keys: Optional[int] = None, + top_k: int = 2, + attention_mask: Optional[torch.Tensor] = None, +) -> Union[torch.Tensor, int]: + r""" + Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch. + + See Switch Transformer (https://huggingface.co/papers/2101.03961) for more details. This function implements the loss + function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between + experts is too unbalanced. + + Args: + gate_logits: + Logits from the `router_gate`, should be a tuple of model.config.num_hidden_layers tensors of + shape [2, batch_size * sequence_length, num_keys]. + num_experts: + Number of experts + num_keys: + Number of keys + top_k: + The number of experts to route per-token, can be also interpreted as the `top-k` routing + parameter. + attention_mask (`torch.Tensor`, *optional*): + The attention_mask used in forward function + shape [batch_size X sequence_length] if not None. + + Returns: + The auxiliary loss. 
+ """ + if gate_logits is None or not isinstance(gate_logits, tuple): + return 0 + + compute_dtype = gate_logits[0].dtype + compute_device = gate_logits[0].device + all_expert_indices = [] + all_routing_weights = [] + + for layer_gate_logits in gate_logits: + layer_gate_logits = layer_gate_logits.to(compute_device) + + (scores_x, scores_y), (indices_x, indices_y) = layer_gate_logits.topk(num_keys, dim=-1) + + all_scores = scores_x.unsqueeze(-1) + scores_y.unsqueeze(-2) + all_indices = indices_x.unsqueeze(-1) * num_keys + indices_y.unsqueeze(-2) + all_scores = all_scores.view(*all_scores.shape[:-2], -1) + all_indices = all_indices.view(*all_indices.shape[:-2], -1) + + _, position_indices = all_scores.topk(top_k, dim=-1) + expert_indices = all_indices.gather(-1, position_indices) + + routing_weights = F.softmax(all_scores, dim=-1) + + all_expert_indices.append(expert_indices) + all_routing_weights.append(routing_weights) + all_expert_indices = torch.cat(all_expert_indices, dim=0) + all_routing_weights = torch.cat(all_routing_weights, dim=0) + + if attention_mask is None: + # Compute the percentage of tokens routed to each experts + all_expert_indices = all_expert_indices.view(-1) + tokens_per_expert = torch.zeros(num_experts, dtype=compute_dtype, device=compute_device) + pad = torch.ones_like(all_expert_indices, dtype=compute_dtype, device=compute_device) + tokens_per_expert = tokens_per_expert.scatter_add_(0, all_expert_indices, pad) / all_expert_indices.shape[0] + + # Compute the average probability of routing to these experts + router_prob_per_expert = torch.mean(all_routing_weights, dim=0) + else: + batch_size, sequence_length = attention_mask.shape + num_hidden_layers = len(gate_logits) + + # Compute the mask that masks all padding tokens as 0 with the same shape of expert_mask + expert_attention_mask = ( + attention_mask[None, :, :, None] + .expand((num_hidden_layers, batch_size, sequence_length, top_k)) + .reshape(-1) + .to(compute_device) + ) + all_expert_indices = all_expert_indices.view(-1)[expert_attention_mask.bool()] + + # Compute the percentage of tokens routed to each experts + tokens_per_expert = torch.zeros(num_experts, dtype=compute_dtype, device=compute_device) + pad = torch.ones_like(all_expert_indices, dtype=compute_dtype, device=compute_device) + tokens_per_expert = tokens_per_expert.scatter_add_(0, all_expert_indices, pad) / torch.sum( + expert_attention_mask + ) + + # Compute the mask that masks all padding tokens as 0 with the same shape of tokens_per_expert + router_per_expert_attention_mask = ( + attention_mask[None, :, :, None] + .expand((num_hidden_layers, batch_size, sequence_length, num_experts)) + .reshape(-1, num_experts) + .to(compute_device) + ) + + # Compute the average probability of routing to these experts + router_prob_per_expert = torch.sum(all_routing_weights * router_per_expert_attention_mask, dim=0) / torch.sum( + router_per_expert_attention_mask, dim=0 + ) + + overall_loss = torch.sum(tokens_per_expert * router_prob_per_expert) + return overall_loss * num_experts + + +@auto_docstring +class DogeForCausalLM(DogePreTrainedModel, GenerationMixin): + _tied_weights_keys = ["lm_head.weight"] + _tp_plan = {"lm_head": "colwise_rep"} + _pp_plan = {"lm_head": (["hidden_states"], ["logits"])} + + def __init__(self, config): + super().__init__(config) + self.model = DogeModel(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + self.router_aux_loss_coef = config.router_aux_loss_coef + 
self.num_experts = config.num_experts + self.num_experts_per_tok = config.num_experts_per_tok + + # Initialize weights and apply final processing + self.post_init() + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + output_router_logits: Optional[bool] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> MoeCausalLMOutputWithPast: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Example: + + ```python + >>> from transformers import AutoTokenizer, DogeForCausalLM + + >>> model = DogeForCausalLM.from_pretrained("SmallDoge/Doge-320M") + >>> tokenizer = AutoTokenizer.from_pretrained("SmallDoge/Doge-320M") + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." 
+ ```""" + output_router_logits = ( + output_router_logits if output_router_logits is not None else self.config.output_router_logits + ) + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs: MoeModelOutputWithPast = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs.last_hidden_state + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) + + loss = None + if labels is not None: + loss = self.loss_function(logits, labels, self.vocab_size, **kwargs) + + aux_loss = None + if output_router_logits: + aux_loss = load_balancing_loss_func( + outputs.router_logits, + self.num_experts, + math.floor(math.sqrt(self.num_experts)), + self.num_experts_per_tok, + attention_mask, + ) + if labels is not None: + loss += self.router_aux_loss_coef * aux_loss.to(loss.device) # make sure to reside in the same device + + return MoeCausalLMOutputWithPast( + loss=loss, + aux_loss=aux_loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + router_logits=outputs.router_logits, + ) + + +class DogeForSequenceClassification(GenericForSequenceClassification, DogePreTrainedModel): + pass + + +__all__ = ["DogeForCausalLM", "DogeModel", "DogePreTrainedModel", "DogeForSequenceClassification"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/doge/modular_doge.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/doge/modular_doge.py new file mode 100644 index 0000000000000000000000000000000000000000..c4c95e627376982b3042465624a618bdf1278b87 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/doge/modular_doge.py @@ -0,0 +1,800 @@ +# coding=utf-8 +# Copyright 2025 Jingze Shi and the HuggingFace Inc. team. All rights reserved. +# +# The Doge family of small language models is trained by SmallDoge Team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
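`load_balancing_loss_func` above implements the Switch Transformer auxiliary loss on top of Doge's product-key router (it first recombines the x/y key scores before taking the top-k). The toy sketch below strips away that product-key bookkeeping and shows the underlying quantity on a plain per-token router: the fraction of tokens dispatched to each expert times that expert's mean routing probability, summed and scaled by the number of experts. All shapes and names here are hypothetical and only meant to illustrate the balancing term.

```python
# Hedged toy example, not part of the patched file: Switch-style balancing loss.
# Perfectly uniform routing gives exactly 1.0; collapsed routing gives a larger value.
import torch
import torch.nn.functional as F

num_experts, top_k, num_tokens = 4, 1, 8
router_logits = torch.randn(num_tokens, num_experts)        # stand-in for router outputs
routing_probs = F.softmax(router_logits, dim=-1)
expert_indices = routing_probs.topk(top_k, dim=-1).indices  # chosen expert(s) per token

# f_i: fraction of token slots dispatched to expert i
tokens_per_expert = torch.zeros(num_experts).scatter_add_(
    0, expert_indices.reshape(-1), torch.ones(num_tokens * top_k)
) / (num_tokens * top_k)

# P_i: mean router probability assigned to expert i
router_prob_per_expert = routing_probs.mean(dim=0)

aux_loss = num_experts * torch.sum(tokens_per_expert * router_prob_per_expert)
print(aux_loss)  # ~1.0 when balanced, grows as routing concentrates on few experts
```

In `DogeForCausalLM.forward` this term is scaled by `router_aux_loss_coef` and added to the language-modeling loss only when labels are provided, mirroring the code above.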
+"""PyTorch Doge model.""" + +import math +from typing import Callable, Optional, Union + +import torch +import torch.nn.functional as F +from torch import nn + +from ...activations import ACT2FN +from ...cache_utils import Cache +from ...configuration_utils import PretrainedConfig +from ...integrations.flex_attention import compile_friendly_flex_attention +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast +from ...modeling_rope_utils import rope_config_validation +from ...modeling_utils import AttentionInterface, PreTrainedModel +from ...processing_utils import Unpack +from ...utils import TransformersKwargs, is_torch_flex_attn_available +from ...utils.deprecation import deprecate_kwarg +from ...utils.generic import OutputRecorder +from ..llama.modeling_llama import ( + LlamaForSequenceClassification, + LlamaMLP, + LlamaPreTrainedModel, + LlamaRMSNorm, + LlamaRotaryEmbedding, + apply_rotary_pos_emb, + eager_attention_forward, + repeat_kv, +) +from ..mixtral.modeling_mixtral import MixtralForCausalLM, MixtralModel + + +if is_torch_flex_attn_available(): + from torch.nn.attention.flex_attention import BlockMask + + +class DogeConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`DogeModel`]. It is used to instantiate an Doge + model according to the specified arguments, defining the model architecture like [SmallDoge/Doge-320M](https://huggingface.co/SmallDoge/Doge-320M). + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 32768): + Vocabulary size of the Doge2 model. Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling [`DogeModel`] + hidden_size (`int`, *optional*, defaults to 1024): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 2048): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer decoder. + hidden_dropout (`float`, *optional*, defaults to 0.0): + Dropout probability for each sequence transformation and state transformation module. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether the model's input and output word embeddings should be tied. + max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with. + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + rope_scaling (`Dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE embeddings. 
+ NOTE: if you apply new rope type and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value accordingly. + Doge family of small models use `{ 'rope_type': 'dynamic', 'factor': 4.0, 'original_max_position_embeddings': 2048 }` as the default value. + Expected contents: + `rope_type` (`str`): + The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', 'llama3'], with 'default' being the original RoPE implementation. + `factor` (`float`, *optional*): + Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. + In most scaling types, a `factor` of x will enable the model to handle sequences of length x * original maximum pre-trained length. + `original_max_position_embeddings` (`int`, *optional*): + Used with 'dynamic', 'longrope' and 'llama3'. + The original max position embeddings used during pretraining. + `attention_factor` (`float`, *optional*): + Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention + computation. + If unspecified, it defaults to value recommended by the implementation, using the `factor` field to infer the suggested value. + `beta_fast` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear + ramp function. If unspecified, it defaults to 32. + `beta_slow` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear + ramp function. If unspecified, it defaults to 1. + `short_factor` (`List[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to short contexts (<`original_max_position_embeddings`). + Must be a list of numbers with the same length as the hidden size divided by the number of attention heads divided by 2 + `long_factor` (`List[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to long contexts (<`original_max_position_embeddings`). + Must be a list of numbers with the same length as the hidden size divided by the number of attention heads divided by 2 + `low_freq_factor` (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE + `high_freq_factor` (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + num_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer decoder. + num_key_value_heads (`int`, *optional*): + This is the number of key_value heads that should be used to implement Grouped Query Attention. + If `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. + When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed by meanpooling all the original heads within that group. + For more details checkout [this paper](https://huggingface.co/papers/2305.13245). + If it is not specified, will default to `num_attention_heads`. + attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): + Whether to use a bias in the query, key, value and output projection layers during self-attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. 
+ mlp_bias (`bool`, *optional*, defaults to `False`): + Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers. + sliding_window (`int`, *optional*): + Sliding window attention window size. If not specified, will default to `None`. + keep_window_size (`int`, *optional*, defaults to 2048): + The window size of tokens that are not dynamically masked, and dynamic masking is only performed when the sequence length exceeds this value. + is_moe (`bool`, *optional*, defaults to `False`): + Whether to use the Cross Domain Mixture of Experts, if `True`, the MoE will inherit the MLP to initialize. + num_experts (`int`, *optional*, defaults to 16384): + Number of routed experts in the model. This is only used when `is_moe=True`. + num_experts_per_tok (`int`, *optional*, defaults to 64): + Number of selected experts to route per-token. + norm_topk_prob (`bool`, *optional*, defaults to `False`): + Whether to normalize the topk probabilities. + output_router_logits (`bool`, *optional*, defaults to `False`): + Whether or not the router logits should be returned by the model. Enabling this will also + allow the model to output the auxiliary loss, including load balancing loss and router z-loss. + router_aux_loss_coef (`float`, *optional*, defaults to 0.001): + The aux loss factor for the total loss. + + ```python + >>> from transformers import DogeConfig, DogeModel + + >>> # Initializing a Doge-320M style configuration + >>> configuration = DogeConfig() + + >>> # Initializing a model from the Doge-320M style configuration + >>> model = DogeModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "doge" + keys_to_ignore_at_inference = ["past_key_values"] + # Default tensor parallel plan for base model `DogeModel` + base_model_tp_plan = { + "layers.*.self_attn.q_proj": "colwise", + "layers.*.self_attn.k_proj": "colwise", + "layers.*.self_attn.v_proj": "colwise", + "layers.*.self_attn.dt_proj": "rowwise", + "layers.*.self_attn.o_proj": "rowwise", + "layers.*.input_layernorm.weight": "sequence_parallel", + "layers.*.input_residual.weight": "sequence_parallel", + "layers.*.post_attention_layernorm.weight": "sequence_parallel", + "layers.*.post_attention_residual.weight": "sequence_parallel", + "norm.weight": "sequence_parallel", + "layers.*.mlp.gate_proj": "colwise", + "layers.*.mlp.up_proj": "colwise", + "layers.*.mlp.down_proj": "rowwise", + "layers.*.mlp.router_gate": "colwise_rep", + "layers.*.mlp.down_embed": "rowwise_rep", + "layers.*.mlp.up_embed": "rowwise_rep", + } + base_model_pp_plan = { + "embed_tokens": (["input_ids"], ["inputs_embeds"]), + "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), + "norm": (["hidden_states"], ["hidden_states"]), + } + + def __init__( + self, + vocab_size=32768, + hidden_size=1024, + intermediate_size=2048, + num_hidden_layers=32, + hidden_dropout=0.0, + hidden_act="silu", + initializer_range=0.02, + rms_norm_eps=1e-06, + use_cache=True, + tie_word_embeddings=False, + max_position_embeddings=2048, + rope_theta=10000.0, + rope_scaling=None, + num_attention_heads=8, + num_key_value_heads=None, + attention_bias=False, + attention_dropout=0.0, + mlp_bias=False, + sliding_window=None, + keep_window_size=2048, + is_moe=False, + num_experts=16384, + num_experts_per_tok=64, + norm_topk_prob=False, + output_router_logits=False, + router_aux_loss_coef=0.001, + **kwargs, + ): + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.intermediate_size = 
intermediate_size + self.num_hidden_layers = num_hidden_layers + + self.hidden_dropout = hidden_dropout + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + + self.max_position_embeddings = max_position_embeddings + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self.num_attention_heads = num_attention_heads + self.num_key_value_heads = num_key_value_heads + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + self.mlp_bias = mlp_bias + self.sliding_window = sliding_window + self.keep_window_size = keep_window_size + self.is_moe = is_moe + self.num_experts = num_experts + self.num_experts_per_tok = num_experts_per_tok + self.norm_topk_prob = norm_topk_prob + self.output_router_logits = output_router_logits + self.router_aux_loss_coef = router_aux_loss_coef + + # Validate the correctness of rotary position embeddings parameters + # BC: if there is a 'type' field, copy it it to 'rope_type'. + if self.rope_scaling is not None and "type" in self.rope_scaling: + self.rope_scaling["rope_type"] = self.rope_scaling["type"] + rope_config_validation(self) + + # for backward compatibility + if num_key_value_heads is None: + self.num_key_value_heads = num_attention_heads + + super().__init__( + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + +class DogeRMSNorm(LlamaRMSNorm): + pass + + +class DogeRotaryEmbedding(LlamaRotaryEmbedding): + pass + + +def flex_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Union[torch.Tensor, "BlockMask"], + scaling: Optional[float] = None, + softcap: Optional[float] = None, + head_mask: Optional[torch.Tensor] = None, + **kwargs, +) -> tuple[torch.Tensor, torch.Tensor]: + block_mask = None + causal_mask = None + if isinstance(attention_mask, BlockMask): + block_mask = attention_mask + else: + causal_mask = attention_mask + + if causal_mask is not None: + causal_mask = causal_mask[:, :, :, : key.shape[-2]] + + def score_mod(score, batch_idx, head_idx, q_idx, kv_idx): + if softcap is not None: + score = softcap * torch.tanh(score / softcap) + if causal_mask is not None: + score = score + causal_mask[batch_idx][head_idx][q_idx][kv_idx] + if head_mask is not None: + score = score + head_mask[batch_idx][head_idx][0][0] + return score + + attn_output, attention_weights = compile_friendly_flex_attention( + query, + key, + value, + score_mod=score_mod, + block_mask=block_mask, + enable_gqa=True, + scale=scaling, + # Last time checked on PyTorch == 2.5.1: Flex Attention always computes the lse regardless. + # For simplification, we thus always return it as no additional computations are introduced. 
+ return_lse=True, + ) + # lse is returned in float32 + attention_weights = attention_weights.to(value.dtype) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attention_weights + + +ALL_ATTENTION_FUNCTIONS = AttentionInterface() +ALL_ATTENTION_FUNCTIONS["doge_flex_attention"] = flex_attention_forward + + +class DogeAttention(nn.Module): + def __init__(self, config: DogeConfig, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) + self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads + self.scaling = self.head_dim**-0.5 + self.attention_dropout = config.attention_dropout + self.keep_window_size = config.keep_window_size + + self.q_proj = nn.Linear( + config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias + ) + self.k_proj = nn.Linear( + config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias + ) + self.v_proj = nn.Linear( + config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias + ) + # dynamic mask for the QK^T attention weights matrix + self.A = nn.Parameter(torch.zeros(config.num_key_value_heads)) + self.dt_proj = nn.Linear( + config.num_key_value_heads * self.head_dim, config.num_key_value_heads, bias=config.attention_bias + ) + self.o_proj = nn.Linear( + config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias + ) + self.q_norm = DogeRMSNorm(self.head_dim, eps=config.rms_norm_eps) + self.k_norm = DogeRMSNorm(self.head_dim, eps=config.rms_norm_eps) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, self.head_dim) + + query_states = self.q_norm(self.q_proj(hidden_states).view(hidden_shape)).transpose(1, 2) + key_states = self.k_norm(self.k_proj(hidden_states).view(hidden_shape)).transpose(1, 2) + value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_values is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # calculate dynamic mask from value_states + dt_states = self.dt_proj( + value_states.transpose(1, 2).reshape(value_states.shape[0], value_states.shape[-2], -1) + ) + dt_states = torch.exp(self.A * F.softplus(dt_states)).transpose(-1, -2) + attn_mask = self.prepare_dynamic_mask( + hidden_states=hidden_states, + dt_states=dt_states, + keep_window_size=self.keep_window_size, + attention_mask=attention_mask, + ) + attn_mask = repeat_kv(attn_mask, self.num_key_value_groups) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + 
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask=attn_mask, + dropout=0.0 if not self.training else self.attention_dropout, + scaling=self.scaling, + **kwargs, + ) + + attn_output = attn_output.reshape(*input_shape, -1).contiguous() + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights + + def prepare_dynamic_mask( + self, + hidden_states: torch.Tensor, + dt_states: torch.Tensor, + keep_window_size: int = 2048, + attention_mask: Optional[torch.Tensor] = None, + ): + """ + The core idea of DMA is to calculate the dynamic attention mask to mask the tokens that should be masked, so as to form sparse attention. + + Combine `dt_states` with `attention_mask` to generate the final `attn_mask`. + + Args: + hidden_states (`torch.Tensor`): The input hidden_states, used to determine the minimum value of the current input precision. + dt_states (`torch.Tensor`): dt_states of shape `(batch_size, num_heads, key_sequence_length)`. + keep_window_size (`int`): The window size of tokens that are not dynamically masked, and dynamic masking is only performed when the sequence length exceeds this value. + attention_mask (`torch.Tensor`, *optional*): attention mask of shape `(batch_size, 1, query_sequence_length, key_sequence_length)`. + """ + min_dtype = torch.finfo(hidden_states.dtype).min + dtype = hidden_states.dtype + attn_mask = dt_states[:, :, None, :].expand( + -1, -1, hidden_states.shape[1], -1 + ) # [batch_size, num_heads, query_len, key_len] + if attention_mask is not None and not isinstance(attention_mask, BlockMask): + if attention_mask.dtype == torch.bool: + dtype = hidden_states.dtype + attention_mask = torch.where( + attention_mask, torch.tensor(0.0, device=attention_mask.device, dtype=dtype), min_dtype + ) + attn_mask = attn_mask.masked_fill(attention_mask[:, :, :, : attn_mask.shape[-1]] != 0, min_dtype) + if attn_mask.shape[-1] > keep_window_size: + active_mask = torch.zeros_like(attn_mask, dtype=dtype, device=attn_mask.device) + topk_indices = torch.topk(attn_mask, keep_window_size, dim=-1, largest=True, sorted=False).indices + active_mask = active_mask.scatter(-1, topk_indices, 1.0) + attn_mask = attn_mask.masked_fill(active_mask == 0.0, min_dtype) + return attn_mask + + +class DogeMLP(LlamaMLP): + pass + + +class DogeCDMoE(nn.Module): + def __init__(self, config: DogeConfig): + super().__init__() + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.act_fn = ACT2FN[config.hidden_act] + + self.num_experts = config.num_experts + self.num_keys = math.floor(math.sqrt(self.num_experts)) + self.top_k = config.num_experts_per_tok + self.norm_topk_prob = config.norm_topk_prob + + # shared expert + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias) + + # router gate for retrieval experts + self.router_gate = nn.Linear(self.hidden_size, self.num_keys * 2, bias=False) + + # routed experts + self.down_embed = nn.Embedding(self.num_experts, self.hidden_size) + self.up_embed = nn.Embedding(self.num_experts, self.hidden_size) + + def forward( + self, + hidden_states: torch.Tensor, + **kwargs, + ) -> torch.Tensor: + bsz, seq_len, _ = hidden_states.shape + + # get 
routing logits with router gate + router_logits = self.router_gate(hidden_states).view(2, bsz * seq_len, -1) + + # get experts with the highest routing logits + (scores_x, scores_y), (indices_x, indices_y) = router_logits.topk(self.num_keys, dim=-1) + all_scores = scores_x.unsqueeze(-1) + scores_y.unsqueeze(-2) + all_indices = indices_x.unsqueeze(-1) * self.num_keys + indices_y.unsqueeze(-2) + all_scores = all_scores.view(*all_scores.shape[:-2], -1) + all_indices = all_indices.view(*all_indices.shape[:-2], -1) + scores, position_indices = all_scores.topk(self.top_k, dim=-1) + indices = all_indices.gather(-1, position_indices) + routing_weights = F.softmax(scores, dim=-1) + if self.norm_topk_prob: + routing_weights /= routing_weights.sum(dim=-1, keepdim=True) + + # mix routed experts states with shared expert states + down_embed = self.down_embed(indices) + up_embed = self.up_embed(indices) + experts_weights = torch.matmul(down_embed, hidden_states.view(bsz * seq_len, -1, 1)).view(bsz * seq_len, -1) + experts_weights = self.act_fn(experts_weights) * routing_weights + experts_states = torch.matmul(experts_weights.view(bsz * seq_len, 1, -1), up_embed).view(bsz, seq_len, -1) + hidden_states = self.down_proj(self.act_fn(self.gate_proj(hidden_states)) * self.up_proj(hidden_states)) + hidden_states = hidden_states + experts_states + return hidden_states, router_logits + + +class DogeDecoderLayer(GradientCheckpointingLayer): + def __init__(self, config: DogeConfig, layer_idx: Optional[int] = None): + super().__init__() + self.hidden_dropout = config.hidden_dropout + + self.input_layernorm = DogeRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.self_attn = DogeAttention(config=config, layer_idx=layer_idx) + self.input_residual = nn.Parameter(torch.ones(config.hidden_size)) + + self.post_attention_layernorm = DogeRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.mlp = DogeMLP(config) if not config.is_moe else DogeCDMoE(config) + self.post_attention_residual = nn.Parameter(torch.ones(config.hidden_size)) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: + # sequence transformation + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + hidden_states, self_attn_weights = self.self_attn( + hidden_states=hidden_states, + position_embeddings=position_embeddings, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + hidden_states = F.dropout(hidden_states, p=self.hidden_dropout, training=self.training) + hidden_states = self.input_residual * residual + hidden_states + + # state transformation + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = F.dropout(hidden_states, p=self.hidden_dropout, training=self.training) + hidden_states = self.post_attention_residual * residual + hidden_states + + return hidden_states + + +class 
DogePreTrainedModel(LlamaPreTrainedModel): + _supports_flash_attn = False + _can_compile_fullgraph = False + _can_record_outputs = { + "router_logits": OutputRecorder(DogeCDMoE, index=1), + "hidden_states": DogeDecoderLayer, + "attentions": DogeAttention, + } + + def _init_weights(self, module): + """Initialize the weights""" + PreTrainedModel._init_weights(self, module) + if isinstance(module, DogeAttention): + if hasattr(module, "A"): + module.A.data.zero_() + elif isinstance(module, DogeDecoderLayer): + if hasattr(module, "input_residual"): + module.input_residual.data.fill_(1.0) + if hasattr(module, "post_attention_residual"): + module.post_attention_residual.data.fill_(1.0) + + +class DogeModel(MixtralModel): + pass + + +def load_balancing_loss_func( + gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None], + num_experts: Optional[int] = None, + num_keys: Optional[int] = None, + top_k: int = 2, + attention_mask: Optional[torch.Tensor] = None, +) -> Union[torch.Tensor, int]: + r""" + Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch. + + See Switch Transformer (https://huggingface.co/papers/2101.03961) for more details. This function implements the loss + function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between + experts is too unbalanced. + + Args: + gate_logits: + Logits from the `router_gate`, should be a tuple of model.config.num_hidden_layers tensors of + shape [2, batch_size * sequence_length, num_keys]. + num_experts: + Number of experts + num_keys: + Number of keys + top_k: + The number of experts to route per-token, can be also interpreted as the `top-k` routing + parameter. + attention_mask (`torch.Tensor`, *optional*): + The attention_mask used in forward function + shape [batch_size X sequence_length] if not None. + + Returns: + The auxiliary loss. 
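For intuition about the quantity this helper computes, here is a tiny self-contained sketch of the Switch Transformer auxiliary loss; the numbers are made up purely for illustration and are not taken from the diff:

```python
# The auxiliary loss is num_experts * sum_e f_e * P_e, where f_e is the fraction of tokens
# routed to expert e and P_e is the mean router probability assigned to expert e.
import torch

num_experts = 4
expert_indices = torch.tensor([0, 0, 1, 3])  # toy top-1 routing decisions for 4 tokens
routing_probs = torch.tensor([
    [0.7, 0.1, 0.1, 0.1],
    [0.6, 0.2, 0.1, 0.1],
    [0.2, 0.5, 0.2, 0.1],
    [0.1, 0.1, 0.2, 0.6],
])  # toy softmax router probabilities per token

f_e = torch.bincount(expert_indices, minlength=num_experts).float() / expert_indices.numel()
p_e = routing_probs.mean(dim=0)
aux_loss = num_experts * torch.sum(f_e * p_e)
print(aux_loss.item())  # 1.25 here; exactly 1.0 under perfectly uniform routing
```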
+ """ + if gate_logits is None or not isinstance(gate_logits, tuple): + return 0 + + compute_dtype = gate_logits[0].dtype + compute_device = gate_logits[0].device + all_expert_indices = [] + all_routing_weights = [] + + for layer_gate_logits in gate_logits: + layer_gate_logits = layer_gate_logits.to(compute_device) + + (scores_x, scores_y), (indices_x, indices_y) = layer_gate_logits.topk(num_keys, dim=-1) + + all_scores = scores_x.unsqueeze(-1) + scores_y.unsqueeze(-2) + all_indices = indices_x.unsqueeze(-1) * num_keys + indices_y.unsqueeze(-2) + all_scores = all_scores.view(*all_scores.shape[:-2], -1) + all_indices = all_indices.view(*all_indices.shape[:-2], -1) + + _, position_indices = all_scores.topk(top_k, dim=-1) + expert_indices = all_indices.gather(-1, position_indices) + + routing_weights = F.softmax(all_scores, dim=-1) + + all_expert_indices.append(expert_indices) + all_routing_weights.append(routing_weights) + all_expert_indices = torch.cat(all_expert_indices, dim=0) + all_routing_weights = torch.cat(all_routing_weights, dim=0) + + if attention_mask is None: + # Compute the percentage of tokens routed to each experts + all_expert_indices = all_expert_indices.view(-1) + tokens_per_expert = torch.zeros(num_experts, dtype=compute_dtype, device=compute_device) + pad = torch.ones_like(all_expert_indices, dtype=compute_dtype, device=compute_device) + tokens_per_expert = tokens_per_expert.scatter_add_(0, all_expert_indices, pad) / all_expert_indices.shape[0] + + # Compute the average probability of routing to these experts + router_prob_per_expert = torch.mean(all_routing_weights, dim=0) + else: + batch_size, sequence_length = attention_mask.shape + num_hidden_layers = len(gate_logits) + + # Compute the mask that masks all padding tokens as 0 with the same shape of expert_mask + expert_attention_mask = ( + attention_mask[None, :, :, None] + .expand((num_hidden_layers, batch_size, sequence_length, top_k)) + .reshape(-1) + .to(compute_device) + ) + all_expert_indices = all_expert_indices.view(-1)[expert_attention_mask.bool()] + + # Compute the percentage of tokens routed to each experts + tokens_per_expert = torch.zeros(num_experts, dtype=compute_dtype, device=compute_device) + pad = torch.ones_like(all_expert_indices, dtype=compute_dtype, device=compute_device) + tokens_per_expert = tokens_per_expert.scatter_add_(0, all_expert_indices, pad) / torch.sum( + expert_attention_mask + ) + + # Compute the mask that masks all padding tokens as 0 with the same shape of tokens_per_expert + router_per_expert_attention_mask = ( + attention_mask[None, :, :, None] + .expand((num_hidden_layers, batch_size, sequence_length, num_experts)) + .reshape(-1, num_experts) + .to(compute_device) + ) + + # Compute the average probability of routing to these experts + router_prob_per_expert = torch.sum(all_routing_weights * router_per_expert_attention_mask, dim=0) / torch.sum( + router_per_expert_attention_mask, dim=0 + ) + + overall_loss = torch.sum(tokens_per_expert * router_prob_per_expert) + return overall_loss * num_experts + + +class DogeForCausalLM(MixtralForCausalLM): + def __init__(self, config): + super().__init__(config) + self.model = DogeModel(config) + self.num_experts = config.num_experts + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + 
use_cache: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + output_router_logits: Optional[bool] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> MoeCausalLMOutputWithPast: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Example: + + ```python + >>> from transformers import AutoTokenizer, DogeForCausalLM + + >>> model = DogeForCausalLM.from_pretrained("SmallDoge/Doge-320M") + >>> tokenizer = AutoTokenizer.from_pretrained("SmallDoge/Doge-320M") + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + output_router_logits = ( + output_router_logits if output_router_logits is not None else self.config.output_router_logits + ) + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs: MoeModelOutputWithPast = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs.last_hidden_state + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) + + loss = None + if labels is not None: + loss = self.loss_function(logits, labels, self.vocab_size, **kwargs) + + aux_loss = None + if output_router_logits: + aux_loss = load_balancing_loss_func( + outputs.router_logits, + self.num_experts, + math.floor(math.sqrt(self.num_experts)), + self.num_experts_per_tok, + attention_mask, + ) + if labels is not None: + loss += self.router_aux_loss_coef * aux_loss.to(loss.device) # make sure to reside in the same device + + return MoeCausalLMOutputWithPast( + loss=loss, + aux_loss=aux_loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + router_logits=outputs.router_logits, + ) + + +class DogeForSequenceClassification(LlamaForSequenceClassification): + pass + + +__all__ = [ + "DogeConfig", + "DogeForCausalLM", + "DogeModel", + "DogePreTrainedModel", + "DogeForSequenceClassification", +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dpt/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dpt/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ce0070f270f3604afd0661e0cd8aaa4fa2141217 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dpt/__init__.py @@ -0,0 +1,30 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_dpt import * + from .feature_extraction_dpt import * + from .image_processing_dpt import * + from .image_processing_dpt_fast import * + from .modeling_dpt import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dpt/configuration_dpt.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dpt/configuration_dpt.py new file mode 100644 index 0000000000000000000000000000000000000000..311425fcda1c88c888171256f36e27d9aeaa7487 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dpt/configuration_dpt.py @@ -0,0 +1,302 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""DPT model configuration""" + +import copy + +from ...configuration_utils import PretrainedConfig +from ...utils import logging +from ...utils.backbone_utils import verify_backbone_config_arguments +from ..auto.configuration_auto import CONFIG_MAPPING +from ..bit import BitConfig + + +logger = logging.get_logger(__name__) + + +class DPTConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`DPTModel`]. It is used to instantiate an DPT + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the DPT + [Intel/dpt-large](https://huggingface.co/Intel/dpt-large) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. 
+ intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + image_size (`int`, *optional*, defaults to 384): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 16): + The size (resolution) of each patch. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + is_hybrid (`bool`, *optional*, defaults to `False`): + Whether to use a hybrid backbone. Useful in the context of loading DPT-Hybrid models. + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether to add a bias to the queries, keys and values. + backbone_out_indices (`list[int]`, *optional*, defaults to `[2, 5, 8, 11]`): + Indices of the intermediate hidden states to use from backbone. + readout_type (`str`, *optional*, defaults to `"project"`): + The readout type to use when processing the readout token (CLS token) of the intermediate hidden states of + the ViT backbone. Can be one of [`"ignore"`, `"add"`, `"project"`]. + + - "ignore" simply ignores the CLS token. + - "add" passes the information from the CLS token to all other tokens by adding the representations. + - "project" passes information to the other tokens by concatenating the readout to all other tokens before + projecting the + representation to the original feature dimension D using a linear layer followed by a GELU non-linearity. + reassemble_factors (`list[int]`, *optional*, defaults to `[4, 2, 1, 0.5]`): + The up/downsampling factors of the reassemble layers. + neck_hidden_sizes (`list[str]`, *optional*, defaults to `[96, 192, 384, 768]`): + The hidden sizes to project to for the feature maps of the backbone. + fusion_hidden_size (`int`, *optional*, defaults to 256): + The number of channels before fusion. + head_in_index (`int`, *optional*, defaults to -1): + The index of the features to use in the heads. + use_batch_norm_in_fusion_residual (`bool`, *optional*, defaults to `False`): + Whether to use batch normalization in the pre-activate residual units of the fusion blocks. + use_bias_in_fusion_residual (`bool`, *optional*, defaults to `True`): + Whether to use bias in the pre-activate residual units of the fusion blocks. + add_projection (`bool`, *optional*, defaults to `False`): + Whether to add a projection layer before the depth estimation head. + use_auxiliary_head (`bool`, *optional*, defaults to `True`): + Whether to use an auxiliary head during training. + auxiliary_loss_weight (`float`, *optional*, defaults to 0.4): + Weight of the cross-entropy loss of the auxiliary head. + semantic_loss_ignore_index (`int`, *optional*, defaults to 255): + The index that is ignored by the loss function of the semantic segmentation model. 
+ semantic_classifier_dropout (`float`, *optional*, defaults to 0.1): + The dropout ratio for the semantic classification head. + backbone_featmap_shape (`list[int]`, *optional*, defaults to `[1, 1024, 24, 24]`): + Used only for the `hybrid` embedding type. The shape of the feature maps of the backbone. + neck_ignore_stages (`list[int]`, *optional*, defaults to `[0, 1]`): + Used only for the `hybrid` embedding type. The stages of the readout layers to ignore. + backbone_config (`Union[dict[str, Any], PretrainedConfig]`, *optional*): + The configuration of the backbone model. Only used in case `is_hybrid` is `True` or in case you want to + leverage the [`AutoBackbone`] API. + backbone (`str`, *optional*): + Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this + will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone` + is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. + use_pretrained_backbone (`bool`, *optional*, defaults to `False`): + Whether to use pretrained weights for the backbone. + use_timm_backbone (`bool`, *optional*, defaults to `False`): + Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers + library. + backbone_kwargs (`dict`, *optional*): + Keyword arguments to be passed to AutoBackbone when loading from a checkpoint + e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set. + pooler_output_size (`int`, *optional*): + Dimensionality of the pooler layer. If None, defaults to `hidden_size`. + pooler_act (`str`, *optional*, defaults to `"tanh"`): + The activation function to be used by the pooler. Keys of ACT2FN are supported for Flax and + Pytorch, and elements of https://www.tensorflow.org/api_docs/python/tf/keras/activations are + supported for Tensorflow. 
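In addition to the canonical example that follows, here is a hedged sketch of the hybrid path: per the constructor shown below, when `is_hybrid=True` and no `backbone_config` is given, a default BiT backbone configuration is built, and `readout_type` must remain `"project"`.

```python
# Illustrative sketch only (not part of the diff): a DPT-Hybrid style configuration.
from transformers import DPTConfig

hybrid_config = DPTConfig(
    is_hybrid=True,                            # triggers the default BiT backbone config
    backbone_featmap_shape=[1, 1024, 24, 24],  # hybrid-only: backbone feature map shape
    neck_ignore_stages=[0, 1],                 # hybrid-only: readout stages to ignore
)
print(type(hybrid_config.backbone_config).__name__)  # BitConfig
```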
+ + Example: + + ```python + >>> from transformers import DPTModel, DPTConfig + + >>> # Initializing a DPT dpt-large style configuration + >>> configuration = DPTConfig() + + >>> # Initializing a model from the dpt-large style configuration + >>> model = DPTModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "dpt" + + def __init__( + self, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + initializer_range=0.02, + layer_norm_eps=1e-12, + image_size=384, + patch_size=16, + num_channels=3, + is_hybrid=False, + qkv_bias=True, + backbone_out_indices=[2, 5, 8, 11], + readout_type="project", + reassemble_factors=[4, 2, 1, 0.5], + neck_hidden_sizes=[96, 192, 384, 768], + fusion_hidden_size=256, + head_in_index=-1, + use_batch_norm_in_fusion_residual=False, + use_bias_in_fusion_residual=None, + add_projection=False, + use_auxiliary_head=True, + auxiliary_loss_weight=0.4, + semantic_loss_ignore_index=255, + semantic_classifier_dropout=0.1, + backbone_featmap_shape=[1, 1024, 24, 24], + neck_ignore_stages=[0, 1], + backbone_config=None, + backbone=None, + use_pretrained_backbone=False, + use_timm_backbone=False, + backbone_kwargs=None, + pooler_output_size=None, + pooler_act="tanh", + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.is_hybrid = is_hybrid + + use_autobackbone = False + if self.is_hybrid: + if backbone_config is None: + backbone_config = { + "global_padding": "same", + "layer_type": "bottleneck", + "depths": [3, 4, 9], + "out_features": ["stage1", "stage2", "stage3"], + "embedding_dynamic_padding": True, + } + + if isinstance(backbone_config, dict): + logger.info("Initializing the config with a `BiT` backbone.") + backbone_config = BitConfig(**backbone_config) + elif not isinstance(backbone_config, PretrainedConfig): + raise ValueError( + f"backbone_config must be a dictionary or a `PretrainedConfig`, got {backbone_config.__class__}." 
+ ) + self.backbone_config = backbone_config + self.backbone_featmap_shape = backbone_featmap_shape + self.neck_ignore_stages = neck_ignore_stages + + if readout_type != "project": + raise ValueError("Readout type must be 'project' when using `DPT-hybrid` mode.") + + elif backbone is not None or backbone_config is not None: + use_autobackbone = True + if isinstance(backbone_config, dict): + backbone_model_type = backbone_config.get("model_type") + config_class = CONFIG_MAPPING[backbone_model_type] + backbone_config = config_class.from_dict(backbone_config) + + self.backbone_config = backbone_config + self.backbone_featmap_shape = None + self.neck_ignore_stages = [] + + # We only use load_backbone when config.is_hydrid is False + verify_backbone_config_arguments( + use_timm_backbone=use_timm_backbone, + use_pretrained_backbone=use_pretrained_backbone, + backbone=backbone, + backbone_config=backbone_config, + backbone_kwargs=backbone_kwargs, + ) + else: + self.backbone_config = None + self.backbone_featmap_shape = None + self.neck_ignore_stages = [] + + self.backbone = backbone + self.use_pretrained_backbone = use_pretrained_backbone + self.use_timm_backbone = use_timm_backbone + self.backbone_kwargs = backbone_kwargs + + # ViT parameters used if not using a hybrid backbone + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.layer_norm_eps = layer_norm_eps + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.qkv_bias = qkv_bias + self.use_autobackbone = use_autobackbone + self.backbone_out_indices = None if use_autobackbone else backbone_out_indices + + if readout_type not in ["ignore", "add", "project"]: + raise ValueError("Readout_type must be one of ['ignore', 'add', 'project']") + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.readout_type = readout_type + self.reassemble_factors = reassemble_factors + self.neck_hidden_sizes = neck_hidden_sizes + self.fusion_hidden_size = fusion_hidden_size + self.head_in_index = head_in_index + self.use_batch_norm_in_fusion_residual = use_batch_norm_in_fusion_residual + self.use_bias_in_fusion_residual = use_bias_in_fusion_residual + self.add_projection = add_projection + + # auxiliary head attributes (semantic segmentation) + self.use_auxiliary_head = use_auxiliary_head + self.auxiliary_loss_weight = auxiliary_loss_weight + self.semantic_loss_ignore_index = semantic_loss_ignore_index + self.semantic_classifier_dropout = semantic_classifier_dropout + self.pooler_output_size = pooler_output_size if pooler_output_size else hidden_size + self.pooler_act = pooler_act + + def to_dict(self): + """ + Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`]. 
Returns: + `dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, + """ + output = copy.deepcopy(self.__dict__) + + if output["backbone_config"] is not None: + output["backbone_config"] = self.backbone_config.to_dict() + + output["model_type"] = self.__class__.model_type + return output + + @property + def sub_configs(self): + return ( + {"backbone_config": type(self.backbone_config)} + if getattr(self, "backbone_config", None) is not None + else {} + ) + + +__all__ = ["DPTConfig"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dpt/feature_extraction_dpt.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dpt/feature_extraction_dpt.py new file mode 100644 index 0000000000000000000000000000000000000000..b6ab8ccbed8d33b1e5b15d429b6cb057ff781f78 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dpt/feature_extraction_dpt.py @@ -0,0 +1,38 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Feature extractor class for DPT.""" + +import warnings + +from ...utils import logging +from ...utils.import_utils import requires +from .image_processing_dpt import DPTImageProcessor + + +logger = logging.get_logger(__name__) + + +@requires(backends=("vision",)) +class DPTFeatureExtractor(DPTImageProcessor): + def __init__(self, *args, **kwargs) -> None: + warnings.warn( + "The class DPTFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please" + " use DPTImageProcessor instead.", + FutureWarning, + ) + super().__init__(*args, **kwargs) + + +__all__ = ["DPTFeatureExtractor"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dpt/image_processing_dpt.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dpt/image_processing_dpt.py new file mode 100644 index 0000000000000000000000000000000000000000..9b28950d2ded2d046e6c39d54d246f893a50d122 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dpt/image_processing_dpt.py @@ -0,0 +1,677 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
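The deprecation shim just above implies a straightforward migration; a brief hedged sketch follows (the checkpoint name `Intel/dpt-large` is taken from the configuration docstring and used here only for illustration):

```python
# Illustrative sketch only: prefer the image processor over the deprecated feature extractor.
from transformers import DPTImageProcessor

processor = DPTImageProcessor.from_pretrained("Intel/dpt-large")
# Legacy path, still works but emits a FutureWarning:
# from transformers import DPTFeatureExtractor
# processor = DPTFeatureExtractor.from_pretrained("Intel/dpt-large")
```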
+"""Image processor class for DPT.""" + +import math +from collections.abc import Iterable +from typing import TYPE_CHECKING, Optional, Union + +from ...utils.import_utils import requires + + +if TYPE_CHECKING: + from ...modeling_outputs import DepthEstimatorOutput + +import numpy as np + +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ...image_transforms import pad, resize, to_channel_dimension_format +from ...image_utils import ( + IMAGENET_STANDARD_MEAN, + IMAGENET_STANDARD_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + get_image_size, + infer_channel_dimension_format, + is_scaled_image, + is_torch_available, + is_torch_tensor, + make_flat_list_of_images, + to_numpy_array, + valid_images, + validate_preprocess_arguments, +) +from ...utils import ( + TensorType, + filter_out_non_signature_kwargs, + is_vision_available, + logging, + requires_backends, +) + + +if is_torch_available(): + import torch + +if is_vision_available(): + import PIL + + +logger = logging.get_logger(__name__) + + +def get_resize_output_image_size( + input_image: np.ndarray, + output_size: Union[int, Iterable[int]], + keep_aspect_ratio: bool, + multiple: int, + input_data_format: Optional[Union[str, ChannelDimension]] = None, +) -> tuple[int, int]: + def constrain_to_multiple_of(val, multiple, min_val=0, max_val=None): + x = round(val / multiple) * multiple + + if max_val is not None and x > max_val: + x = math.floor(val / multiple) * multiple + + if x < min_val: + x = math.ceil(val / multiple) * multiple + + return x + + output_size = (output_size, output_size) if isinstance(output_size, int) else output_size + + input_height, input_width = get_image_size(input_image, input_data_format) + output_height, output_width = output_size + + # determine new height and width + scale_height = output_height / input_height + scale_width = output_width / input_width + + if keep_aspect_ratio: + # scale as little as possible + if abs(1 - scale_width) < abs(1 - scale_height): + # fit width + scale_height = scale_width + else: + # fit height + scale_width = scale_height + + new_height = constrain_to_multiple_of(scale_height * input_height, multiple=multiple) + new_width = constrain_to_multiple_of(scale_width * input_width, multiple=multiple) + + return (new_height, new_width) + + +@requires(backends=("vision",)) +class DPTImageProcessor(BaseImageProcessor): + r""" + Constructs a DPT image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions. Can be overridden by `do_resize` in `preprocess`. + size (`dict[str, int]` *optional*, defaults to `{"height": 384, "width": 384}`): + Size of the image after resizing. Can be overridden by `size` in `preprocess`. + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): + Defines the resampling filter to use if resizing the image. Can be overridden by `resample` in `preprocess`. + keep_aspect_ratio (`bool`, *optional*, defaults to `False`): + If `True`, the image is resized to the largest possible size such that the aspect ratio is preserved. Can + be overridden by `keep_aspect_ratio` in `preprocess`. + ensure_multiple_of (`int`, *optional*, defaults to 1): + If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. Can be overridden + by `ensure_multiple_of` in `preprocess`. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. 
Can be overridden by `do_rescale` in + `preprocess`. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in `preprocess`. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` + method. + image_mean (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. + do_pad (`bool`, *optional*, defaults to `False`): + Whether to apply center padding. This was introduced in the DINOv2 paper, which uses the model in + combination with DPT. + size_divisor (`int`, *optional*): + If `do_pad` is `True`, pads the image dimensions to be divisible by this value. This was introduced in the + DINOv2 paper, which uses the model in combination with DPT. + do_reduce_labels (`bool`, *optional*, defaults to `False`): + Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0 is + used for background, and background itself is not included in all classes of a dataset (e.g. ADE20k). The + background label will be replaced by 255. Can be overridden by the `do_reduce_labels` parameter in the + `preprocess` method. + """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: Optional[dict[str, int]] = None, + resample: PILImageResampling = PILImageResampling.BICUBIC, + keep_aspect_ratio: bool = False, + ensure_multiple_of: int = 1, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + do_pad: bool = False, + size_divisor: Optional[int] = None, + do_reduce_labels: bool = False, + **kwargs, + ) -> None: + super().__init__(**kwargs) + size = size if size is not None else {"height": 384, "width": 384} + size = get_size_dict(size) + self.do_resize = do_resize + self.size = size + self.keep_aspect_ratio = keep_aspect_ratio + self.ensure_multiple_of = ensure_multiple_of + self.resample = resample + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN + self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD + self.do_pad = do_pad + self.size_divisor = size_divisor + self.do_reduce_labels = do_reduce_labels + + def resize( + self, + image: np.ndarray, + size: dict[str, int], + keep_aspect_ratio: bool = False, + ensure_multiple_of: int = 1, + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize an image to target size `(size["height"], size["width"])`. 
If `keep_aspect_ratio` is `True`, the image + is resized to the largest possible size such that the aspect ratio is preserved. If `ensure_multiple_of` is + set, the image is resized to a size that is a multiple of this value. + + Args: + image (`np.ndarray`): + Image to resize. + size (`dict[str, int]`): + Target size of the output image. + keep_aspect_ratio (`bool`, *optional*, defaults to `False`): + If `True`, the image is resized to the largest possible size such that the aspect ratio is preserved. + ensure_multiple_of (`int`, *optional*, defaults to 1): + The image is resized to a size that is a multiple of this value. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + Defines the resampling filter to use if resizing the image. Otherwise, the image is resized to size + specified in `size`. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + input_data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. + """ + size = get_size_dict(size) + if "height" not in size or "width" not in size: + raise ValueError(f"The size dictionary must contain the keys 'height' and 'width'. Got {size.keys()}") + + output_size = get_resize_output_image_size( + image, + output_size=(size["height"], size["width"]), + keep_aspect_ratio=keep_aspect_ratio, + multiple=ensure_multiple_of, + input_data_format=input_data_format, + ) + return resize( + image, + size=output_size, + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + **kwargs, + ) + + def pad_image( + self, + image: np.ndarray, + size_divisor: int, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ): + """ + Center pad an image to be a multiple of `multiple`. + + Args: + image (`np.ndarray`): + Image to pad. + size_divisor (`int`): + The width and height of the image will be padded to a multiple of this number. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. 
+ """ + + def _get_pad(size, size_divisor): + new_size = math.ceil(size / size_divisor) * size_divisor + pad_size = new_size - size + pad_size_left = pad_size // 2 + pad_size_right = pad_size - pad_size_left + return pad_size_left, pad_size_right + + if input_data_format is None: + input_data_format = infer_channel_dimension_format(image) + + height, width = get_image_size(image, input_data_format) + + pad_size_left, pad_size_right = _get_pad(height, size_divisor) + pad_size_top, pad_size_bottom = _get_pad(width, size_divisor) + + return pad(image, ((pad_size_left, pad_size_right), (pad_size_top, pad_size_bottom)), data_format=data_format) + + # Copied from transformers.models.beit.image_processing_beit.BeitImageProcessor.reduce_label + def reduce_label(self, label: ImageInput) -> np.ndarray: + label = to_numpy_array(label) + # Avoid using underflow conversion + label[label == 0] = 255 + label = label - 1 + label[label == 254] = 255 + return label + + def _preprocess( + self, + image: ImageInput, + do_reduce_labels: Optional[bool] = None, + do_resize: Optional[bool] = None, + size: Optional[dict[str, int]] = None, + resample: Optional[PILImageResampling] = None, + keep_aspect_ratio: Optional[bool] = None, + ensure_multiple_of: Optional[int] = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + do_pad: Optional[bool] = None, + size_divisor: Optional[int] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ): + if do_reduce_labels: + image = self.reduce_label(image) + + if do_resize: + image = self.resize( + image=image, + size=size, + resample=resample, + keep_aspect_ratio=keep_aspect_ratio, + ensure_multiple_of=ensure_multiple_of, + input_data_format=input_data_format, + ) + + if do_rescale: + image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) + + if do_normalize: + image = self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format) + + if do_pad: + image = self.pad_image(image=image, size_divisor=size_divisor, input_data_format=input_data_format) + + return image + + def _preprocess_image( + self, + image: ImageInput, + do_resize: Optional[bool] = None, + size: Optional[dict[str, int]] = None, + resample: Optional[PILImageResampling] = None, + keep_aspect_ratio: Optional[bool] = None, + ensure_multiple_of: Optional[int] = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + do_pad: Optional[bool] = None, + size_divisor: Optional[int] = None, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> np.ndarray: + """Preprocesses a single image.""" + # All transformations expect numpy arrays. + image = to_numpy_array(image) + if do_rescale and is_scaled_image(image): + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + if input_data_format is None: + # We assume that all images have the same channel dimension format. 
+ input_data_format = infer_channel_dimension_format(image) + + image = self._preprocess( + image, + do_reduce_labels=False, + do_resize=do_resize, + size=size, + resample=resample, + keep_aspect_ratio=keep_aspect_ratio, + ensure_multiple_of=ensure_multiple_of, + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_pad=do_pad, + size_divisor=size_divisor, + input_data_format=input_data_format, + ) + if data_format is not None: + image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) + return image + + def _preprocess_segmentation_map( + self, + segmentation_map: ImageInput, + do_resize: Optional[bool] = None, + size: Optional[dict[str, int]] = None, + resample: Optional[PILImageResampling] = None, + keep_aspect_ratio: Optional[bool] = None, + ensure_multiple_of: Optional[int] = None, + do_reduce_labels: Optional[bool] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ): + """Preprocesses a single segmentation map.""" + # All transformations expect numpy arrays. + segmentation_map = to_numpy_array(segmentation_map) + # Add an axis to the segmentation maps for transformations. + if segmentation_map.ndim == 2: + segmentation_map = segmentation_map[None, ...] + added_dimension = True + input_data_format = ChannelDimension.FIRST + else: + added_dimension = False + if input_data_format is None: + input_data_format = infer_channel_dimension_format(segmentation_map, num_channels=1) + segmentation_map = self._preprocess( + image=segmentation_map, + do_reduce_labels=do_reduce_labels, + do_resize=do_resize, + size=size, + resample=resample, + keep_aspect_ratio=keep_aspect_ratio, + ensure_multiple_of=ensure_multiple_of, + do_normalize=False, + do_rescale=False, + input_data_format=input_data_format, + ) + # Remove extra axis if added + if added_dimension: + segmentation_map = np.squeeze(segmentation_map, axis=0) + segmentation_map = segmentation_map.astype(np.int64) + return segmentation_map + + # Copied from transformers.models.beit.image_processing_beit.BeitImageProcessor.__call__ + def __call__(self, images, segmentation_maps=None, **kwargs): + # Overrides the `__call__` method of the `Preprocessor` class such that the images and segmentation maps can both + # be passed in as positional arguments. + return super().__call__(images, segmentation_maps=segmentation_maps, **kwargs) + + @filter_out_non_signature_kwargs() + def preprocess( + self, + images: ImageInput, + segmentation_maps: Optional[ImageInput] = None, + do_resize: Optional[bool] = None, + size: Optional[int] = None, + keep_aspect_ratio: Optional[bool] = None, + ensure_multiple_of: Optional[int] = None, + resample: Optional[PILImageResampling] = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + do_pad: Optional[bool] = None, + size_divisor: Optional[int] = None, + do_reduce_labels: Optional[bool] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: ChannelDimension = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> PIL.Image.Image: + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. 
If + passing in images with pixel values between 0 and 1, set `do_rescale=False`. + segmentation_maps (`ImageInput`, *optional*): + Segmentation map to preprocess. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`dict[str, int]`, *optional*, defaults to `self.size`): + Size of the image after reszing. If `keep_aspect_ratio` is `True`, the image is resized to the largest + possible size such that the aspect ratio is preserved. If `ensure_multiple_of` is set, the image is + resized to a size that is a multiple of this value. + keep_aspect_ratio (`bool`, *optional*, defaults to `self.keep_aspect_ratio`): + Whether to keep the aspect ratio of the image. If False, the image will be resized to (size, size). If + True, the image will be resized to keep the aspect ratio and the size will be the maximum possible. + ensure_multiple_of (`int`, *optional*, defaults to `self.ensure_multiple_of`): + Ensure that the image size is a multiple of this value. + resample (`int`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`, Only + has an effect if `do_resize` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image values between [0 - 1]. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`): + Image mean. + image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation. + do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`): + Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0 + is used for background, and background itself is not included in all classes of a dataset (e.g. + ADE20k). The background label will be replaced by 255. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `ChannelDimension.LAST`: image in (height, width, num_channels) format. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. 
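To make the interaction of `keep_aspect_ratio` and `ensure_multiple_of` concrete, here is a small standalone sketch that mirrors the output-size computation defined earlier in this file (simplified: the min/max clamping is omitted, and the input and target sizes are made-up numbers):

```python
# Illustrative sketch only: simplified mirror of get_resize_output_image_size.
def constrain_to_multiple_of(val, multiple):
    return round(val / multiple) * multiple

input_h, input_w = 480, 640
target_h, target_w = 384, 384
multiple = 32

scale_h, scale_w = target_h / input_h, target_w / input_w
# keep_aspect_ratio=True keeps the scale closest to 1 ("scale as little as possible")
if abs(1 - scale_w) < abs(1 - scale_h):
    scale_h = scale_w
else:
    scale_w = scale_h

new_h = constrain_to_multiple_of(scale_h * input_h, multiple)
new_w = constrain_to_multiple_of(scale_w * input_w, multiple)
print(new_h, new_w)  # 384 512: aspect ratio preserved, both divisible by 32
```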
+ """ + do_resize = do_resize if do_resize is not None else self.do_resize + size = size if size is not None else self.size + size = get_size_dict(size) + keep_aspect_ratio = keep_aspect_ratio if keep_aspect_ratio is not None else self.keep_aspect_ratio + ensure_multiple_of = ensure_multiple_of if ensure_multiple_of is not None else self.ensure_multiple_of + resample = resample if resample is not None else self.resample + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + do_pad = do_pad if do_pad is not None else self.do_pad + size_divisor = size_divisor if size_divisor is not None else self.size_divisor + do_reduce_labels = do_reduce_labels if do_reduce_labels is not None else self.do_reduce_labels + + images = make_flat_list_of_images(images) + + if segmentation_maps is not None: + segmentation_maps = make_flat_list_of_images(segmentation_maps, expected_ndims=2) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + validate_preprocess_arguments( + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_resize=do_resize, + size=size, + resample=resample, + ) + + images = [ + self._preprocess_image( + image=img, + do_resize=do_resize, + do_rescale=do_rescale, + do_normalize=do_normalize, + do_pad=do_pad, + size=size, + resample=resample, + keep_aspect_ratio=keep_aspect_ratio, + ensure_multiple_of=ensure_multiple_of, + rescale_factor=rescale_factor, + image_mean=image_mean, + image_std=image_std, + size_divisor=size_divisor, + data_format=data_format, + input_data_format=input_data_format, + ) + for img in images + ] + + data = {"pixel_values": images} + + if segmentation_maps is not None: + segmentation_maps = [ + self._preprocess_segmentation_map( + segmentation_map=segmentation_map, + do_reduce_labels=do_reduce_labels, + do_resize=do_resize, + size=size, + resample=resample, + keep_aspect_ratio=keep_aspect_ratio, + ensure_multiple_of=ensure_multiple_of, + input_data_format=input_data_format, + ) + for segmentation_map in segmentation_maps + ] + + data["labels"] = segmentation_maps + + return BatchFeature(data=data, tensor_type=return_tensors) + + # Copied from transformers.models.beit.image_processing_beit.BeitImageProcessor.post_process_semantic_segmentation with Beit->DPT + def post_process_semantic_segmentation(self, outputs, target_sizes: Optional[list[tuple]] = None): + """ + Converts the output of [`DPTForSemanticSegmentation`] into semantic segmentation maps. Only supports PyTorch. + + Args: + outputs ([`DPTForSemanticSegmentation`]): + Raw outputs of the model. + target_sizes (`list[Tuple]` of length `batch_size`, *optional*): + List of tuples corresponding to the requested final size (height, width) of each prediction. If unset, + predictions will not be resized. + + Returns: + semantic_segmentation: `list[torch.Tensor]` of length `batch_size`, where each item is a semantic + segmentation map of shape (height, width) corresponding to the target_sizes entry (if `target_sizes` is + specified). 
Each entry of each `torch.Tensor` correspond to a semantic class id. + """ + # TODO: add support for other frameworks + logits = outputs.logits + + # Resize logits and compute semantic segmentation maps + if target_sizes is not None: + if len(logits) != len(target_sizes): + raise ValueError( + "Make sure that you pass in as many target sizes as the batch dimension of the logits" + ) + + if is_torch_tensor(target_sizes): + target_sizes = target_sizes.numpy() + + semantic_segmentation = [] + + for idx in range(len(logits)): + resized_logits = torch.nn.functional.interpolate( + logits[idx].unsqueeze(dim=0), size=target_sizes[idx], mode="bilinear", align_corners=False + ) + semantic_map = resized_logits[0].argmax(dim=0) + semantic_segmentation.append(semantic_map) + else: + semantic_segmentation = logits.argmax(dim=1) + semantic_segmentation = [semantic_segmentation[i] for i in range(semantic_segmentation.shape[0])] + + return semantic_segmentation + + def post_process_depth_estimation( + self, + outputs: "DepthEstimatorOutput", + target_sizes: Optional[Union[TensorType, list[tuple[int, int]], None]] = None, + ) -> list[dict[str, TensorType]]: + """ + Converts the raw output of [`DepthEstimatorOutput`] into final depth predictions and depth PIL images. + Only supports PyTorch. + + Args: + outputs ([`DepthEstimatorOutput`]): + Raw outputs of the model. + target_sizes (`TensorType` or `list[tuple[int, int]]`, *optional*): + Tensor of shape `(batch_size, 2)` or list of tuples (`tuple[int, int]`) containing the target size + (height, width) of each image in the batch. If left to None, predictions will not be resized. + + Returns: + `list[dict[str, TensorType]]`: A list of dictionaries of tensors representing the processed depth + predictions. + """ + requires_backends(self, "torch") + + predicted_depth = outputs.predicted_depth + + if (target_sizes is not None) and (len(predicted_depth) != len(target_sizes)): + raise ValueError( + "Make sure that you pass in as many target sizes as the batch dimension of the predicted depth" + ) + + results = [] + target_sizes = [None] * len(predicted_depth) if target_sizes is None else target_sizes + for depth, target_size in zip(predicted_depth, target_sizes): + if target_size is not None: + depth = torch.nn.functional.interpolate( + depth.unsqueeze(0).unsqueeze(1), size=target_size, mode="bicubic", align_corners=False + ).squeeze() + + results.append({"predicted_depth": depth}) + + return results + + +__all__ = ["DPTImageProcessor"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dpt/image_processing_dpt_fast.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dpt/image_processing_dpt_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..faaddb8023c08aee60e90142bef0cf44048e5ca2 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dpt/image_processing_dpt_fast.py @@ -0,0 +1,406 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/dpt/modular_dpt.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_dpt.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2025 HuggingFace Inc. team. All rights reserved. 
+# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from collections.abc import Iterable +from typing import TYPE_CHECKING, Optional, Union + +import torch +from torchvision.transforms.v2 import functional as F + +from ...image_processing_base import BatchFeature +from ...image_processing_utils_fast import BaseImageProcessorFast, DefaultFastImageProcessorKwargs +from ...image_transforms import group_images_by_shape, reorder_images +from ...image_utils import ( + IMAGENET_STANDARD_MEAN, + IMAGENET_STANDARD_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + SizeDict, + is_torch_tensor, +) +from ...processing_utils import Unpack +from ...utils import TensorType, auto_docstring, requires_backends + + +if TYPE_CHECKING: + from ...modeling_outputs import DepthEstimatorOutput + + +class DPTFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): + """ + ensure_multiple_of (`int`, *optional*, defaults to 1): + If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. Can be overridden + by `ensure_multiple_of` in `preprocess`. + size_divisor (`int`, *optional*): + If `do_pad` is `True`, pads the image dimensions to be divisible by this value. This was introduced in the + DINOv2 paper, which uses the model in combination with DPT. + keep_aspect_ratio (`bool`, *optional*, defaults to `False`): + If `True`, the image is resized to the largest possible size such that the aspect ratio is preserved. Can + be overridden by `keep_aspect_ratio` in `preprocess`. + do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`): + Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0 + is used for background, and background itself is not included in all classes of a dataset (e.g. + ADE20k). The background label will be replaced by 255. 
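+
+ As an illustrative example of how these options interact (a worked sketch, not an exhaustive specification): resizing a 480 x 640 (height x width) image with `size={"height": 384, "width": 384}`, `keep_aspect_ratio=True` and `ensure_multiple_of=32` keeps the scale closest to 1 (here 384 / 480 = 0.8) and yields an output of 384 x 512, both multiples of 32.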
+ """ + + ensure_multiple_of: Optional[int] + size_divisor: Optional[int] + keep_aspect_ratio: Optional[bool] + do_reduce_labels: Optional[bool] + + +def get_resize_output_image_size( + input_image: "torch.Tensor", + output_size: Union[int, Iterable[int]], + keep_aspect_ratio: bool, + multiple: int, +) -> SizeDict: + def constrain_to_multiple_of(val, multiple, min_val=0, max_val=None): + x = round(val / multiple) * multiple + + if max_val is not None and x > max_val: + x = math.floor(val / multiple) * multiple + + if x < min_val: + x = math.ceil(val / multiple) * multiple + + return x + + input_height, input_width = input_image.shape[-2:] + output_height, output_width = output_size + + # determine new height and width + scale_height = output_height / input_height + scale_width = output_width / input_width + + if keep_aspect_ratio: + # scale as little as possible + if abs(1 - scale_width) < abs(1 - scale_height): + # fit width + scale_height = scale_width + else: + # fit height + scale_width = scale_height + + new_height = constrain_to_multiple_of(scale_height * input_height, multiple=multiple) + new_width = constrain_to_multiple_of(scale_width * input_width, multiple=multiple) + + return SizeDict(height=new_height, width=new_width) + + +@auto_docstring +class DPTImageProcessorFast(BaseImageProcessorFast): + resample = PILImageResampling.BICUBIC + image_mean = IMAGENET_STANDARD_MEAN + image_std = IMAGENET_STANDARD_STD + size = {"height": 384, "width": 384} + default_to_square = True + crop_size = None + do_resize = True + do_center_crop = None + do_rescale = True + do_normalize = True + do_reduce_labels = None + + valid_kwargs = DPTFastImageProcessorKwargs + do_pad = False + rescale_factor = 1 / 255 + ensure_multiple_of = 1 + keep_aspect_ratio = False + + def __init__(self, **kwargs: Unpack[DPTFastImageProcessorKwargs]): + super().__init__(**kwargs) + + def reduce_label(self, labels: list["torch.Tensor"]): + for idx in range(len(labels)): + label = labels[idx] + label = torch.where(label == 0, torch.tensor(255, dtype=label.dtype), label) + label = label - 1 + label = torch.where(label == 254, torch.tensor(255, dtype=label.dtype), label) + labels[idx] = label + + return label + + @auto_docstring + def preprocess( + self, + images: ImageInput, + segmentation_maps: Optional[ImageInput] = None, + **kwargs: Unpack[DPTFastImageProcessorKwargs], + ) -> BatchFeature: + r""" + segmentation_maps (`ImageInput`, *optional*): + The segmentation maps to preprocess. + """ + return super().preprocess(images, segmentation_maps, **kwargs) + + def _preprocess_image_like_inputs( + self, + images: ImageInput, + segmentation_maps: Optional[ImageInput], + do_convert_rgb: bool, + input_data_format: ChannelDimension, + device: Optional[Union[str, "torch.device"]] = None, + **kwargs: Unpack[DPTFastImageProcessorKwargs], + ) -> BatchFeature: + """ + Preprocess image-like inputs. 
+ """ + images = self._prepare_image_like_inputs( + images=images, do_convert_rgb=do_convert_rgb, input_data_format=input_data_format, device=device + ) + images_kwargs = kwargs.copy() + images_kwargs["do_reduce_labels"] = False + batch_feature = self._preprocess(images, **images_kwargs) + + if segmentation_maps is not None: + processed_segmentation_maps = self._prepare_image_like_inputs( + images=segmentation_maps, + expected_ndims=2, + do_convert_rgb=False, + input_data_format=ChannelDimension.FIRST, + ) + + segmentation_maps_kwargs = kwargs.copy() + segmentation_maps_kwargs.update({"do_normalize": False, "do_rescale": False}) + processed_segmentation_maps = self._preprocess( + images=processed_segmentation_maps, **segmentation_maps_kwargs + ).pixel_values + batch_feature["labels"] = processed_segmentation_maps.squeeze(1).to(torch.int64) + + return batch_feature + + def _preprocess( + self, + images: list["torch.Tensor"], + do_reduce_labels: bool, + do_resize: bool, + size: SizeDict, + interpolation: Optional["F.InterpolationMode"], + do_center_crop: bool, + crop_size: SizeDict, + do_rescale: bool, + rescale_factor: float, + do_normalize: bool, + image_mean: Optional[Union[float, list[float]]], + image_std: Optional[Union[float, list[float]]], + keep_aspect_ratio: bool, + ensure_multiple_of: Optional[int], + do_pad: bool, + size_divisor: Optional[int], + disable_grouping: Optional[bool], + return_tensors: Optional[Union[str, TensorType]], + **kwargs, + ) -> BatchFeature: + if do_reduce_labels: + images = self.reduce_label(images) + + # Group images by size for batched resizing + grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping) + resized_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + if do_resize: + stacked_images = self.resize( + image=stacked_images, + size=size, + interpolation=interpolation, + ensure_multiple_of=ensure_multiple_of, + keep_aspect_ratio=keep_aspect_ratio, + ) + resized_images_grouped[shape] = stacked_images + resized_images = reorder_images(resized_images_grouped, grouped_images_index) + + # Group images by size for further processing + # Needed in case do_resize is False, or resize returns images with different sizes + grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping) + processed_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + if do_center_crop: + stacked_images = self.center_crop(stacked_images, crop_size) + if do_pad: + stacked_images = self.pad_image(stacked_images, size_divisor) + # Fused rescale and normalize + stacked_images = self.rescale_and_normalize( + stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std + ) + processed_images_grouped[shape] = stacked_images + + processed_images = reorder_images(processed_images_grouped, grouped_images_index) + processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images + return BatchFeature(data={"pixel_values": processed_images}) + + def post_process_semantic_segmentation(self, outputs, target_sizes: Optional[list[tuple]] = None): + """ + Converts the output of [`DPTForSemanticSegmentation`] into semantic segmentation maps. Only supports PyTorch. + + Args: + outputs ([`DPTForSemanticSegmentation`]): + Raw outputs of the model. + target_sizes (`list[Tuple]` of length `batch_size`, *optional*): + List of tuples corresponding to the requested final size (height, width) of each prediction. 
If unset, + predictions will not be resized. + + Returns: + semantic_segmentation: `list[torch.Tensor]` of length `batch_size`, where each item is a semantic + segmentation map of shape (height, width) corresponding to the target_sizes entry (if `target_sizes` is + specified). Each entry of each `torch.Tensor` correspond to a semantic class id. + """ + # TODO: add support for other frameworks + logits = outputs.logits + + # Resize logits and compute semantic segmentation maps + if target_sizes is not None: + if len(logits) != len(target_sizes): + raise ValueError( + "Make sure that you pass in as many target sizes as the batch dimension of the logits" + ) + + if is_torch_tensor(target_sizes): + target_sizes = target_sizes.numpy() + + semantic_segmentation = [] + + for idx in range(len(logits)): + resized_logits = torch.nn.functional.interpolate( + logits[idx].unsqueeze(dim=0), size=target_sizes[idx], mode="bilinear", align_corners=False + ) + semantic_map = resized_logits[0].argmax(dim=0) + semantic_segmentation.append(semantic_map) + else: + semantic_segmentation = logits.argmax(dim=1) + semantic_segmentation = [semantic_segmentation[i] for i in range(semantic_segmentation.shape[0])] + + return semantic_segmentation + + def resize( + self, + image: "torch.Tensor", + size: SizeDict, + interpolation: Optional["F.InterpolationMode"] = None, + antialias: bool = True, + ensure_multiple_of: Optional[int] = 1, + keep_aspect_ratio: bool = False, + ) -> "torch.Tensor": + """ + Resize an image to `(size["height"], size["width"])`. + + Args: + image (`torch.Tensor`): + Image to resize. + size (`SizeDict`): + Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image. + interpolation (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`): + `InterpolationMode` filter to use when resizing the image e.g. `InterpolationMode.BICUBIC`. + antialias (`bool`, *optional*, defaults to `True`): + Whether to use antialiasing when resizing the image + ensure_multiple_of (`int`, *optional*): + If `do_resize` is `True`, the image is resized to a size that is a multiple of this value + keep_aspect_ratio (`bool`, *optional*, defaults to `False`): + If `True`, and `do_resize` is `True`, the image is resized to the largest possible size such that the aspect ratio is preserved. + + Returns: + `torch.Tensor`: The resized image. + """ + if not size.height or not size.width: + raise ValueError(f"The size dictionary must contain the keys 'height' and 'width'. Got {size.keys()}") + + output_size = get_resize_output_image_size( + image, + output_size=(size.height, size.width), + keep_aspect_ratio=keep_aspect_ratio, + multiple=ensure_multiple_of, + ) + return super().resize(image, output_size, interpolation=interpolation, antialias=antialias) + + def pad_image( + self, + image: "torch.Tensor", + size_divisor: int = 1, + ) -> "torch.Tensor": + r""" + Center pad a batch of images to be a multiple of `size_divisor`. + + Args: + image (`torch.Tensor`): + Image to pad. Can be a batch of images of dimensions (N, C, H, W) or a single image of dimensions (C, H, W). + size_divisor (`int`): + The width and height of the image will be padded to a multiple of this number. 
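+
+ For example (illustrative): a 512 x 683 (height x width) image with `size_divisor=32` is padded to 512 x 704,
+ the 21 extra columns being split as 10 on the left and 11 on the right.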
+ """ + height, width = image.shape[-2:] + + def _get_pad(size, size_divisor): + new_size = math.ceil(size / size_divisor) * size_divisor + pad_size = new_size - size + pad_size_left = pad_size // 2 + pad_size_right = pad_size - pad_size_left + return pad_size_left, pad_size_right + + pad_top, pad_bottom = _get_pad(height, size_divisor) + pad_left, pad_right = _get_pad(width, size_divisor) + padding = (pad_left, pad_top, pad_right, pad_bottom) + return F.pad(image, padding) + + def post_process_depth_estimation( + self, + outputs: "DepthEstimatorOutput", + target_sizes: Optional[Union[TensorType, list[tuple[int, int]], None]] = None, + ) -> list[dict[str, TensorType]]: + """ + Converts the raw output of [`DepthEstimatorOutput`] into final depth predictions and depth PIL images. + Only supports PyTorch. + + Args: + outputs ([`DepthEstimatorOutput`]): + Raw outputs of the model. + target_sizes (`TensorType` or `List[Tuple[int, int]]`, *optional*): + Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size + (height, width) of each image in the batch. If left to None, predictions will not be resized. + + Returns: + `List[Dict[str, TensorType]]`: A list of dictionaries of tensors representing the processed depth + predictions. + """ + requires_backends(self, "torch") + + predicted_depth = outputs.predicted_depth + + if (target_sizes is not None) and (len(predicted_depth) != len(target_sizes)): + raise ValueError( + "Make sure that you pass in as many target sizes as the batch dimension of the predicted depth" + ) + + results = [] + target_sizes = [None] * len(predicted_depth) if target_sizes is None else target_sizes + for depth, target_size in zip(predicted_depth, target_sizes): + if target_size is not None: + depth = torch.nn.functional.interpolate( + depth.unsqueeze(0).unsqueeze(1), size=target_size, mode="bicubic", align_corners=False + ).squeeze() + + results.append({"predicted_depth": depth}) + + return results + + +__all__ = ["DPTImageProcessorFast"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dpt/modeling_dpt.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dpt/modeling_dpt.py new file mode 100644 index 0000000000000000000000000000000000000000..49e1bbcd9f0fc8823a74c64a8e7d9c578133fbaa --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dpt/modeling_dpt.py @@ -0,0 +1,1225 @@ +# coding=utf-8 +# Copyright 2022 Intel Labs, OpenMMLab and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch DPT (Dense Prediction Transformers) model. + +This implementation is heavily inspired by OpenMMLab's implementation, found here: +https://github.com/open-mmlab/mmsegmentation/blob/master/mmseg/models/decode_heads/dpt_head.py. 
+ +""" + +import collections.abc +from dataclasses import dataclass +from typing import Callable, Optional + +import torch +from torch import nn +from torch.nn import CrossEntropyLoss + +from ...activations import ACT2FN +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import BaseModelOutput, DepthEstimatorOutput, SemanticSegmenterOutput +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import ModelOutput, auto_docstring, logging, torch_int +from ...utils.backbone_utils import load_backbone +from ...utils.generic import can_return_tuple, check_model_inputs +from .configuration_dpt import DPTConfig + + +logger = logging.get_logger(__name__) + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for model's outputs that also contains intermediate activations that can be used at later stages. Useful + in the context of Vision models.: + """ +) +class BaseModelOutputWithIntermediateActivations(ModelOutput): + r""" + last_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + intermediate_activations (`tuple(torch.FloatTensor)`, *optional*): + Intermediate activations that can be used to compute hidden states of the model at various layers. + """ + + last_hidden_states: Optional[torch.FloatTensor] = None + intermediate_activations: Optional[tuple[torch.FloatTensor, ...]] = None + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for model's outputs that also contains a pooling of the last hidden states as well as intermediate + activations that can be used by the model at later stages. + """ +) +class BaseModelOutputWithPoolingAndIntermediateActivations(ModelOutput): + r""" + pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`): + Last layer hidden-state of the first token of the sequence (classification token) after further processing + through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns + the classification token after processing through a linear layer and a tanh activation function. The linear + layer weights are trained from the next sentence prediction (classification) objective during pretraining. + intermediate_activations (`tuple(torch.FloatTensor)`, *optional*): + Intermediate activations that can be used to compute hidden states of the model at various layers. + """ + + last_hidden_state: Optional[torch.FloatTensor] = None + pooler_output: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None + attentions: Optional[tuple[torch.FloatTensor, ...]] = None + intermediate_activations: Optional[tuple[torch.FloatTensor, ...]] = None + + +class DPTViTHybridEmbeddings(nn.Module): + """ + This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial + `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a + Transformer. 
+ """ + + def __init__(self, config: DPTConfig, feature_size: Optional[tuple[int, int]] = None): + super().__init__() + image_size, patch_size = config.image_size, config.patch_size + num_channels, hidden_size = config.num_channels, config.hidden_size + + image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size) + patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + + self.backbone = load_backbone(config) + feature_dim = self.backbone.channels[-1] + if len(self.backbone.channels) != 3: + raise ValueError(f"Expected backbone to have 3 output features, got {len(self.backbone.channels)}") + self.residual_feature_map_index = [0, 1] # Always take the output of the first and second backbone stage + + if feature_size is None: + feat_map_shape = config.backbone_featmap_shape + feature_size = feat_map_shape[-2:] + feature_dim = feat_map_shape[1] + else: + feature_size = ( + feature_size if isinstance(feature_size, collections.abc.Iterable) else (feature_size, feature_size) + ) + feature_dim = self.backbone.channels[-1] + + self.image_size = image_size + self.patch_size = patch_size[0] + self.num_channels = num_channels + + self.projection = nn.Conv2d(feature_dim, hidden_size, kernel_size=1) + + self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) + self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size)) + + def _resize_pos_embed(self, posemb, grid_size_height, grid_size_width, start_index=1): + posemb_tok = posemb[:, :start_index] + posemb_grid = posemb[0, start_index:] + + old_grid_size = torch_int(len(posemb_grid) ** 0.5) + + posemb_grid = posemb_grid.reshape(1, old_grid_size, old_grid_size, -1).permute(0, 3, 1, 2) + posemb_grid = nn.functional.interpolate(posemb_grid, size=(grid_size_height, grid_size_width), mode="bilinear") + posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, grid_size_height * grid_size_width, -1) + + posemb = torch.cat([posemb_tok, posemb_grid], dim=1) + + return posemb + + def forward( + self, pixel_values: torch.Tensor, interpolate_pos_encoding: bool = False + ) -> BaseModelOutputWithIntermediateActivations: + batch_size, num_channels, height, width = pixel_values.shape + if num_channels != self.num_channels: + raise ValueError( + "Make sure that the channel dimension of the pixel values match with the one set in the configuration." + ) + if not interpolate_pos_encoding: + if height != self.image_size[0] or width != self.image_size[1]: + raise ValueError( + f"Input image size ({height}*{width}) doesn't match model" + f" ({self.image_size[0]}*{self.image_size[1]})." 
+ ) + + position_embeddings = self._resize_pos_embed( + self.position_embeddings, height // self.patch_size, width // self.patch_size + ) + + backbone_output = self.backbone(pixel_values) + + features = backbone_output.feature_maps[-1] + + # Retrieve also the intermediate activations to use them at later stages + output_hidden_states = [backbone_output.feature_maps[index] for index in self.residual_feature_map_index] + + embeddings = self.projection(features).flatten(2).transpose(1, 2) + + cls_tokens = self.cls_token.expand(batch_size, -1, -1) + embeddings = torch.cat((cls_tokens, embeddings), dim=1) + + # add positional encoding to each token + embeddings = embeddings + position_embeddings + + # Return hidden states and intermediate activations + return BaseModelOutputWithIntermediateActivations( + last_hidden_states=embeddings, + intermediate_activations=output_hidden_states, + ) + + +class DPTViTEmbeddings(nn.Module): + """ + Construct the CLS token, position and patch embeddings. + + """ + + def __init__(self, config): + super().__init__() + + self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) + self.patch_embeddings = DPTViTPatchEmbeddings(config) + num_patches = self.patch_embeddings.num_patches + self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size)) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.config = config + + def _resize_pos_embed(self, posemb, grid_size_height, grid_size_width, start_index=1): + posemb_tok = posemb[:, :start_index] + posemb_grid = posemb[0, start_index:] + + old_grid_size = torch_int(posemb_grid.size(0) ** 0.5) + + posemb_grid = posemb_grid.reshape(1, old_grid_size, old_grid_size, -1).permute(0, 3, 1, 2) + posemb_grid = nn.functional.interpolate(posemb_grid, size=(grid_size_height, grid_size_width), mode="bilinear") + posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, grid_size_height * grid_size_width, -1) + + posemb = torch.cat([posemb_tok, posemb_grid], dim=1) + + return posemb + + def forward(self, pixel_values: torch.Tensor) -> BaseModelOutputWithIntermediateActivations: + batch_size, num_channels, height, width = pixel_values.shape + + # possibly interpolate position encodings to handle varying image sizes + patch_size = self.config.patch_size + position_embeddings = self._resize_pos_embed( + self.position_embeddings, height // patch_size, width // patch_size + ) + + embeddings = self.patch_embeddings(pixel_values) + + batch_size, seq_len, _ = embeddings.size() + + # add the [CLS] token to the embedded patch tokens + cls_tokens = self.cls_token.expand(batch_size, -1, -1) + embeddings = torch.cat((cls_tokens, embeddings), dim=1) + + # add positional encoding to each token + embeddings = embeddings + position_embeddings + + embeddings = self.dropout(embeddings) + + return BaseModelOutputWithIntermediateActivations(last_hidden_states=embeddings) + + +class DPTViTPatchEmbeddings(nn.Module): + """ + Image to Patch Embedding. 
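+ The `(batch_size, num_channels, height, width)` pixel values are split into non-overlapping patches by a strided
+ convolution and flattened to `(batch_size, num_patches, hidden_size)`; for instance, a 384x384 image with a patch
+ size of 16 yields 24 * 24 = 576 patches.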
+ + """ + + def __init__(self, config: DPTConfig): + super().__init__() + image_size, patch_size = config.image_size, config.patch_size + num_channels, hidden_size = config.num_channels, config.hidden_size + + image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size) + patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.num_patches = num_patches + + self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size) + + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + batch_size, num_channels, height, width = pixel_values.shape + if num_channels != self.num_channels: + raise ValueError( + "Make sure that the channel dimension of the pixel values match with the one set in the configuration." + ) + embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2) + return embeddings + + +# Copied from transformers.models.vit.modeling_vit.eager_attention_forward +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + dropout: float = 0.0, + **kwargs, +): + # Take the dot product between "query" and "key" to get the raw attention scores. + attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling + + # Normalize the attention scores to probabilities. + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + + # Mask heads if we want to + if attention_mask is not None: + attn_weights = attn_weights * attention_mask + + attn_output = torch.matmul(attn_weights, value) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +# Copied from transformers.models.vit.modeling_vit.ViTSelfAttention with ViT->DPT +class DPTSelfAttention(nn.Module): + def __init__(self, config: DPTConfig): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size {config.hidden_size} is not a multiple of the number of attention " + f"heads {config.num_attention_heads}." 
+ ) + + self.config = config + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.dropout_prob = config.attention_probs_dropout_prob + self.scaling = self.attention_head_size**-0.5 + self.is_causal = False + + self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + + def forward( + self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None + ) -> tuple[torch.Tensor, torch.Tensor]: + batch_size = hidden_states.shape[0] + new_shape = batch_size, -1, self.num_attention_heads, self.attention_head_size + + key_layer = self.key(hidden_states).view(*new_shape).transpose(1, 2) + value_layer = self.value(hidden_states).view(*new_shape).transpose(1, 2) + query_layer = self.query(hidden_states).view(*new_shape).transpose(1, 2) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + context_layer, attention_probs = attention_interface( + self, + query_layer, + key_layer, + value_layer, + head_mask, + is_causal=self.is_causal, + scaling=self.scaling, + dropout=0.0 if not self.training else self.dropout_prob, + ) + + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.reshape(new_context_layer_shape) + + return context_layer, attention_probs + + +# Copied from transformers.models.vit.modeling_vit.ViTSelfOutput with ViTConfig->DPTConfig, ViTSelfOutput->DPTViTSelfOutput +class DPTViTSelfOutput(nn.Module): + """ + The residual connection is defined in ViTLayer instead of here (as is the case with other models), due to the + layernorm applied before each block. 
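+ Consequently, this module only applies the output projection and dropout; the skip connection with the block
+ input is added in `DPTViTLayer.forward`.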
+ """ + + def __init__(self, config: DPTConfig): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + return hidden_states + + +# Copied from transformers.models.vit.modeling_vit.ViTAttention with ViTConfig->DPTConfig, ViTSelfAttention->DPTSelfAttention, ViTSelfOutput->DPTViTSelfOutput +class DPTViTAttention(nn.Module): + def __init__(self, config: DPTConfig): + super().__init__() + self.attention = DPTSelfAttention(config) + self.output = DPTViTSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads: set[int]): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.attention.query = prune_linear_layer(self.attention.query, index) + self.attention.key = prune_linear_layer(self.attention.key, index) + self.attention.value = prune_linear_layer(self.attention.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads) + self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward(self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None) -> torch.Tensor: + self_attn_output, _ = self.attention(hidden_states, head_mask) + output = self.output(self_attn_output, hidden_states) + return output + + +# Copied from transformers.models.vit.modeling_vit.ViTIntermediate with ViTConfig->DPTConfig, ViTIntermediate->DPTViTIntermediate +class DPTViTIntermediate(nn.Module): + def __init__(self, config: DPTConfig): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Copied from transformers.models.vit.modeling_vit.ViTOutput with ViTConfig->DPTConfig, ViTOutput->DPTViTOutput +class DPTViTOutput(nn.Module): + def __init__(self, config: DPTConfig): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = hidden_states + input_tensor + return hidden_states + + +# Copied from transformers.models.vit.modeling_vit.ViTLayer with ViTConfig->DPTConfig, ViTAttention->DPTViTAttention, ViTIntermediate->DPTViTIntermediate, ViTOutput->DPTViTOutput, ViTLayer->DPTViTLayer +class DPTViTLayer(GradientCheckpointingLayer): + """This corresponds to the Block class in the timm implementation.""" + + def __init__(self, config: DPTConfig): + super().__init__() + self.chunk_size_feed_forward = 
config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = DPTViTAttention(config) + self.intermediate = DPTViTIntermediate(config) + self.output = DPTViTOutput(config) + self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None) -> torch.Tensor: + hidden_states_norm = self.layernorm_before(hidden_states) + attention_output = self.attention(hidden_states_norm, head_mask) + + # first residual connection + hidden_states = attention_output + hidden_states + + # in ViT, layernorm is also applied after self-attention + layer_output = self.layernorm_after(hidden_states) + layer_output = self.intermediate(layer_output) + + # second residual connection is done here + layer_output = self.output(layer_output, hidden_states) + + return layer_output + + +# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2Encoder with Dinov2Config->DPTConfig, Dinov2->DPTViT +class DPTViTEncoder(nn.Module): + def __init__(self, config: DPTConfig): + super().__init__() + self.config = config + self.layer = nn.ModuleList([DPTViTLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None, output_hidden_states: bool = False + ) -> BaseModelOutput: + all_hidden_states = [hidden_states] if output_hidden_states else None + for i, layer_module in enumerate(self.layer): + layer_head_mask = head_mask[i] if head_mask is not None else None + hidden_states = layer_module(hidden_states, layer_head_mask) + if all_hidden_states: + all_hidden_states.append(hidden_states) + + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=tuple(all_hidden_states) if all_hidden_states else None, + ) + + +class DPTReassembleStage(nn.Module): + """ + This class reassembles the hidden states of the backbone into image-like feature representations at various + resolutions. + + This happens in 3 stages: + 1. Map the N + 1 tokens to a set of N tokens, by taking into account the readout ([CLS]) token according to + `config.readout_type`. + 2. Project the channel dimension of the hidden states according to `config.neck_hidden_sizes`. + 3. Resizing the spatial dimensions (height, width). + + Args: + config (`[DPTConfig]`): + Model configuration class defining the model architecture. + """ + + def __init__(self, config): + super().__init__() + + self.config = config + self.layers = nn.ModuleList() + if config.is_hybrid: + self._init_reassemble_dpt_hybrid(config) + else: + self._init_reassemble_dpt(config) + + self.neck_ignore_stages = config.neck_ignore_stages + + def _init_reassemble_dpt_hybrid(self, config): + r""" " + For DPT-Hybrid the first 2 reassemble layers are set to `nn.Identity()`, please check the official + implementation: https://github.com/isl-org/DPT/blob/f43ef9e08d70a752195028a51be5e1aff227b913/dpt/vit.py#L438 + for more details. 
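+
+ In addition, only `readout_type="project"` is supported for the hybrid variant, and the readout projections for
+ those first two positions are identity mappings as well.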
+ """ + for i, factor in zip(range(len(config.neck_hidden_sizes)), config.reassemble_factors): + if i <= 1: + self.layers.append(nn.Identity()) + elif i > 1: + self.layers.append(DPTReassembleLayer(config, channels=config.neck_hidden_sizes[i], factor=factor)) + + if config.readout_type != "project": + raise ValueError(f"Readout type {config.readout_type} is not supported for DPT-Hybrid.") + + # When using DPT-Hybrid the readout type is set to "project". The sanity check is done on the config file + self.readout_projects = nn.ModuleList() + hidden_size = _get_backbone_hidden_size(config) + for i in range(len(config.neck_hidden_sizes)): + if i <= 1: + self.readout_projects.append(nn.Sequential(nn.Identity())) + elif i > 1: + self.readout_projects.append( + nn.Sequential(nn.Linear(2 * hidden_size, hidden_size), ACT2FN[config.hidden_act]) + ) + + def _init_reassemble_dpt(self, config): + for i, factor in zip(range(len(config.neck_hidden_sizes)), config.reassemble_factors): + self.layers.append(DPTReassembleLayer(config, channels=config.neck_hidden_sizes[i], factor=factor)) + + if config.readout_type == "project": + self.readout_projects = nn.ModuleList() + hidden_size = _get_backbone_hidden_size(config) + for _ in range(len(config.neck_hidden_sizes)): + self.readout_projects.append( + nn.Sequential(nn.Linear(2 * hidden_size, hidden_size), ACT2FN[config.hidden_act]) + ) + + def forward(self, hidden_states: list[torch.Tensor], patch_height=None, patch_width=None) -> list[torch.Tensor]: + """ + Args: + hidden_states (`list[torch.FloatTensor]`, each of shape `(batch_size, sequence_length + 1, hidden_size)`): + List of hidden states from the backbone. + """ + out = [] + + for i, hidden_state in enumerate(hidden_states): + if i not in self.neck_ignore_stages: + # reshape to (batch_size, num_channels, height, width) + cls_token, hidden_state = hidden_state[:, 0], hidden_state[:, 1:] + batch_size, sequence_length, num_channels = hidden_state.shape + if patch_height is not None and patch_width is not None: + hidden_state = hidden_state.reshape(batch_size, patch_height, patch_width, num_channels) + else: + size = torch_int(sequence_length**0.5) + hidden_state = hidden_state.reshape(batch_size, size, size, num_channels) + hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous() + + feature_shape = hidden_state.shape + if self.config.readout_type == "project": + # reshape to (batch_size, height*width, num_channels) + hidden_state = hidden_state.flatten(2).permute((0, 2, 1)) + readout = cls_token.unsqueeze(1).expand_as(hidden_state) + # concatenate the readout token to the hidden states and project + hidden_state = self.readout_projects[i](torch.cat((hidden_state, readout), -1)) + # reshape back to (batch_size, num_channels, height, width) + hidden_state = hidden_state.permute(0, 2, 1).reshape(feature_shape) + elif self.config.readout_type == "add": + hidden_state = hidden_state.flatten(2) + cls_token.unsqueeze(-1) + hidden_state = hidden_state.reshape(feature_shape) + hidden_state = self.layers[i](hidden_state) + out.append(hidden_state) + + return out + + +def _get_backbone_hidden_size(config): + if config.backbone_config is not None and config.is_hybrid is False: + return config.backbone_config.hidden_size + else: + return config.hidden_size + + +class DPTReassembleLayer(nn.Module): + def __init__(self, config: DPTConfig, channels: int, factor: int): + super().__init__() + # projection + hidden_size = _get_backbone_hidden_size(config) + self.projection = nn.Conv2d(in_channels=hidden_size, 
out_channels=channels, kernel_size=1) + + # up/down sampling depending on factor + if factor > 1: + self.resize = nn.ConvTranspose2d(channels, channels, kernel_size=factor, stride=factor, padding=0) + elif factor == 1: + self.resize = nn.Identity() + elif factor < 1: + # so should downsample + self.resize = nn.Conv2d(channels, channels, kernel_size=3, stride=int(1 / factor), padding=1) + + def forward(self, hidden_state): + hidden_state = self.projection(hidden_state) + hidden_state = self.resize(hidden_state) + return hidden_state + + +class DPTFeatureFusionStage(nn.Module): + def __init__(self, config: DPTConfig): + super().__init__() + self.layers = nn.ModuleList() + for _ in range(len(config.neck_hidden_sizes)): + self.layers.append(DPTFeatureFusionLayer(config)) + + def forward(self, hidden_states): + # reversing the hidden_states, we start from the last + hidden_states = hidden_states[::-1] + + fused_hidden_states = [] + fused_hidden_state = None + for hidden_state, layer in zip(hidden_states, self.layers): + if fused_hidden_state is None: + # first layer only uses the last hidden_state + fused_hidden_state = layer(hidden_state) + else: + fused_hidden_state = layer(fused_hidden_state, hidden_state) + fused_hidden_states.append(fused_hidden_state) + + return fused_hidden_states + + +class DPTPreActResidualLayer(nn.Module): + """ + ResidualConvUnit, pre-activate residual unit. + + Args: + config (`[DPTConfig]`): + Model configuration class defining the model architecture. + """ + + def __init__(self, config: DPTConfig): + super().__init__() + + self.use_batch_norm = config.use_batch_norm_in_fusion_residual + use_bias_in_fusion_residual = ( + config.use_bias_in_fusion_residual + if config.use_bias_in_fusion_residual is not None + else not self.use_batch_norm + ) + + self.activation1 = nn.ReLU() + self.convolution1 = nn.Conv2d( + config.fusion_hidden_size, + config.fusion_hidden_size, + kernel_size=3, + stride=1, + padding=1, + bias=use_bias_in_fusion_residual, + ) + + self.activation2 = nn.ReLU() + self.convolution2 = nn.Conv2d( + config.fusion_hidden_size, + config.fusion_hidden_size, + kernel_size=3, + stride=1, + padding=1, + bias=use_bias_in_fusion_residual, + ) + + if self.use_batch_norm: + self.batch_norm1 = nn.BatchNorm2d(config.fusion_hidden_size) + self.batch_norm2 = nn.BatchNorm2d(config.fusion_hidden_size) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + residual = hidden_state + hidden_state = self.activation1(hidden_state) + + hidden_state = self.convolution1(hidden_state) + + if self.use_batch_norm: + hidden_state = self.batch_norm1(hidden_state) + + hidden_state = self.activation2(hidden_state) + hidden_state = self.convolution2(hidden_state) + + if self.use_batch_norm: + hidden_state = self.batch_norm2(hidden_state) + + return hidden_state + residual + + +class DPTFeatureFusionLayer(nn.Module): + """Feature fusion layer, merges feature maps from different stages. + + Args: + config (`[DPTConfig]`): + Model configuration class defining the model architecture. + align_corners (`bool`, *optional*, defaults to `True`): + The align_corner setting for bilinear upsample. 
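+
+ Each call optionally adds an incoming residual (resized to match and passed through a pre-activation residual
+ unit), refines the result with a second residual unit, upsamples by a factor of 2, and applies a 1x1 projection.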
+ """ + + def __init__(self, config: DPTConfig, align_corners: bool = True): + super().__init__() + + self.align_corners = align_corners + + self.projection = nn.Conv2d(config.fusion_hidden_size, config.fusion_hidden_size, kernel_size=1, bias=True) + + self.residual_layer1 = DPTPreActResidualLayer(config) + self.residual_layer2 = DPTPreActResidualLayer(config) + + def forward(self, hidden_state: torch.Tensor, residual: Optional[torch.Tensor] = None) -> torch.Tensor: + if residual is not None: + if hidden_state.shape != residual.shape: + residual = nn.functional.interpolate( + residual, size=(hidden_state.shape[2], hidden_state.shape[3]), mode="bilinear", align_corners=False + ) + hidden_state = hidden_state + self.residual_layer1(residual) + + hidden_state = self.residual_layer2(hidden_state) + hidden_state = nn.functional.interpolate( + hidden_state, scale_factor=2, mode="bilinear", align_corners=self.align_corners + ) + hidden_state = self.projection(hidden_state) + + return hidden_state + + +@auto_docstring +class DPTPreTrainedModel(PreTrainedModel): + config: DPTConfig + base_model_prefix = "dpt" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + _supports_sdpa = True + _supports_flash_attn = True + _supports_flex_attn = True + _supports_attention_backend = True + _can_record_outputs = { + "attentions": DPTSelfAttention, + } + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d, nn.ConvTranspose2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, (nn.LayerNorm, nn.BatchNorm2d)): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, (DPTViTEmbeddings, DPTViTHybridEmbeddings)): + module.cls_token.data.zero_() + module.position_embeddings.data.zero_() + + +@auto_docstring +class DPTModel(DPTPreTrainedModel): + def __init__(self, config: DPTConfig, add_pooling_layer: bool = True): + r""" + add_pooling_layer (bool, *optional*, defaults to `True`): + Whether to add a pooling layer + """ + super().__init__(config) + self.config = config + + # vit encoder + if config.is_hybrid: + self.embeddings = DPTViTHybridEmbeddings(config) + else: + self.embeddings = DPTViTEmbeddings(config) + self.encoder = DPTViTEncoder(config) + + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.pooler = DPTViTPooler(config) if add_pooling_layer else None + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + if self.config.is_hybrid: + return self.embeddings + else: + return self.embeddings.patch_embeddings + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @check_model_inputs(tie_last_hidden_states=False) + @auto_docstring + def forward( + self, + pixel_values: torch.FloatTensor, + head_mask: Optional[torch.FloatTensor] = None, + output_hidden_states: Optional[bool] = None, + **kwargs, + ) -> BaseModelOutputWithPoolingAndIntermediateActivations: + if output_hidden_states is None: + output_hidden_states = self.config.output_hidden_states + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output: BaseModelOutputWithIntermediateActivations = self.embeddings(pixel_values) + embedding_last_hidden_states = embedding_output.last_hidden_states + + encoder_outputs: BaseModelOutput = self.encoder( + embedding_last_hidden_states, head_mask=head_mask, output_hidden_states=output_hidden_states + ) + sequence_output = encoder_outputs.last_hidden_state + + sequence_output = self.layernorm(sequence_output) + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + return BaseModelOutputWithPoolingAndIntermediateActivations( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + intermediate_activations=embedding_output.intermediate_activations, + hidden_states=encoder_outputs.hidden_states, + ) + + +# Copied from transformers.models.vit.modeling_vit.ViTPooler with ViTConfig->DPTConfig, ViTPooler->DPTViTPooler +class DPTViTPooler(nn.Module): + def __init__(self, config: DPTConfig): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.pooler_output_size) + self.activation = ACT2FN[config.pooler_act] + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class DPTNeck(nn.Module): + """ + DPTNeck. A neck is a module that is normally used between the backbone and the head. It takes a list of tensors as + input and produces another list of tensors as output. For DPT, it includes 2 stages: + + * DPTReassembleStage + * DPTFeatureFusionStage. + + Args: + config (dict): config dict. + """ + + def __init__(self, config: DPTConfig): + super().__init__() + self.config = config + + # postprocessing: only required in case of a non-hierarchical backbone (e.g. 
ViT, BEiT) + if config.backbone_config is not None and config.backbone_config.model_type == "swinv2": + self.reassemble_stage = None + else: + self.reassemble_stage = DPTReassembleStage(config) + + self.convs = nn.ModuleList() + for channel in config.neck_hidden_sizes: + self.convs.append(nn.Conv2d(channel, config.fusion_hidden_size, kernel_size=3, padding=1, bias=False)) + + # fusion + self.fusion_stage = DPTFeatureFusionStage(config) + + def forward( + self, + hidden_states: list[torch.Tensor], + patch_height: Optional[int] = None, + patch_width: Optional[int] = None, + ) -> list[torch.Tensor]: + """ + Args: + hidden_states (`list[torch.FloatTensor]`, each of shape `(batch_size, sequence_length, hidden_size)` or `(batch_size, hidden_size, height, width)`): + List of hidden states from the backbone. + """ + if not isinstance(hidden_states, (tuple, list)): + raise TypeError("hidden_states should be a tuple or list of tensors") + + if len(hidden_states) != len(self.config.neck_hidden_sizes): + raise ValueError("The number of hidden states should be equal to the number of neck hidden sizes.") + + # postprocess hidden states + if self.reassemble_stage is not None: + hidden_states = self.reassemble_stage(hidden_states, patch_height, patch_width) + + features = [self.convs[i](feature) for i, feature in enumerate(hidden_states)] + + # fusion blocks + output = self.fusion_stage(features) + + return output + + +class DPTDepthEstimationHead(nn.Module): + """ + Output head consisting of 3 convolutional layers. It progressively halves the feature dimension and upsamples + the predictions to the input resolution after the first convolutional layer (details can be found in the paper's + supplementary material). + """ + + def __init__(self, config: DPTConfig): + super().__init__() + + self.config = config + + self.projection = None + if config.add_projection: + self.projection = nn.Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + + features = config.fusion_hidden_size + self.head = nn.Sequential( + nn.Conv2d(features, features // 2, kernel_size=3, stride=1, padding=1), + nn.Upsample(scale_factor=2, mode="bilinear", align_corners=True), + nn.Conv2d(features // 2, 32, kernel_size=3, stride=1, padding=1), + nn.ReLU(), + nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0), + nn.ReLU(), + ) + + def forward(self, hidden_states: list[torch.Tensor]) -> torch.Tensor: + # use last features + hidden_states = hidden_states[self.config.head_in_index] + + if self.projection is not None: + hidden_states = self.projection(hidden_states) + hidden_states = nn.ReLU()(hidden_states) + + predicted_depth = self.head(hidden_states) + predicted_depth = predicted_depth.squeeze(dim=1) + + return predicted_depth + + +@auto_docstring( + custom_intro=""" + DPT Model with a depth estimation head on top (consisting of 3 convolutional layers) e.g. for KITTI, NYUv2. 
+ """ +) +class DPTForDepthEstimation(DPTPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.backbone = None + if config.is_hybrid is False and (config.backbone_config is not None or config.backbone is not None): + self.backbone = load_backbone(config) + else: + self.dpt = DPTModel(config, add_pooling_layer=False) + + # Neck + self.neck = DPTNeck(config) + + # Depth estimation head + self.head = DPTDepthEstimationHead(config) + + # Initialize weights and apply final processing + self.post_init() + + @can_return_tuple + @auto_docstring + def forward( + self, + pixel_values: torch.FloatTensor, + head_mask: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_hidden_states: Optional[bool] = None, + **kwargs, + ) -> DepthEstimatorOutput: + r""" + labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*): + Ground truth depth estimation maps for computing the loss. + + Examples: + ```python + >>> from transformers import AutoImageProcessor, DPTForDepthEstimation + >>> import torch + >>> import numpy as np + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("Intel/dpt-large") + >>> model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large") + + >>> # prepare image for the model + >>> inputs = image_processor(images=image, return_tensors="pt") + + >>> with torch.no_grad(): + ... outputs = model(**inputs) + + >>> # interpolate to original size + >>> post_processed_output = image_processor.post_process_depth_estimation( + ... outputs, + ... target_sizes=[(image.height, image.width)], + ... 
) + + >>> # visualize the prediction + >>> predicted_depth = post_processed_output[0]["predicted_depth"] + >>> depth = predicted_depth * 255 / predicted_depth.max() + >>> depth = depth.detach().cpu().numpy() + >>> depth = Image.fromarray(depth.astype("uint8")) + ```""" + + if output_hidden_states is None: + output_hidden_states = self.config.output_hidden_states + + loss = None + if labels is not None: + raise NotImplementedError("Training is not implemented yet") + + if self.backbone is not None: + outputs = self.backbone.forward_with_filtered_kwargs(pixel_values, output_hidden_states=True, **kwargs) + hidden_states = outputs.feature_maps + else: + outputs = self.dpt(pixel_values, head_mask=head_mask, output_hidden_states=True, **kwargs) + hidden_states = outputs.hidden_states + # only keep certain features based on config.backbone_out_indices + # note that the hidden_states also include the initial embeddings + if not self.config.is_hybrid: + hidden_states = [ + feature for idx, feature in enumerate(hidden_states[1:]) if idx in self.config.backbone_out_indices + ] + else: + backbone_hidden_states = outputs.intermediate_activations + backbone_hidden_states.extend( + feature + for idx, feature in enumerate(hidden_states[1:]) + if idx in self.config.backbone_out_indices[2:] + ) + hidden_states = backbone_hidden_states + + patch_height, patch_width = None, None + if self.config.backbone_config is not None and self.config.is_hybrid is False: + _, _, height, width = pixel_values.shape + patch_size = self.config.backbone_config.patch_size + patch_height = height // patch_size + patch_width = width // patch_size + + hidden_states = self.neck(hidden_states, patch_height, patch_width) + predicted_depth = self.head(hidden_states) + + return DepthEstimatorOutput( + loss=loss, + predicted_depth=predicted_depth, + hidden_states=outputs.hidden_states if output_hidden_states else None, + attentions=outputs.attentions, + ) + + +class DPTSemanticSegmentationHead(nn.Module): + def __init__(self, config: DPTConfig): + super().__init__() + + self.config = config + features = config.fusion_hidden_size + self.head = nn.Sequential( + nn.Conv2d(features, features, kernel_size=3, padding=1, bias=False), + nn.BatchNorm2d(features), + nn.ReLU(), + nn.Dropout(config.semantic_classifier_dropout), + nn.Conv2d(features, config.num_labels, kernel_size=1), + nn.Upsample(scale_factor=2, mode="bilinear", align_corners=True), + ) + + def forward(self, hidden_states: list[torch.Tensor]) -> torch.Tensor: + # use last features + hidden_states = hidden_states[self.config.head_in_index] + logits = self.head(hidden_states) + return logits + + +class DPTAuxiliaryHead(nn.Module): + def __init__(self, config: DPTConfig): + super().__init__() + + features = config.fusion_hidden_size + self.head = nn.Sequential( + nn.Conv2d(features, features, kernel_size=3, padding=1, bias=False), + nn.BatchNorm2d(features), + nn.ReLU(), + nn.Dropout(0.1, False), + nn.Conv2d(features, config.num_labels, kernel_size=1), + ) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + logits = self.head(hidden_states) + return logits + + +@auto_docstring +class DPTForSemanticSegmentation(DPTPreTrainedModel): + def __init__(self, config: DPTConfig): + super().__init__(config) + + self.dpt = DPTModel(config, add_pooling_layer=False) + + # Neck + self.neck = DPTNeck(config) + + # Segmentation head(s) + self.head = DPTSemanticSegmentationHead(config) + self.auxiliary_head = DPTAuxiliaryHead(config) if config.use_auxiliary_head else None + + # 
Initialize weights and apply final processing + self.post_init() + + @can_return_tuple + @auto_docstring + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_hidden_states: Optional[bool] = None, + **kwargs, + ) -> SemanticSegmenterOutput: + r""" + labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*): + Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy). + + Examples: + ```python + >>> from transformers import AutoImageProcessor, DPTForSemanticSegmentation + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("Intel/dpt-large-ade") + >>> model = DPTForSemanticSegmentation.from_pretrained("Intel/dpt-large-ade") + + >>> inputs = image_processor(images=image, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> logits = outputs.logits + ```""" + if output_hidden_states is None: + output_hidden_states = self.config.output_hidden_states + + if labels is not None and self.config.num_labels == 1: + raise ValueError("The number of labels should be greater than one") + + outputs: BaseModelOutputWithPoolingAndIntermediateActivations = self.dpt( + pixel_values, head_mask=head_mask, output_hidden_states=True, **kwargs + ) + hidden_states = outputs.hidden_states + + # only keep certain features based on config.backbone_out_indices + # note that the hidden_states also include the initial embeddings + if not self.config.is_hybrid: + hidden_states = [ + feature for idx, feature in enumerate(hidden_states[1:]) if idx in self.config.backbone_out_indices + ] + else: + backbone_hidden_states = outputs.intermediate_activations + backbone_hidden_states.extend( + feature for idx, feature in enumerate(hidden_states[1:]) if idx in self.config.backbone_out_indices[2:] + ) + + hidden_states = backbone_hidden_states + + hidden_states = self.neck(hidden_states=hidden_states) + logits = self.head(hidden_states) + + auxiliary_logits = None + if self.auxiliary_head is not None: + auxiliary_logits = self.auxiliary_head(hidden_states[-1]) + + loss = None + if labels is not None: + # upsample logits to the images' original size + upsampled_logits = nn.functional.interpolate( + logits, size=labels.shape[-2:], mode="bilinear", align_corners=False + ) + if auxiliary_logits is not None: + upsampled_auxiliary_logits = nn.functional.interpolate( + auxiliary_logits, size=labels.shape[-2:], mode="bilinear", align_corners=False + ) + # compute weighted loss + loss_fct = CrossEntropyLoss(ignore_index=self.config.semantic_loss_ignore_index) + main_loss = loss_fct(upsampled_logits, labels) + auxiliary_loss = loss_fct(upsampled_auxiliary_logits, labels) + loss = main_loss + self.config.auxiliary_loss_weight * auxiliary_loss + + return SemanticSegmenterOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states if output_hidden_states else None, + attentions=outputs.attentions, + ) + + +__all__ = ["DPTForDepthEstimation", "DPTForSemanticSegmentation", "DPTModel", "DPTPreTrainedModel"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dpt/modular_dpt.py 
b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dpt/modular_dpt.py new file mode 100644 index 0000000000000000000000000000000000000000..34eb08f39b684cfca10624b73a5669b9d9577632 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/dpt/modular_dpt.py @@ -0,0 +1,299 @@ +# coding=utf-8 +# Copyright 2025 HuggingFace Inc. team. All rights reserved. +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from collections.abc import Iterable +from typing import TYPE_CHECKING, Optional, Union + +import torch + +from ...image_processing_base import BatchFeature +from ...image_processing_utils_fast import BaseImageProcessorFast, DefaultFastImageProcessorKwargs +from ...image_transforms import group_images_by_shape, reorder_images +from ...image_utils import ( + IMAGENET_STANDARD_MEAN, + IMAGENET_STANDARD_STD, + PILImageResampling, + SizeDict, +) +from ...utils import ( + TensorType, + auto_docstring, + requires_backends, +) +from ..beit.image_processing_beit_fast import BeitImageProcessorFast + + +if TYPE_CHECKING: + from ...modeling_outputs import DepthEstimatorOutput + +from torchvision.transforms.v2 import functional as F + + +def get_resize_output_image_size( + input_image: "torch.Tensor", + output_size: Union[int, Iterable[int]], + keep_aspect_ratio: bool, + multiple: int, +) -> SizeDict: + def constrain_to_multiple_of(val, multiple, min_val=0, max_val=None): + x = round(val / multiple) * multiple + + if max_val is not None and x > max_val: + x = math.floor(val / multiple) * multiple + + if x < min_val: + x = math.ceil(val / multiple) * multiple + + return x + + input_height, input_width = input_image.shape[-2:] + output_height, output_width = output_size + + # determine new height and width + scale_height = output_height / input_height + scale_width = output_width / input_width + + if keep_aspect_ratio: + # scale as little as possible + if abs(1 - scale_width) < abs(1 - scale_height): + # fit width + scale_height = scale_width + else: + # fit height + scale_width = scale_height + + new_height = constrain_to_multiple_of(scale_height * input_height, multiple=multiple) + new_width = constrain_to_multiple_of(scale_width * input_width, multiple=multiple) + + return SizeDict(height=new_height, width=new_width) + + +class DPTFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): + """ + ensure_multiple_of (`int`, *optional*, defaults to 1): + If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. Can be overridden + by `ensure_multiple_of` in `preprocess`. + size_divisor (`int`, *optional*): + If `do_pad` is `True`, pads the image dimensions to be divisible by this value. This was introduced in the + DINOv2 paper, which uses the model in combination with DPT. + keep_aspect_ratio (`bool`, *optional*, defaults to `False`): + If `True`, the image is resized to the largest possible size such that the aspect ratio is preserved. Can + be overridden by `keep_aspect_ratio` in `preprocess`. 
+ do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`): + Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0 + is used for background, and background itself is not included in all classes of a dataset (e.g. + ADE20k). The background label will be replaced by 255. + """ + + ensure_multiple_of: Optional[int] + size_divisor: Optional[int] + keep_aspect_ratio: Optional[bool] + do_reduce_labels: Optional[bool] + + +@auto_docstring +class DPTImageProcessorFast(BeitImageProcessorFast): + resample = PILImageResampling.BICUBIC + image_mean = IMAGENET_STANDARD_MEAN + image_std = IMAGENET_STANDARD_STD + size = {"height": 384, "width": 384} + do_resize = True + do_rescale = True + do_normalize = True + do_pad = False + rescale_factor = 1 / 255 + ensure_multiple_of = 1 + keep_aspect_ratio = False + do_reduce_labels = False + crop_size = None + do_center_crop = None + do_reduce_labels = None + + valid_kwargs = DPTFastImageProcessorKwargs + + def resize( + self, + image: "torch.Tensor", + size: SizeDict, + interpolation: Optional["F.InterpolationMode"] = None, + antialias: bool = True, + ensure_multiple_of: Optional[int] = 1, + keep_aspect_ratio: bool = False, + ) -> "torch.Tensor": + """ + Resize an image to `(size["height"], size["width"])`. + + Args: + image (`torch.Tensor`): + Image to resize. + size (`SizeDict`): + Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image. + interpolation (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`): + `InterpolationMode` filter to use when resizing the image e.g. `InterpolationMode.BICUBIC`. + antialias (`bool`, *optional*, defaults to `True`): + Whether to use antialiasing when resizing the image + ensure_multiple_of (`int`, *optional*): + If `do_resize` is `True`, the image is resized to a size that is a multiple of this value + keep_aspect_ratio (`bool`, *optional*, defaults to `False`): + If `True`, and `do_resize` is `True`, the image is resized to the largest possible size such that the aspect ratio is preserved. + + Returns: + `torch.Tensor`: The resized image. + """ + if not size.height or not size.width: + raise ValueError(f"The size dictionary must contain the keys 'height' and 'width'. Got {size.keys()}") + + output_size = get_resize_output_image_size( + image, + output_size=(size.height, size.width), + keep_aspect_ratio=keep_aspect_ratio, + multiple=ensure_multiple_of, + ) + return BaseImageProcessorFast.resize( + self, image, output_size, interpolation=interpolation, antialias=antialias + ) + + def pad_image( + self, + image: "torch.Tensor", + size_divisor: int = 1, + ) -> "torch.Tensor": + r""" + Center pad a batch of images to be a multiple of `size_divisor`. + + Args: + image (`torch.Tensor`): + Image to pad. Can be a batch of images of dimensions (N, C, H, W) or a single image of dimensions (C, H, W). + size_divisor (`int`): + The width and height of the image will be padded to a multiple of this number. 
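+ + Returns: + `torch.Tensor`: The center-padded image(s), with height and width padded up to the nearest multiple of `size_divisor`.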
+ """ + height, width = image.shape[-2:] + + def _get_pad(size, size_divisor): + new_size = math.ceil(size / size_divisor) * size_divisor + pad_size = new_size - size + pad_size_left = pad_size // 2 + pad_size_right = pad_size - pad_size_left + return pad_size_left, pad_size_right + + pad_top, pad_bottom = _get_pad(height, size_divisor) + pad_left, pad_right = _get_pad(width, size_divisor) + padding = (pad_left, pad_top, pad_right, pad_bottom) + return F.pad(image, padding) + + def _preprocess( + self, + images: list["torch.Tensor"], + do_reduce_labels: bool, + do_resize: bool, + size: SizeDict, + interpolation: Optional["F.InterpolationMode"], + do_center_crop: bool, + crop_size: SizeDict, + do_rescale: bool, + rescale_factor: float, + do_normalize: bool, + image_mean: Optional[Union[float, list[float]]], + image_std: Optional[Union[float, list[float]]], + keep_aspect_ratio: bool, + ensure_multiple_of: Optional[int], + do_pad: bool, + size_divisor: Optional[int], + disable_grouping: Optional[bool], + return_tensors: Optional[Union[str, TensorType]], + **kwargs, + ) -> BatchFeature: + if do_reduce_labels: + images = self.reduce_label(images) + + # Group images by size for batched resizing + grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping) + resized_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + if do_resize: + stacked_images = self.resize( + image=stacked_images, + size=size, + interpolation=interpolation, + ensure_multiple_of=ensure_multiple_of, + keep_aspect_ratio=keep_aspect_ratio, + ) + resized_images_grouped[shape] = stacked_images + resized_images = reorder_images(resized_images_grouped, grouped_images_index) + + # Group images by size for further processing + # Needed in case do_resize is False, or resize returns images with different sizes + grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping) + processed_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + if do_center_crop: + stacked_images = self.center_crop(stacked_images, crop_size) + if do_pad: + stacked_images = self.pad_image(stacked_images, size_divisor) + # Fused rescale and normalize + stacked_images = self.rescale_and_normalize( + stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std + ) + processed_images_grouped[shape] = stacked_images + + processed_images = reorder_images(processed_images_grouped, grouped_images_index) + processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images + return BatchFeature(data={"pixel_values": processed_images}) + + def post_process_depth_estimation( + self, + outputs: "DepthEstimatorOutput", + target_sizes: Optional[Union[TensorType, list[tuple[int, int]], None]] = None, + ) -> list[dict[str, TensorType]]: + """ + Converts the raw output of [`DepthEstimatorOutput`] into final depth predictions and depth PIL images. + Only supports PyTorch. + + Args: + outputs ([`DepthEstimatorOutput`]): + Raw outputs of the model. + target_sizes (`TensorType` or `List[Tuple[int, int]]`, *optional*): + Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size + (height, width) of each image in the batch. If left to None, predictions will not be resized. + + Returns: + `List[Dict[str, TensorType]]`: A list of dictionaries of tensors representing the processed depth + predictions. 
+ """ + requires_backends(self, "torch") + + predicted_depth = outputs.predicted_depth + + if (target_sizes is not None) and (len(predicted_depth) != len(target_sizes)): + raise ValueError( + "Make sure that you pass in as many target sizes as the batch dimension of the predicted depth" + ) + + results = [] + target_sizes = [None] * len(predicted_depth) if target_sizes is None else target_sizes + for depth, target_size in zip(predicted_depth, target_sizes): + if target_size is not None: + depth = torch.nn.functional.interpolate( + depth.unsqueeze(0).unsqueeze(1), size=target_size, mode="bicubic", align_corners=False + ).squeeze() + + results.append({"predicted_depth": depth}) + + return results + + +__all__ = ["DPTImageProcessorFast"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/edgetam_video/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/edgetam_video/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..669dd64ec3044b4689c8a15bfd55ff0749675820 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/edgetam_video/__init__.py @@ -0,0 +1,29 @@ +# coding=utf-8 +# Copyright 2025 the HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_edgetam_video import * + from .modeling_edgetam_video import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/edgetam_video/configuration_edgetam_video.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/edgetam_video/configuration_edgetam_video.py new file mode 100644 index 0000000000000000000000000000000000000000..954864397dcbe63c1d95bb5c60586b5fbedcaabc --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/edgetam_video/configuration_edgetam_video.py @@ -0,0 +1,435 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/edgetam_video/modular_edgetam_video.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_edgetam_video.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2025 the HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...configuration_utils import PretrainedConfig +from ..auto import CONFIG_MAPPING, AutoConfig + + +class EdgeTamVideoPromptEncoderConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`EdgeTamVideoPromptEncoder`]. The [`EdgeTamVideoPromptEncoder`] + module is used to encode the input 2D points and bounding boxes. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + hidden_size (`int`, *optional*, defaults to 256): + Dimensionality of the hidden states. + image_size (`int`, *optional*, defaults to 1024): + The expected output resolution of the image. + patch_size (`int`, *optional*, defaults to 16): + The size (resolution) of each patch. + mask_input_channels (`int`, *optional*, defaults to 16): + The number of channels to be fed to the `MaskDecoder` module. + num_point_embeddings (`int`, *optional*, defaults to 4): + The number of point embeddings to be used. + hidden_act (`str`, *optional*, defaults to `"gelu"`): + The non-linear activation function in the encoder and pooler. + layer_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the layer normalization layers. + scale (`float`, *optional*, defaults to 1): + The scale factor for the prompt encoder. + """ + + base_config_key = "prompt_encoder_config" + + def __init__( + self, + hidden_size=256, + image_size=1024, + patch_size=16, + mask_input_channels=16, + num_point_embeddings=4, + hidden_act="gelu", + layer_norm_eps=1e-6, + scale=1, + **kwargs, + ): + super().__init__(**kwargs) + self.hidden_size = hidden_size + self.image_size = image_size + self.patch_size = patch_size + self.mask_input_channels = mask_input_channels + self.num_point_embeddings = num_point_embeddings + self.hidden_act = hidden_act + self.layer_norm_eps = layer_norm_eps + self.scale = scale + + +class EdgeTamVideoMaskDecoderConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`EdgeTamVideoMaskDecoder`]. It is used to instantiate a EDGETAM_VIDEO + memory encoder according to the specified arguments, defining the model architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + hidden_size (`int`, *optional*, defaults to 256): + Dimensionality of the hidden states. + hidden_act (`str`, *optional*, defaults to `"gelu"`): + The non-linear activation function in the EDGETAM_VIDEO mask decoder. + mlp_dim (`int`, *optional*, defaults to 2048): + The dimension of the MLP in the two-way transformer. + num_hidden_layers (`int`, *optional*, defaults to 2): + The number of hidden layers in the two-way transformer. + num_attention_heads (`int`, *optional*, defaults to 8): + The number of attention heads in the two-way transformer. + attention_downsample_rate (`int`, *optional*, defaults to 2): + The downsample rate for the attention layers. 
+ num_multimask_outputs (`int`, *optional*, defaults to 3): + The number of multimask outputs. + iou_head_depth (`int`, *optional*, defaults to 3): + The depth of the IoU head. + iou_head_hidden_dim (`int`, *optional*, defaults to 256): + The hidden dimension of the IoU head. + dynamic_multimask_via_stability (`bool`, *optional*, defaults to `True`): + Whether to use dynamic multimask via stability. + dynamic_multimask_stability_delta (`float`, *optional*, defaults to 0.05): + The stability delta for the dynamic multimask. + dynamic_multimask_stability_thresh (`float`, *optional*, defaults to 0.98): + The stability threshold for the dynamic multimask. + + """ + + base_config_key = "mask_decoder_config" + + def __init__( + self, + hidden_size=256, + hidden_act="gelu", + mlp_dim=2048, + num_hidden_layers=2, + num_attention_heads=8, + attention_downsample_rate=2, + num_multimask_outputs=3, + iou_head_depth=3, + iou_head_hidden_dim=256, + dynamic_multimask_via_stability=True, + dynamic_multimask_stability_delta=0.05, + dynamic_multimask_stability_thresh=0.98, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.num_multimask_outputs = num_multimask_outputs + self.hidden_act = hidden_act + self.iou_head_depth = iou_head_depth + self.iou_head_hidden_dim = iou_head_hidden_dim + self.dynamic_multimask_via_stability = dynamic_multimask_via_stability + self.dynamic_multimask_stability_delta = dynamic_multimask_stability_delta + self.dynamic_multimask_stability_thresh = dynamic_multimask_stability_thresh + + # TwoWayTransformer configuration + self.num_hidden_layers = num_hidden_layers + self.hidden_size = hidden_size + self.num_attention_heads = num_attention_heads + self.mlp_dim = mlp_dim + self.attention_downsample_rate = attention_downsample_rate + + +class EdgeTamVideoConfig(PretrainedConfig): + r""" + [`EdgeTamVideoConfig`] is the configuration class to store the configuration of a [`EdgeTamVideoModel`]. It is used to instantiate a + EDGETAM model according to the specified arguments, defining the memory attention, memory encoder, and image encoder + configs. Instantiating a configuration defaults will yield a similar configuration to that of the SAM 2.1 Hiera-tiny + [facebook/EdgeTAM](https://huggingface.co/facebook/EdgeTAM) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vision_config (Union[`dict`, `EdgeTamVideoVisionConfig`], *optional*): + Dictionary of configuration options used to initialize [`EdgeTamVideoVisionConfig`]. + prompt_encoder_config (Union[`dict`, `EdgeTamVideoPromptEncoderConfig`], *optional*): + Dictionary of configuration options used to initialize [`EdgeTamVideoPromptEncoderConfig`]. + mask_decoder_config (Union[`dict`, `EdgeTamVideoMaskDecoderConfig`], *optional*): + Dictionary of configuration options used to initialize [`EdgeTamMaskDecoderConfig`]. + initializer_range (`float`, *optional*, defaults to 0.02): + Standard deviation for parameter initialization. + num_maskmem (`int`, *optional*, defaults to 7): + The number of memory slots for the mask memory. + image_size (`int`, *optional*, defaults to 1024): + The size of the input images. + sigmoid_scale_for_mem_enc (`float`, *optional*, defaults to 20.0): + Scale factor for the sigmoid function in the memory encoder. 
+ sigmoid_bias_for_mem_enc (`float`, *optional*, defaults to -10.0): + Bias for the sigmoid function in the memory encoder. + enable_occlusion_spatial_embedding (`bool`, *optional*, defaults to `True`): + Whether to enable spatial embedding for occlusions. + multimask_output_in_sam (`bool`, *optional*, defaults to `True`): + Whether to output multiple masks from the SAM head. + multimask_min_pt_num (`int`, *optional*, defaults to 0): + The minimum number of points to trigger multimask output. + multimask_max_pt_num (`int`, *optional*, defaults to 1): + The maximum number of points to trigger multimask output. + multimask_output_for_tracking (`bool`, *optional*, defaults to `True`): + Whether to use multimask output for tracking. + max_object_pointers_in_encoder (`int`, *optional*, defaults to 16): + The maximum number of object pointers in the encoder. + enable_temporal_pos_encoding_for_object_pointers (`bool`, *optional*, defaults to `True`): + Whether to enable temporal positional encoding for object pointers. + memory_attention_hidden_size (`int`, *optional*, defaults to 256): + Dimensionality of the memory attention hidden states. + memory_attention_num_layers (`int`, *optional*, defaults to 2): + The number of layers in the memory attention module. + memory_attention_num_attention_heads (`int`, *optional*, defaults to 1): + Number of attention heads for each attention layer in the memory attention. + memory_attention_downsample_rate (`int`, *optional*, defaults to 1): + The downsample rate for the attention layers. + memory_attention_mlp_hidden_size (`int`, *optional*, defaults to 2048): + The dimension of the feedforward network in the memory attention module. + memory_attention_mlp_hidden_act (`str`, *optional*, defaults to `"relu"`): + The non-linear activation function in the feedforward network in the memory attention module. + memory_attention_dropout (`float`, *optional*, defaults to 0.1): + The dropout rate for the memory attention module. + memory_attention_rope_theta (`float`, *optional*, defaults to 10000): + The Rope theta parameter. + memory_attention_rope_feat_sizes (`Tuple[int, int]`, *optional*, defaults to `[64, 64]`): + The feature sizes for the Rope positional encoding. + memory_attention_rope_k_sizes (`List[int]`, *optional*, defaults to `[16, 16]`): + The key feature sizes for the RoPE positional encoding in memory attention. + memory_attention_rope_dropout (`float`, *optional*, defaults to 0.1): + The dropout rate for the Rope positional encoding. + perceiver_resampler_num_latents (`int`, *optional*, defaults to 256): + The number of 1D latent tokens in the perceiver resampler. + perceiver_resampler_num_latents_2d (`int`, *optional*, defaults to 256): + The number of 2D latent tokens in the perceiver resampler. + perceiver_resampler_hidden_size (`int`, *optional*, defaults to 64): + The hidden size of the perceiver resampler. + perceiver_resampler_mlp_intermediate_size (`int`, *optional*, defaults to 256): + The intermediate size of the feedforward network in the perceiver resampler. + perceiver_resampler_num_attention_heads (`int`, *optional*, defaults to 1): + The number of attention heads in the perceiver resampler. + perceiver_resampler_attention_head_dim (`int`, *optional*, defaults to 64): + The dimension of each attention head in the perceiver resampler. + perceiver_resampler_num_layers (`int`, *optional*, defaults to 2): + The number of layers in the perceiver resampler. 
+ perceiver_resampler_hidden_dropout (`float`, *optional*, defaults to 0.0): + The dropout rate for the hidden layers in the perceiver resampler. + perceiver_resampler_attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout rate for the attention layers in the perceiver resampler. + memory_encoder_hidden_size (`int`, *optional*, defaults to 256): + Dimensionality of the memory encoder hidden states. + memory_encoder_output_channels (`int`, *optional*, defaults to 64): + The number of output channels for the memory encoder. + mask_downsampler_embed_dim (`int`, *optional*, defaults to 256): + The dimension of the mask downsampler embedding. + memory_fuser_intermediate_dim (`int`, *optional*, defaults to 1024): + The intermediate dimension of the memory fuser feedforward network. + mask_downsampler_kernel_size (`int`, *optional*, defaults to 3): + The kernel size for the mask downsampler. + mask_downsampler_stride (`int`, *optional*, defaults to 2): + The stride for the mask downsampler. + mask_downsampler_padding (`int`, *optional*, defaults to 1): + The padding for the mask downsampler. + mask_downsampler_total_stride (`int`, *optional*, defaults to 16): + The total stride for the mask downsampler. + mask_downsampler_hidden_act (`str`, *optional*, defaults to `"gelu"`): + The non-linear activation function in the mask downsampler. + memory_fuser_num_layers (`int`, *optional*, defaults to 2): + The number of layers in the memory fuser. + memory_fuser_embed_dim (`int`, *optional*, defaults to 256): + The dimension of the memory fuser embedding. + memory_fuser_kernel_size (`int`, *optional*, defaults to 7): + The kernel size for the memory fuser. + memory_fuser_padding (`int`, *optional*, defaults to 3): + The padding for the memory fuser. + memory_fuser_layer_scale_init_value (`float`, *optional*, defaults to 1e-06): + The initial value for the layer scale in the memory fuser. + memory_fuser_hidden_act (`str`, *optional*, defaults to `"gelu"`): + The non-linear activation function in the memory fuser. + + Example: + + ```python + >>> from transformers import ( + ... EdgeTamVisionConfig, + ... EdgeTamVideoPromptEncoderConfig, + ... EdgeTamVideoMaskDecoderConfig, + ... EdgeTamVideoModel, + ... EdgeTamVideoConfig, + ... 
) + + >>> # Initializing a EdgeTamVideoConfig with `"facebook/edgetam.1_hiera_tiny"` style configuration + >>> configuration = EdgeTamVideoConfig() + + >>> # Initializing a EdgeTamVideoModel (with random weights) from the `"facebook/edgetam.1_hiera_tiny"` style configuration + >>> model = EdgeTamVideoModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + + >>> # We can also initialize a EdgeTamConfig from a EdgeTamVisionConfig, EdgeTamPromptEncoderConfig, and EdgeTamMaskDecoderConfig + + >>> # Initializing EDGETAM vision encoder, memory attention, and memory encoder configurations + >>> vision_config = EdgeTamVisionConfig() + >>> prompt_encoder_config = EdgeTamVideoPromptEncoderConfig() + >>> mask_decoder_config = EdgeTamVideoMaskDecoderConfig() + + >>> config = EdgeTamVideoConfig(vision_config, prompt_encoder_config, mask_decoder_config) + ```""" + + model_type = "edgetam_video" + sub_configs = { + "vision_config": AutoConfig, + "prompt_encoder_config": EdgeTamVideoPromptEncoderConfig, + "mask_decoder_config": EdgeTamVideoMaskDecoderConfig, + } + + def __init__( + self, + vision_config=None, + prompt_encoder_config=None, + mask_decoder_config=None, + initializer_range=0.02, + num_maskmem=7, + image_size=1024, + sigmoid_scale_for_mem_enc=20.0, + sigmoid_bias_for_mem_enc=-10.0, + enable_occlusion_spatial_embedding=True, + multimask_output_in_sam=True, + multimask_min_pt_num=0, + multimask_max_pt_num=1, + multimask_output_for_tracking=True, + max_object_pointers_in_encoder=16, + enable_temporal_pos_encoding_for_object_pointers=True, + # memory attention + memory_attention_hidden_size=256, + memory_attention_num_layers=2, + memory_attention_num_attention_heads=1, + memory_attention_downsample_rate=1, + memory_attention_mlp_hidden_size=2048, + memory_attention_mlp_hidden_act="relu", + memory_attention_dropout=0.1, + memory_attention_rope_theta=10000, + memory_attention_rope_feat_sizes=None, + memory_attention_rope_k_sizes=None, + memory_attention_rope_dropout=0.1, + # spatial perceiver resampler + perceiver_resampler_num_latents=256, + perceiver_resampler_num_latents_2d=256, + perceiver_resampler_hidden_size=64, + perceiver_resampler_mlp_intermediate_size=256, + perceiver_resampler_num_attention_heads=1, + perceiver_resampler_attention_head_dim=64, + perceiver_resampler_num_layers=2, + perceiver_resampler_hidden_dropout=0.0, + perceiver_resampler_attention_dropout=0.0, + # memory encoder + memory_encoder_hidden_size=256, + memory_encoder_output_channels=64, + mask_downsampler_embed_dim=256, + memory_fuser_intermediate_dim=1024, + mask_downsampler_kernel_size=3, + mask_downsampler_stride=2, + mask_downsampler_padding=1, + mask_downsampler_total_stride=16, + mask_downsampler_hidden_act="gelu", + memory_fuser_num_layers=2, + memory_fuser_embed_dim=256, + memory_fuser_kernel_size=7, + memory_fuser_padding=3, + memory_fuser_layer_scale_init_value=1e-6, + memory_fuser_hidden_act="gelu", + **kwargs, + ): + super().__init__(**kwargs) + vision_config = vision_config if vision_config is not None else {} + prompt_encoder_config = prompt_encoder_config if prompt_encoder_config is not None else {} + mask_decoder_config = mask_decoder_config if mask_decoder_config is not None else {} + memory_attention_rope_feat_sizes = ( + [64, 64] if memory_attention_rope_feat_sizes is None else memory_attention_rope_feat_sizes + ) + memory_attention_rope_k_sizes = ( + [16, 16] if memory_attention_rope_k_sizes is None else memory_attention_rope_k_sizes + ) + + if 
isinstance(vision_config, dict): + vision_config["model_type"] = vision_config.get("model_type", "sam2_vision_model") + vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) + if isinstance(prompt_encoder_config, EdgeTamVideoPromptEncoderConfig): + prompt_encoder_config = prompt_encoder_config.to_dict() + if isinstance(mask_decoder_config, EdgeTamVideoMaskDecoderConfig): + mask_decoder_config = mask_decoder_config.to_dict() + + self.vision_config = vision_config + self.prompt_encoder_config = EdgeTamVideoPromptEncoderConfig(**prompt_encoder_config) + self.mask_decoder_config = EdgeTamVideoMaskDecoderConfig(**mask_decoder_config) + + self.initializer_range = initializer_range + self.num_maskmem = num_maskmem # default 1 input frame + 6 previous frames + self.image_size = image_size + self.sigmoid_scale_for_mem_enc = sigmoid_scale_for_mem_enc # scale factor for mask sigmoid prob + self.sigmoid_bias_for_mem_enc = sigmoid_bias_for_mem_enc # bias factor for mask sigmoid prob + self.enable_occlusion_spatial_embedding = enable_occlusion_spatial_embedding + self.multimask_output_in_sam = multimask_output_in_sam + self.multimask_min_pt_num = multimask_min_pt_num + self.multimask_max_pt_num = multimask_max_pt_num + self.multimask_output_for_tracking = multimask_output_for_tracking + self.max_object_pointers_in_encoder = max_object_pointers_in_encoder + self.enable_temporal_pos_encoding_for_object_pointers = enable_temporal_pos_encoding_for_object_pointers + + # memory attention + self.memory_attention_hidden_size = memory_attention_hidden_size + self.memory_attention_num_layers = memory_attention_num_layers + self.memory_attention_num_attention_heads = memory_attention_num_attention_heads + self.memory_attention_downsample_rate = memory_attention_downsample_rate + self.memory_attention_mlp_hidden_size = memory_attention_mlp_hidden_size + self.memory_attention_mlp_hidden_act = memory_attention_mlp_hidden_act + self.memory_attention_dropout = memory_attention_dropout + self.memory_attention_rope_theta = memory_attention_rope_theta + self.memory_attention_rope_feat_sizes = memory_attention_rope_feat_sizes + self.memory_attention_rope_k_sizes = memory_attention_rope_k_sizes + self.memory_attention_rope_dropout = memory_attention_rope_dropout + + # spatial perceiver resampler + self.perceiver_resampler_num_latents = perceiver_resampler_num_latents + self.perceiver_resampler_num_latents_2d = perceiver_resampler_num_latents_2d + self.perceiver_resampler_hidden_size = perceiver_resampler_hidden_size + self.perceiver_resampler_mlp_intermediate_size = perceiver_resampler_mlp_intermediate_size + self.perceiver_resampler_attention_head_dim = perceiver_resampler_attention_head_dim + self.perceiver_resampler_num_attention_heads = perceiver_resampler_num_attention_heads + self.perceiver_resampler_num_layers = perceiver_resampler_num_layers + self.perceiver_resampler_hidden_dropout = perceiver_resampler_hidden_dropout + self.perceiver_resampler_attention_dropout = perceiver_resampler_attention_dropout + + # memory encoder + self.memory_encoder_hidden_size = memory_encoder_hidden_size + self.memory_encoder_output_channels = memory_encoder_output_channels + self.mask_downsampler_embed_dim = mask_downsampler_embed_dim + self.mask_downsampler_kernel_size = mask_downsampler_kernel_size + self.mask_downsampler_stride = mask_downsampler_stride + self.mask_downsampler_padding = mask_downsampler_padding + self.mask_downsampler_total_stride = mask_downsampler_total_stride + 
self.mask_downsampler_hidden_act = mask_downsampler_hidden_act + self.memory_fuser_num_layers = memory_fuser_num_layers + self.memory_fuser_embed_dim = memory_fuser_embed_dim + self.memory_fuser_intermediate_dim = memory_fuser_intermediate_dim + self.memory_fuser_kernel_size = memory_fuser_kernel_size + self.memory_fuser_padding = memory_fuser_padding + self.memory_fuser_layer_scale_init_value = memory_fuser_layer_scale_init_value + self.memory_fuser_hidden_act = memory_fuser_hidden_act + + +__all__ = ["EdgeTamVideoMaskDecoderConfig", "EdgeTamVideoPromptEncoderConfig", "EdgeTamVideoConfig"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/edgetam_video/modeling_edgetam_video.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/edgetam_video/modeling_edgetam_video.py new file mode 100644 index 0000000000000000000000000000000000000000..3ba7ab4ebf2f18d2eacdb2cf1222d0dd8fb19fb6 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/edgetam_video/modeling_edgetam_video.py @@ -0,0 +1,3062 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/edgetam_video/modular_edgetam_video.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_edgetam_video.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2025 the HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from collections import OrderedDict +from collections.abc import Iterator +from dataclasses import dataclass +from typing import Any, Callable, Optional, Union + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor +from tqdm import tqdm + +from transformers.utils.generic import OutputRecorder + +from ...activations import ACT2FN +from ...modeling_flash_attention_utils import FlashAttentionKwargs +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import BaseModelOutput +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...processing_utils import Unpack +from ...pytorch_utils import compile_compatible_method_lru_cache +from ...utils import ModelOutput, auto_docstring +from ...utils.generic import TransformersKwargs +from ..auto import AutoModel +from .configuration_edgetam_video import ( + EdgeTamVideoConfig, + EdgeTamVideoMaskDecoderConfig, + EdgeTamVideoPromptEncoderConfig, +) + + +class EdgeTamVideoLayerNorm(nn.LayerNorm): + r"""LayerNorm that supports two data formats: channels_last (default) or channels_first. + The ordering of the dimensions in the inputs. 
channels_last corresponds to inputs with shape (batch_size, height, + width, channels) while channels_first corresponds to inputs with shape (batch_size, channels, height, width). + """ + + def __init__(self, normalized_shape, *, eps=1e-6, data_format="channels_last", **kwargs): + super().__init__(normalized_shape, eps=eps, **kwargs) + if data_format not in ["channels_last", "channels_first"]: + raise NotImplementedError(f"Unsupported data format: {data_format}") + self.data_format = data_format + + def forward(self, features: torch.Tensor) -> torch.Tensor: + """ + Args: + features: Tensor of shape (batch_size, channels, height, width) OR (batch_size, height, width, channels) + """ + if self.data_format == "channels_first": + features = features.permute(0, 2, 3, 1) + features = super().forward(features) + features = features.permute(0, 3, 1, 2) + else: + features = super().forward(features) + return features + + +# Lightly adapted from ConvNext (https://github.com/facebookresearch/ConvNeXt) +class EdgeTamVideoMemoryFuserCXBlock(GradientCheckpointingLayer): + def __init__(self, config: EdgeTamVideoConfig): + super().__init__() + self.depthwise_conv = nn.Conv2d( + config.memory_fuser_embed_dim, + config.memory_fuser_embed_dim, + kernel_size=config.memory_fuser_kernel_size, + padding=config.memory_fuser_padding, + groups=config.memory_fuser_embed_dim, + ) # depthwise conv + self.layer_norm = EdgeTamVideoLayerNorm(config.memory_fuser_embed_dim, eps=1e-6, data_format="channels_first") + self.activation = ACT2FN[config.memory_fuser_hidden_act] + self.pointwise_conv1 = nn.Linear( + config.memory_fuser_embed_dim, config.memory_fuser_intermediate_dim + ) # pointwise/1x1 convs, implemented with linear layers + self.pointwise_conv2 = nn.Linear(config.memory_fuser_intermediate_dim, config.memory_fuser_embed_dim) + self.scale = nn.Parameter( + config.memory_fuser_layer_scale_init_value * torch.ones(config.memory_fuser_embed_dim), + requires_grad=True, + ) + + def forward(self, hidden_states): + input = hidden_states + hidden_states = self.depthwise_conv(hidden_states) + hidden_states = self.layer_norm(hidden_states) + hidden_states = hidden_states.permute(0, 2, 3, 1) # (N, C, H, W) -> (N, H, W, C) + hidden_states = self.pointwise_conv1(hidden_states) + hidden_states = self.activation(hidden_states) + hidden_states = self.pointwise_conv2(hidden_states) + hidden_states = self.scale * hidden_states + hidden_states = hidden_states.permute(0, 3, 1, 2) # (N, H, W, C) -> (N, C, H, W) + + hidden_states = input + hidden_states + return hidden_states + + +@dataclass +@auto_docstring(custom_intro="Base class for the vision encoder's outputs.") +class EdgeTamVideoVisionEncoderOutput(ModelOutput): + r""" + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, height, width, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + fpn_hidden_states (`tuple(torch.FloatTensor)`): + Tuple of `torch.FloatTensor` (one for each feature level, from high to low resolution) of shape + `(batch_size, hidden_size, height, width)`. Feature maps from the Feature Pyramid Network neck. + fpn_position_encoding (`tuple(torch.FloatTensor)`): + Tuple of `torch.FloatTensor` (one for each feature level, from high to low resolution) of shape + `(batch_size, hidden_size, height, width)`. Positional encodings corresponding to the `fpn_hidden_states`. 
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each stage) of shape `(batch_size, height, width, hidden_size)`. Hidden-states of the + model at the output of each stage. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in + the self-attention heads. + """ + + last_hidden_state: Optional[torch.FloatTensor] = None + fpn_hidden_states: Optional[torch.FloatTensor] = None + fpn_position_encoding: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None + attentions: Optional[tuple[torch.FloatTensor, ...]] = None + + +class EdgeTamVideoVisionRotaryEmbedding(nn.Module): + """ + Vision Rotary Position Embedding for SAM2, following transformers library standards. + Supports 2D (axial) rotary embeddings for spatial dimensions. + """ + + def __init__(self, config: EdgeTamVideoConfig, end_x: Optional[int] = None, end_y: Optional[int] = None): + super().__init__() + dim = config.memory_attention_hidden_size // ( + config.memory_attention_downsample_rate * config.memory_attention_num_attention_heads + ) + # Ensure even dimension for proper axial splitting + if dim % 4 != 0: + raise ValueError("Dimension must be divisible by 4 for axial RoPE") + end_x, end_y = config.memory_attention_rope_feat_sizes if end_x is None else (end_x, end_y) + freqs = 1.0 / (config.memory_attention_rope_theta ** (torch.arange(0, dim, 4)[: (dim // 4)].float() / dim)) + + # Generate 2D position indices for axial rotary embedding + flattened_indices = torch.arange(end_x * end_y, dtype=torch.long) + x_positions = flattened_indices % end_x + y_positions = torch.div(flattened_indices, end_x, rounding_mode="floor") + freqs_x = torch.outer(x_positions, freqs).float() + freqs_y = torch.outer(y_positions, freqs).float() + inv_freq = torch.cat([freqs_x, freqs_y], dim=-1) + inv_freq = inv_freq.repeat_interleave(2, dim=-1) + # directly register the cos and sin embeddings as we have a fixed feature shape + self.register_buffer("rope_embeddings_cos", inv_freq.cos(), persistent=False) + self.register_buffer("rope_embeddings_sin", inv_freq.sin(), persistent=False) + + @torch.no_grad() + def forward(self) -> tuple[torch.Tensor, torch.Tensor]: + # As the feature map size is fixed, we can just return the pre-computed embeddings. 
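+ # Both buffers have shape (end_x * end_y, head_dim) and broadcast over the batch and head dimensions when applied.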
+ return self.rope_embeddings_cos, self.rope_embeddings_sin + + +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + dropout: float = 0.0, + **kwargs, +): + attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling + if attention_mask is not None: + attn_weights = attn_weights + attention_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + attn_output = torch.matmul(attn_weights, value) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +class EdgeTamVideoAttention(nn.Module): + """ + EDGETAM_VIDEO's attention layer that allows for downscaling the size of the embedding after projection to queries, keys, and + values. + """ + + def __init__(self, config, downsample_rate=None): + super().__init__() + downsample_rate = config.attention_downsample_rate if downsample_rate is None else downsample_rate + self.config = config + self.hidden_size = config.hidden_size + self.internal_dim = config.hidden_size // downsample_rate + self.num_attention_heads = config.num_attention_heads + self.head_dim = self.internal_dim // config.num_attention_heads + self.scaling = self.head_dim**-0.5 + self.is_causal = False + + self.q_proj = nn.Linear(self.hidden_size, self.internal_dim) + self.k_proj = nn.Linear(self.hidden_size, self.internal_dim) + self.v_proj = nn.Linear(self.hidden_size, self.internal_dim) + self.o_proj = nn.Linear(self.internal_dim, self.hidden_size) + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_similarity: Optional[torch.Tensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> tuple[torch.Tensor, torch.Tensor]: + # Input projections + batch_size, point_batch_size = query.shape[:2] + new_shape = (batch_size * point_batch_size, -1, self.num_attention_heads, self.head_dim) + + query = self.q_proj(query).view(*new_shape).transpose(1, 2) + key = self.k_proj(key).view(*new_shape).transpose(1, 2) + value = self.v_proj(value).view(*new_shape).transpose(1, 2) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query, + key, + value, + attention_mask=attention_similarity, + dropout=0.0, + scaling=self.scaling, + is_causal=self.is_causal, + **kwargs, + ) + + attn_output = attn_output.reshape( + batch_size, point_batch_size, -1, self.num_attention_heads * self.head_dim + ).contiguous() + attn_output = self.o_proj(attn_output) + + return attn_output, attn_weights + + +def rotate_pairwise(x): + """ + pairwise rotation of the hidden dims of the input. Differerent from Llama Half-Tensor Rotation. 
+ + This is an optimized version of the following more explicit implementation: + ```python + x_rotated = torch.zeros_like(x, dtype=x.dtype, device=x.device) + x_rotated[..., ::2] = -x[..., 1::2] + x_rotated[..., 1::2] = x[..., ::2] + return x_rotated + ``` + """ + x = x.view(*x.shape[:-1], -1, 2) + x1, x2 = x.unbind(dim=-1) + x = torch.stack((-x2, x1), dim=-1) + return x.flatten(start_dim=-2) + + +def apply_rotary_pos_emb_2d_self_attn( + q: torch.Tensor, + k: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, +) -> tuple[torch.Tensor, torch.Tensor]: + """ + Apply rotary position embedding to query and key tensors for self-attention. + + Args: + q: Query tensor of shape (..., seq_len, head_dim) + k: Key tensor of shape (..., seq_len, head_dim) + cos: Cosine position embedding of shape (seq_len, head_dim) + sin: Sine position embedding of shape (seq_len, head_dim) + + Returns: + Rotated (q, k) tensors + """ + # Apply RoPE to queries + q_embed = q.float() # force upscale to float32 as in the original implementation + q_embed = (q_embed * cos) + (rotate_pairwise(q_embed) * sin) + + # Apply RoPE to keys (same embeddings as queries for self-attention) + k_embed = k.float() # force upscale to float32 as in the original implementation + k_embed = (k_embed * cos) + (rotate_pairwise(k_embed) * sin) + + return q_embed.type_as(q), k_embed.type_as(k) + + +class EdgeTamVideoRoPESelfAttention(nn.Module): + """Self-attention with rotary position encoding.""" + + def __init__(self, config: EdgeTamVideoConfig): + super().__init__() + self.config = config + self.hidden_size = config.memory_attention_hidden_size + self.internal_dim = self.hidden_size // config.memory_attention_downsample_rate + self.num_attention_heads = config.memory_attention_num_attention_heads + self.head_dim = self.internal_dim // config.memory_attention_num_attention_heads + self.scaling = self.head_dim**-0.5 + self.is_causal = False + + self.q_proj = nn.Linear(self.hidden_size, self.internal_dim) + self.k_proj = nn.Linear(self.hidden_size, self.internal_dim) + self.v_proj = nn.Linear(self.hidden_size, self.internal_dim) + self.o_proj = nn.Linear(self.internal_dim, self.hidden_size) + self.dropout_p = config.memory_attention_rope_dropout + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + **kwargs: Unpack[FlashAttentionKwargs], + ) -> Tensor: + # Input projections + batch_size, point_batch_size = query.shape[:2] + new_shape = (batch_size * point_batch_size, -1, self.num_attention_heads, self.head_dim) + + query = self.q_proj(query).view(*new_shape).transpose(1, 2) + key = self.k_proj(key).view(*new_shape).transpose(1, 2) + value = self.v_proj(value).view(*new_shape).transpose(1, 2) + + cos, sin = position_embeddings + # Apply rotary position encoding for self-attention + query, key = apply_rotary_pos_emb_2d_self_attn(query, key, cos=cos, sin=sin) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query, + key, + value, + attention_mask=None, + dropout=0.0 if not self.training else self.dropout_p, + scaling=self.scaling, + is_causal=self.is_causal, + **kwargs, + ) + attn_output = attn_output.reshape( + batch_size, point_batch_size, -1, self.num_attention_heads * self.head_dim + ).contiguous() + attn_output = self.o_proj(attn_output) 
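+ # Note: attn_weights may be None when a non-eager attention implementation (e.g. sdpa) is selected.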
+ return attn_output, attn_weights + + +def apply_rotary_pos_emb_2d_cross_attn( + q: torch.Tensor, + k: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + cos_k: torch.Tensor, + sin_k: torch.Tensor, + num_k_exclude_rope: int = 0, + repeat_freqs_k: int = 1, +) -> tuple[torch.Tensor, torch.Tensor]: + """ + Apply rotary position embedding to query and key tensors for cross-attention. + + Args: + q: Query tensor of shape (..., seq_len, head_dim) + k: Key tensor of shape (..., seq_len, head_dim) + cos: Cosine position embedding of shape (seq_len, head_dim) + sin: Sine position embedding of shape (seq_len, head_dim) + cos_k: Cosine position embedding for keys of shape (seq_len, head_dim) + sin_k: Sine position embedding for keys of shape (seq_len, head_dim) + num_k_exclude_rope: Number of tokens at end of k to exclude from RoPE (e.g., object pointer tokens) + repeat_freqs_k: Frequency repetition for keys in cross-attention (e.g., for spatial memory tokens) + + Returns: + Rotated (q, k) tensors + """ + # Apply RoPE to queries (always straightforward) + q_embed = q.float() + q_embed = (q_embed * cos) + (rotate_pairwise(q_embed) * sin) + + # Split keys: RoPE tokens and excluded tokens (e.g., object pointers) + num_total_k_tokens = k.shape[-2] + k_for_rope = k[..., : num_total_k_tokens - num_k_exclude_rope, :] + k_excluded = k[..., num_total_k_tokens - num_k_exclude_rope :, :] + + # Early return if no keys need RoPE + if k_for_rope.shape[-2] == 0: + return q_embed.type_as(q), k_excluded + + batch_size, num_heads, k_seq_len, channels_per_head = k_for_rope.shape + + # Handle temporal/spatial token structure for memory + # Keys have temporal + spatial structure, only spatial tokens get RoPE + tokens_per_group = k_seq_len // repeat_freqs_k + spatial_tokens = cos_k.shape[-2] + temporal_tokens = tokens_per_group - spatial_tokens + + # Reshape and separate temporal/spatial tokens + k_grouped = k_for_rope.view(batch_size, num_heads, repeat_freqs_k, tokens_per_group, channels_per_head) + k_temporal = k_grouped[..., :temporal_tokens, :].reshape(batch_size, num_heads, -1, channels_per_head) + k_spatial = k_grouped[..., temporal_tokens:, :].reshape(batch_size, num_heads, -1, channels_per_head) + + # Only apply RoPE to spatial tokens + k_rope_input = k_spatial + + # Prepare position embeddings for repeated groups + if repeat_freqs_k > 1: + cos_k = cos_k.repeat(1, 1, repeat_freqs_k, 1) + sin_k = sin_k.repeat(1, 1, repeat_freqs_k, 1) + + # Apply RoPE to spatial tokens + k_spatial_embed = k_rope_input.float() + k_spatial_embed = (k_spatial_embed * cos_k) + (rotate_pairwise(k_spatial_embed) * sin_k) + + # Reconstruct: temporal + spatial tokens back to original structure + k_spatial_reshaped = k_spatial_embed.view(batch_size, num_heads, repeat_freqs_k, -1, channels_per_head) + k_temporal_reshaped = k_temporal.view(batch_size, num_heads, repeat_freqs_k, -1, channels_per_head) + k_final = torch.cat([k_temporal_reshaped, k_spatial_reshaped], dim=3) + k_final = k_final.view(batch_size, num_heads, k_seq_len, channels_per_head) + + # Combine RoPE-processed keys with excluded tokens + k_embed = torch.cat([k_final.type_as(k), k_excluded], dim=-2) + return q_embed.type_as(q), k_embed + + +class EdgeTamVideoRoPECrossAttention(nn.Module): + """Cross-attention with rotary position encoding.""" + + def __init__(self, config: EdgeTamVideoConfig, kv_in_dim: int): + super().__init__() + self.config = config + self.hidden_size = config.memory_attention_hidden_size + self.internal_dim = self.hidden_size // 
config.memory_attention_downsample_rate + self.num_attention_heads = config.memory_attention_num_attention_heads + self.head_dim = self.internal_dim // config.memory_attention_num_attention_heads + self.scaling = self.head_dim**-0.5 + self.is_causal = False + + self.kv_in_dim = kv_in_dim + + self.q_proj = nn.Linear(self.hidden_size, self.internal_dim) + self.k_proj = nn.Linear(self.kv_in_dim, self.internal_dim) + self.v_proj = nn.Linear(self.kv_in_dim, self.internal_dim) + self.o_proj = nn.Linear(self.internal_dim, self.hidden_size) + self.dropout_p = config.memory_attention_rope_dropout + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + position_embeddings_k: tuple[torch.Tensor, torch.Tensor], + num_k_exclude_rope: int = 0, + rope_k_repeat: int = 0, + **kwargs: Unpack[FlashAttentionKwargs], + ) -> Tensor: + # Input projections + batch_size, point_batch_size = query.shape[:2] + new_shape = (batch_size * point_batch_size, -1, self.num_attention_heads, self.head_dim) + + query = self.q_proj(query).view(*new_shape).transpose(1, 2) + key = self.k_proj(key).view(*new_shape).transpose(1, 2) + value = self.v_proj(value).view(*new_shape).transpose(1, 2) + + cos, sin = position_embeddings + cos_k, sin_k = position_embeddings_k + # Apply rotary position encoding for cross-attention + query, key = apply_rotary_pos_emb_2d_cross_attn( + query, + key, + cos=cos, + sin=sin, + cos_k=cos_k, + sin_k=sin_k, + repeat_freqs_k=rope_k_repeat, + num_k_exclude_rope=num_k_exclude_rope, + ) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query, + key, + value, + attention_mask=None, + dropout=0.0 if not self.training else self.dropout_p, + scaling=self.scaling, + is_causal=self.is_causal, + **kwargs, + ) + attn_output = attn_output.reshape( + batch_size, point_batch_size, -1, self.num_attention_heads * self.head_dim + ).contiguous() + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights + + +class EdgeTamVideoTwoWayAttentionBlock(nn.Module): + def __init__(self, config: EdgeTamVideoMaskDecoderConfig, skip_first_layer_pe: bool = False): + """ + A transformer block with four layers: + (1) self-attention of sparse inputs (2) cross attention of sparse inputs -> dense inputs (3) mlp block on + sparse inputs (4) cross attention of dense inputs -> sparse inputs + + Arguments: + config (`EdgeTamVideoMaskDecoderConfig`): + The configuration file used to instantiate the block + attention_downsample_rate (*optionalk*, int, defaults to 2): + The downsample ratio of the block used to reduce the inner dim of the attention. + skip_first_layer_pe (*optional*, bool, defaults to `False`): + Whether or not to skip the addition of the query_point_embedding on the first layer. 
+ """ + super().__init__() + self.self_attn = EdgeTamVideoAttention(config, downsample_rate=1) + self.layer_norm1 = nn.LayerNorm(config.hidden_size) + + self.cross_attn_token_to_image = EdgeTamVideoAttention(config) + self.layer_norm2 = nn.LayerNorm(config.hidden_size) + + self.mlp = EdgeTamVideoFeedForward( + config.hidden_size, config.mlp_dim, config.hidden_size, num_layers=config.num_hidden_layers + ) + self.layer_norm3 = nn.LayerNorm(config.hidden_size) + + self.layer_norm4 = nn.LayerNorm(config.hidden_size) + self.cross_attn_image_to_token = EdgeTamVideoAttention(config) + + self.skip_first_layer_pe = skip_first_layer_pe + + def forward( + self, + queries: Tensor, + keys: Tensor, + query_point_embedding: Tensor, + key_point_embedding: Tensor, + attention_similarity: Tensor, + **kwargs: Unpack[TransformersKwargs], + ): + # Self attention block + if self.skip_first_layer_pe: + queries, _ = self.self_attn(query=queries, key=queries, value=queries) + else: + query = queries + query_point_embedding + attn_out, _ = self.self_attn(query=query, key=query, value=queries) + queries = queries + attn_out + queries = self.layer_norm1(queries) + + # Cross attention block, tokens attending to image embedding + query = queries + query_point_embedding + key = keys + key_point_embedding + + attn_out, _ = self.cross_attn_token_to_image( + query=query, key=key, value=keys, attention_similarity=attention_similarity + ) + queries = queries + attn_out + + queries = self.layer_norm2(queries) + + # MLP block + mlp_out = self.mlp(queries) + queries = queries + mlp_out + queries = self.layer_norm3(queries) + + # Cross attention block, image embedding attending to tokens + query = queries + query_point_embedding + key = keys + key_point_embedding + + attn_out, _ = self.cross_attn_image_to_token(query=key, key=query, value=queries) + keys = keys + attn_out + + keys = self.layer_norm4(keys) + return queries, keys, attn_out + + +# copied and adapted from original implementation, also practically equal to DetrSinePositionEmbedding +class EdgeTamVideoPositionEmbeddingSine(nn.Module): + """ + This is a more standard version of the position embedding, very similar to the one used by the Attention is all you + need paper, generalized to work on images. 
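+
+    For a cumulative (optionally normalized) position value `p` along one axis and channel index `i` in
+    `[0, num_pos_feats)`, the embedding computed below is `sin(p / temperature**(2 * (i // 2) / num_pos_feats))`
+    for even `i` and the corresponding cosine for odd `i`; the y-axis and x-axis embeddings are then
+    concatenated (y first) along the channel dimension.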
+ """ + + def __init__( + self, num_pos_feats: int = 64, temperature: int = 10000, normalize: bool = False, scale: Optional[float] = None + ): + super().__init__() + if scale is not None and normalize is False: + raise ValueError("normalize should be True if scale is passed") + self.num_pos_feats = num_pos_feats + self.temperature = temperature + self.normalize = normalize + self.scale = 2 * math.pi if scale is None else scale + + @compile_compatible_method_lru_cache(maxsize=2) + def forward( + self, + shape: torch.Size, + device: Union[torch.device, str], + dtype: torch.dtype, + mask: Optional[Tensor] = None, + ) -> Tensor: + if mask is None: + mask = torch.zeros((shape[0], shape[2], shape[3]), device=device, dtype=torch.bool) + not_mask = (~mask).to(dtype) + y_embed = not_mask.cumsum(1) + x_embed = not_mask.cumsum(2) + if self.normalize: + eps = 1e-6 + y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale + x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale + + dim_t = torch.arange(self.num_pos_feats, dtype=torch.int64, device=device).to(dtype) + dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / self.num_pos_feats) + + pos_x = x_embed[:, :, :, None] / dim_t + pos_y = y_embed[:, :, :, None] / dim_t + pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) + return pos + + +class EdgeTamVideoMemoryFuser(nn.Module): + def __init__(self, config: EdgeTamVideoConfig): + super().__init__() + self.layers = nn.ModuleList( + [EdgeTamVideoMemoryFuserCXBlock(config) for _ in range(config.memory_fuser_num_layers)] + ) + + def forward(self, hidden_states): + # normally hidden_states: (N, C, H, W) + for layer in self.layers: + hidden_states = layer(hidden_states) + return hidden_states + + +class EdgeTamVideoMaskDownSamplerLayer(nn.Module): + def __init__(self, config: EdgeTamVideoConfig, in_channels: int, out_channels: int): + super().__init__() + self.conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size=config.mask_downsampler_kernel_size, + stride=config.mask_downsampler_stride, + padding=config.mask_downsampler_padding, + ) + self.layer_norm = EdgeTamVideoLayerNorm(out_channels, eps=1e-6, data_format="channels_first") + self.activation = ACT2FN[config.mask_downsampler_hidden_act] + + def forward(self, x): + return self.activation(self.layer_norm(self.conv(x))) + + +class EdgeTamVideoMaskDownSampler(nn.Module): + """ + Progressively downsample a mask by total_stride, each time by stride. + Note that LayerNorm is applied per *token*, like in ViT. + + With each downsample (by a factor stride**2), channel capacity increases by the same factor. + In the end, we linearly project to embed_dim channels. 
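+
+    For illustration (example values, not necessarily the configured defaults): with
+    `mask_downsampler_total_stride=16` and `mask_downsampler_stride=4`, two downsampling layers are built and
+    the channel count grows 1 -> 16 -> 256 before the final 1x1 projection to `mask_downsampler_embed_dim` channels.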
+ """ + + def __init__(self, config: EdgeTamVideoConfig): + super().__init__() + + num_layers = int(math.log2(config.mask_downsampler_total_stride) // math.log2(config.mask_downsampler_stride)) + + self.layers = nn.ModuleList() + self.activation = ACT2FN[config.mask_downsampler_hidden_act] + mask_in_chans, mask_out_chans = 1, 1 + for _ in range(num_layers): + mask_out_chans = mask_in_chans * (config.mask_downsampler_stride**2) + self.layers.append(EdgeTamVideoMaskDownSamplerLayer(config, mask_in_chans, mask_out_chans)) + mask_in_chans = mask_out_chans + + self.final_conv = nn.Conv2d(mask_out_chans, config.mask_downsampler_embed_dim, kernel_size=1) + + def forward(self, x): + for layer in self.layers: + x = layer(x) + x = self.final_conv(x) + return x + + +class EdgeTamVideoMemoryEncoder(nn.Module): + def __init__(self, config: EdgeTamVideoConfig): + super().__init__() + + hidden_size = config.memory_encoder_hidden_size + output_channels = config.memory_encoder_output_channels + self.mask_downsampler = EdgeTamVideoMaskDownSampler(config) + self.feature_projection = nn.Conv2d(hidden_size, hidden_size, kernel_size=1) + self.memory_fuser = EdgeTamVideoMemoryFuser(config) + self.position_encoding = EdgeTamVideoPositionEmbeddingSine(num_pos_feats=output_channels // 2, normalize=True) + self.projection = nn.Conv2d(hidden_size, output_channels, kernel_size=1) + + def forward( + self, + vision_features: torch.Tensor, + masks: torch.Tensor, + ) -> tuple[torch.Tensor, torch.Tensor]: + ## Process masks + masks = self.mask_downsampler(masks) + ## Fuse pixel_features and downsampled masks + + vision_features = self.feature_projection(vision_features) + vision_features = vision_features + masks + vision_features = self.memory_fuser(vision_features) + vision_features = self.projection(vision_features) + + vision_pos_enc = self.position_encoding(vision_features.shape, vision_features.device, vision_features.dtype) + + return vision_features, vision_pos_enc + + +class EdgeTamVideoFeedForward(nn.Module): + def __init__( + self, + input_dim: int, + hidden_dim: int, + output_dim: int, + num_layers: int, + activation: str = "relu", + sigmoid_output: bool = False, + ): + super().__init__() + self.num_layers = num_layers + self.activation = ACT2FN[activation] + self.proj_in = nn.Linear(input_dim, hidden_dim) + self.proj_out = nn.Linear(hidden_dim, output_dim) + self.layers = nn.ModuleList([nn.Linear(hidden_dim, hidden_dim) for _ in range(num_layers - 2)]) + self.sigmoid_output = sigmoid_output + + def forward(self, hidden_states): + hidden_states = self.proj_in(hidden_states) + hidden_states = self.activation(hidden_states) + for layer in self.layers: + hidden_states = self.activation(layer(hidden_states)) + + hidden_states = self.proj_out(hidden_states) + if self.sigmoid_output: + hidden_states = F.sigmoid(hidden_states) + return hidden_states + + +@auto_docstring +class EdgeTamVideoPreTrainedModel(PreTrainedModel): + config_class = EdgeTamVideoConfig + base_model_prefix = "edgetam_video" + main_input_name = "pixel_values" + _supports_sdpa = True + _supports_flash_attn_2 = True + _supports_attention_backend = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, (nn.Linear, nn.Conv2d, nn.ConvTranspose2d)): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + 
module.weight.data[module.padding_idx].zero_() + elif isinstance(module, (nn.LayerNorm, EdgeTamVideoLayerNorm)): + module.weight.data.fill_(1.0) + module.bias.data.zero_() + elif isinstance(module, EdgeTamVideoModel): + if module.no_memory_positional_encoding is not None: + module.no_memory_positional_encoding.data.zero_() + if module.memory_temporal_positional_encoding is not None: + module.memory_temporal_positional_encoding.data.zero_() + if module.no_object_pointer is not None: + module.no_object_pointer.data.zero_() + if module.occlusion_spatial_embedding_parameter is not None: + module.occlusion_spatial_embedding_parameter.data.zero_() + if isinstance(module, EdgeTamVideoMemoryFuserCXBlock): + if module.scale is not None: + module.scale.data.zero_() + + +class EdgeTamVideoInferenceCache: + """Cache for vision features and model constants.""" + + def __init__( + self, + inference_device: Union[torch.device, str] = "cpu", + inference_state_device: Union[torch.device, str] = "cpu", + max_vision_features_cache_size: int = 1, + ): + self.inference_device = inference_device + self.inference_state_device = inference_state_device + self.max_vision_features_cache_size = max_vision_features_cache_size + + self._vision_features = {} + + def cache_vision_features(self, frame_idx: int, features: dict): + """Cache vision features with automatic device management.""" + cached = {} + if len(self._vision_features) >= self.max_vision_features_cache_size: + # remove the oldest frame + self._vision_features.pop(min(self._vision_features.keys())) + + for key, value in features.items(): + if isinstance(value, torch.Tensor): + cached[key] = value.to(self.inference_state_device, non_blocking=True) + elif isinstance(value, (list, tuple)) and value and isinstance(value[0], torch.Tensor): + cached[key] = [v.to(self.inference_state_device, non_blocking=True) for v in value] + else: + cached[key] = value + self._vision_features[frame_idx] = cached + + def get_vision_features(self, frame_idx: int) -> Optional[dict]: + """Get cached vision features, automatically moved to inference device.""" + if frame_idx not in self._vision_features: + return None + + cached = self._vision_features[frame_idx] + moved = {} + for key, value in cached.items(): + if isinstance(value, torch.Tensor): + moved[key] = value.to(self.inference_device, non_blocking=True) + elif isinstance(value, (list, tuple)) and value and isinstance(value[0], torch.Tensor): + moved[key] = [v.to(self.inference_device, non_blocking=True) for v in value] + else: + moved[key] = value + return moved + + def clear_all(self): + """Clear all cached data.""" + self._vision_features.clear() + + +class EdgeTamVideoInferenceSession: + r""" + Manages video inference session parameters, state and cache. + + Args: + video (`torch.FloatTensor`, *optional*): + The video to process. No need to provide when streaming. + video_height (`int`, *optional*): + The height of the video. + video_width (`int`, *optional*): + The width of the video. + inference_device (`torch.device`, *optional*, defaults to `"cpu"`): + The device to use for inference. + inference_state_device (`torch.device`, *optional*, defaults to `"cpu"`): + The device to store the inference state on. + video_storage_device (`torch.device`, *optional*, defaults to `"cpu"`): + The device to store the video on. + dtype (`torch.dtype`, *optional*, defaults to `"float32"`): + The dtype to use for the video. 
+ max_vision_features_cache_size (`int`, *optional*, defaults to 1): + The maximum number of vision features to cache. + """ + + def __init__( + self, + video: Optional[torch.FloatTensor] = None, + video_height: Optional[int] = None, + video_width: Optional[int] = None, + inference_device: Union[torch.device, str] = "cpu", + inference_state_device: Union[torch.device, str] = "cpu", + video_storage_device: Union[torch.device, str] = "cpu", + dtype: Union[torch.dtype, str] = "float32", + max_vision_features_cache_size: int = 1, + ): + # store as a dictionary to avoid double memory allocation with torch.cat when adding new frames + self.processed_frames = ( + dict(enumerate(video.to(video_storage_device, dtype=dtype))) if video is not None else None + ) + self.video_height = video_height + self.video_width = video_width + + self.inference_device = inference_device + self.inference_state_device = inference_state_device + self.video_storage_device = video_storage_device + self.dtype = dtype + self.max_vision_features_cache_size = max_vision_features_cache_size + + # Cache for computed features + self.cache = EdgeTamVideoInferenceCache( + inference_device=self.inference_device, + inference_state_device=self.inference_state_device, + max_vision_features_cache_size=self.max_vision_features_cache_size, + ) + + # Persistent object tracking state + self._obj_id_to_idx = OrderedDict() + self._obj_idx_to_id = OrderedDict() + self.obj_ids = [] + + # Persistent user inputs + self.point_inputs_per_obj = {} + self.mask_inputs_per_obj = {} + + # Persistent model outputs/history + self.output_dict_per_obj = {} + self.frames_tracked_per_obj = {} + + # Session state flags + self.obj_with_new_inputs = [] + + @property + def num_frames(self) -> Optional[int]: + return len(self.processed_frames) if self.processed_frames is not None else None + + # Object management + def obj_id_to_idx(self, obj_id: int) -> int: + """Map object ID to index, creating new entry if needed.""" + obj_idx = self._obj_id_to_idx.get(obj_id, None) + if obj_idx is not None: + return obj_idx + + obj_idx = len(self._obj_id_to_idx) + self._obj_id_to_idx[obj_id] = obj_idx + self._obj_idx_to_id[obj_idx] = obj_id + self.obj_ids = list(self._obj_id_to_idx) + + self.point_inputs_per_obj[obj_idx] = {} + self.mask_inputs_per_obj[obj_idx] = {} + self.output_dict_per_obj[obj_idx] = { + "cond_frame_outputs": {}, + "non_cond_frame_outputs": {}, + } + self.frames_tracked_per_obj[obj_idx] = {} + + return obj_idx + + # Video Inference specific functions + def obj_idx_to_id(self, obj_idx: int) -> int: + """Map model-side object index to client-side object id.""" + return self._obj_idx_to_id[obj_idx] + + def get_obj_num(self) -> int: + """Get the total number of unique object ids received so far in this session.""" + return len(self._obj_idx_to_id) + + # Input management with device handling + def add_point_inputs(self, obj_idx: int, frame_idx: int, inputs: dict): + """Add point inputs with automatic device placement.""" + device_inputs = {} + for key, value in inputs.items(): + if isinstance(value, torch.Tensor): + device_inputs[key] = value.to(self.inference_device, non_blocking=True) + else: + device_inputs[key] = value + self.point_inputs_per_obj[obj_idx][frame_idx] = device_inputs + + def remove_point_inputs(self, obj_idx: int, frame_idx: int): + """Remove point inputs.""" + self.point_inputs_per_obj[obj_idx].pop(frame_idx, None) + + def add_mask_inputs(self, obj_idx: int, frame_idx: int, inputs: torch.Tensor): + """Add mask inputs with automatic device 
placement.""" + self.mask_inputs_per_obj[obj_idx][frame_idx] = inputs.to( + self.inference_device, dtype=self.dtype, non_blocking=True + ) + + def remove_mask_inputs(self, obj_idx: int, frame_idx: int): + """Remove mask inputs.""" + self.mask_inputs_per_obj[obj_idx].pop(frame_idx, None) + + # Output management with smart device placement + def store_output( + self, + obj_idx: int, + frame_idx: int, + output_key: Optional[str] = None, + output_value: Optional[Union[torch.Tensor, dict]] = None, + is_conditioning_frame: bool = True, + ): + """ + Store output with smart device management. + If output_key is None, the output is stored as a dictionary. + + Args: + obj_idx (int): The index of the object. + frame_idx (int): The index of the frame. + output_key (Optional[str]): The key of the output. If None, the output is stored as a dictionary. + output_value (Optional[Union[torch.Tensor, dict]]): The value of the output. + is_conditioning_frame (bool): Whether the output is for a conditioning frame. + """ + storage_key = "cond_frame_outputs" if is_conditioning_frame else "non_cond_frame_outputs" + + if output_key is None and isinstance(output_value, dict): + self.output_dict_per_obj[obj_idx][storage_key][frame_idx] = {} + for key, value in output_value.items(): + self.store_output(obj_idx, frame_idx, key, value, is_conditioning_frame) + return + + # Device placement: small tensors stay on inference device, large ones go to inference state device + if output_key in ["object_pointer", "object_score_logits"]: # Small tensors + self.output_dict_per_obj[obj_idx][storage_key][frame_idx][output_key] = output_value + elif isinstance(output_value, torch.Tensor): # Large tensors like masks, features + self.output_dict_per_obj[obj_idx][storage_key][frame_idx][output_key] = output_value.to( + self.inference_state_device, non_blocking=True + ) + else: + self.output_dict_per_obj[obj_idx][storage_key][frame_idx][output_key] = output_value + + def get_output( + self, + obj_idx: int, + frame_idx: int, + output_key: str, + is_conditioning_frame: bool = True, + ): + """ + Get output with smart device management. + + Args: + obj_idx (int): The index of the object. + frame_idx (int): The index of the frame. + output_key (str): The key of the output. + is_conditioning_frame (bool): Whether the output is for a conditioning frame. 
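+
+        Returns:
+            The stored value for `output_key`, moved to the inference device if it is a tensor, or `None` if no
+            output has been stored for this frame.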
+ """ + storage_key = "cond_frame_outputs" if is_conditioning_frame else "non_cond_frame_outputs" + out = self.output_dict_per_obj[obj_idx][storage_key].get(frame_idx, None) + # move to inference device if needed + if out is None: + return None + value = out[output_key] + if isinstance(value, torch.Tensor): + value = value.to(self.inference_device, non_blocking=True) + return value + + # Video frame management + def add_new_frame(self, pixel_values: torch.Tensor, frame_idx: Optional[int] = None) -> int: + """Add new frame with automatic device placement.""" + pixel_values = pixel_values.to(self.video_storage_device, dtype=self.dtype, non_blocking=True) + if pixel_values.dim() == 4: + pixel_values = pixel_values.squeeze(0) + + if frame_idx is None: + frame_idx = len(self.processed_frames) if self.processed_frames is not None else 0 + + if self.processed_frames is None: + self.processed_frames = {frame_idx: pixel_values} + else: + self.processed_frames[frame_idx] = pixel_values + + return frame_idx + + def get_frame(self, frame_idx: int) -> torch.Tensor: + """Get frame from video.""" + return self.processed_frames[frame_idx].to(self.inference_device, non_blocking=True) + + def reset_tracking_data(self): + """Reset tracking data but keep cache.""" + self._obj_id_to_idx.clear() + self._obj_idx_to_id.clear() + self.obj_ids.clear() + self.point_inputs_per_obj.clear() + self.mask_inputs_per_obj.clear() + self.output_dict_per_obj.clear() + self.frames_tracked_per_obj.clear() + self.obj_with_new_inputs = [] + # Note: cache and video data are preserved + + def reset_inference_session(self): + """Reset tracking data and cache.""" + self._obj_id_to_idx.clear() + self._obj_idx_to_id.clear() + self.obj_ids.clear() + self.point_inputs_per_obj.clear() + self.mask_inputs_per_obj.clear() + self.output_dict_per_obj.clear() + self.frames_tracked_per_obj.clear() + self.obj_with_new_inputs = [] + self.cache.clear_all() + + +class EdgeTamVideoMemoryAttentionMLP(nn.Module): + def __init__(self, config: EdgeTamVideoConfig): + super().__init__() + self.config = config + self.hidden_size = config.memory_attention_hidden_size + self.intermediate_size = config.memory_attention_mlp_hidden_size + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size) + self.dropout = nn.Dropout(config.memory_attention_dropout) + self.act_fn = ACT2FN[config.memory_attention_mlp_hidden_act] + + def forward(self, x): + return self.down_proj(self.dropout(self.act_fn(self.up_proj(x)))) + + +class EdgeTamVideoMemoryAttentionLayer(nn.Module): + def __init__(self, config: EdgeTamVideoConfig): + super().__init__() + hidden_size = config.memory_attention_hidden_size + self.self_attn = EdgeTamVideoRoPESelfAttention(config) + self.cross_attn_image = EdgeTamVideoRoPECrossAttention(config, kv_in_dim=64) + + # MLP module + self.mlp = EdgeTamVideoMemoryAttentionMLP(config) + + self.layer_norm1 = nn.LayerNorm(hidden_size) + self.layer_norm2 = nn.LayerNorm(hidden_size) + self.layer_norm3 = nn.LayerNorm(hidden_size) + self.dropout1 = nn.Dropout(config.memory_attention_dropout) + self.dropout2 = nn.Dropout(config.memory_attention_dropout) + self.dropout3 = nn.Dropout(config.memory_attention_dropout) + + def forward( + self, + queries: Tensor, + keys: Tensor, + key_point_embedding: Tensor, + rope_position_embeddings: tuple[Tensor, Tensor], + rope_position_embeddings_k: Optional[tuple[Tensor, Tensor]] = None, + num_k_exclude_rope: int = 0, + rope_k_repeat: int = 0, + ) -> 
torch.Tensor: + # Self-Attention + query = self.layer_norm1(queries) + query, _ = self.self_attn(query=query, key=query, value=query, position_embeddings=rope_position_embeddings) + queries = queries + self.dropout1(query) + + # Cross-Attention + query = self.layer_norm2(queries) + query, _ = self.cross_attn_image( + query=query, + key=keys + key_point_embedding, + value=keys, + position_embeddings=rope_position_embeddings, + position_embeddings_k=rope_position_embeddings_k, + num_k_exclude_rope=num_k_exclude_rope, + rope_k_repeat=rope_k_repeat, + ) + queries = queries + self.dropout2(query) + # MLP + query = self.layer_norm3(queries) + query = self.mlp(query) + queries = queries + self.dropout3(query) + return queries + + +class EdgeTamVideoMemoryAttention(nn.Module): + def __init__(self, config: EdgeTamVideoConfig): + super().__init__() + self.layers = nn.ModuleList( + [EdgeTamVideoMemoryAttentionLayer(config) for _ in range(config.memory_attention_num_layers)] + ) + self.layer_norm = nn.LayerNorm(config.memory_attention_hidden_size) + self.rotary_emb = EdgeTamVideoVisionRotaryEmbedding(config=config) + self.rotary_emb_k = EdgeTamVideoVisionRotaryEmbedding( + config, end_x=config.memory_attention_rope_k_sizes[0], end_y=config.memory_attention_rope_k_sizes[1] + ) + + def forward( + self, + current_vision_features: torch.Tensor, + memory: torch.Tensor, + current_vision_position_embeddings: Optional[Tensor] = None, + memory_posision_embeddings: Optional[Tensor] = None, + num_object_pointer_tokens: int = 0, + num_spatial_memory_tokens: int = -1, + ): + """ + Args: + current_vision_features (`torch.FloatTensor`): + The current vision features used for self-attention. + memory (`torch.FloatTensor`): + The memory features used for cross-attention. + current_vision_position_embeddings (`torch.FloatTensor`, *optional*): + The position embeddings for the current vision features. + memory_posision_embeddings (`torch.FloatTensor`, *optional*): + The position embeddings for the memory features. + num_object_pointer_tokens (`int`, *optional*, defaults to 0): + The number of object pointer tokens. 
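+            num_spatial_memory_tokens (`int`, *optional*, defaults to -1):
+                Forwarded to each memory-attention layer as `rope_k_repeat`, i.e. how many times the key rotary
+                position embeddings are repeated across the spatial memory tokens in cross-attention.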
+ """ + output = current_vision_features + if current_vision_position_embeddings is not None: + output = output + 0.1 * current_vision_position_embeddings + + # Convert to batch first + output = output.transpose(0, 1) + memory = memory.transpose(0, 1).unsqueeze(1) + memory_posision_embeddings = memory_posision_embeddings.transpose(0, 1).unsqueeze(1) + rope_position_embeddings = self.rotary_emb() + rope_position_embeddings_k = self.rotary_emb_k() + for layer in self.layers: + output = layer( + queries=output.unsqueeze(1) if output.ndim == 3 else output, + keys=memory, + key_point_embedding=memory_posision_embeddings, + rope_position_embeddings=rope_position_embeddings, + rope_position_embeddings_k=rope_position_embeddings_k, + num_k_exclude_rope=num_object_pointer_tokens, + rope_k_repeat=num_spatial_memory_tokens, + ) + + normed_output = self.layer_norm(output) + + # Convert back to seq first + normed_output = normed_output.transpose(0, 1) + + return normed_output + + +class EdgeTamVideoPerceiverMLP(nn.Module): + def __init__(self, config: EdgeTamVideoConfig): + super().__init__() + self.hidden_size = config.perceiver_resampler_hidden_size + self.intermediate_size = config.perceiver_resampler_mlp_intermediate_size + + self.layer_norm = nn.LayerNorm(self.hidden_size) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.act_fn = nn.GELU() + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.layer_norm(hidden_states) + hidden_states = self.down_proj(self.act_fn(self.up_proj(hidden_states))) + return hidden_states + + +class EdgeTamVideoPerceiverAttention(nn.Module): + def __init__(self, config: EdgeTamVideoConfig): + super().__init__() + self.config = config + self.hidden_size = config.perceiver_resampler_hidden_size + self.num_attention_heads = config.perceiver_resampler_num_attention_heads + self.head_dim = config.perceiver_resampler_attention_head_dim + self.attention_dropout = config.perceiver_resampler_attention_dropout + + self.inner_dim = self.head_dim * self.num_attention_heads + self.scaling = self.head_dim**-0.5 + self.is_causal = False + + self.q_proj = nn.Linear(self.hidden_size, self.inner_dim, bias=False) + self.k_proj = nn.Linear(self.hidden_size, self.inner_dim, bias=False) + self.v_proj = nn.Linear(self.hidden_size, self.inner_dim, bias=False) + self.o_proj = nn.Linear(self.inner_dim, self.hidden_size, bias=False) + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + positional_encoding: Optional[torch.Tensor] = None, + **kwargs, + ) -> torch.Tensor: + # Project queries, keys, and values + query = self.q_proj(query) + key = self.k_proj(key) + value = self.v_proj(value) + + # Reshape for multi-head attention + batch_size, seq_len_q = query.shape[:2] + query = query.view(batch_size, seq_len_q, self.num_attention_heads, self.head_dim).transpose(1, 2) + seq_len_kv = key.shape[1] + key = key.view(batch_size, seq_len_kv, self.num_attention_heads, self.head_dim).transpose(1, 2) + value = value.view(batch_size, seq_len_kv, self.num_attention_heads, self.head_dim).transpose(1, 2) + + # Add positional encoding if provided + if positional_encoding is not None: + pos_encoding = positional_encoding.view( + batch_size, seq_len_kv, self.num_attention_heads, self.head_dim + ).transpose(1, 2) + key = key + pos_encoding + value = value + pos_encoding + + # Apply attention + attention_interface: 
Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, _ = attention_interface( + self, + query, + key, + value, + attention_mask=None, + dropout=0.0 if not self.training else self.attention_dropout, + scaling=self.scaling, + is_causal=self.is_causal, + **kwargs, + ) + + # Reshape output + attn_output = attn_output.transpose(1, 2).contiguous().view(batch_size, seq_len_q, self.inner_dim) + return self.o_proj(attn_output) + + +class EdgeTamVideoPerceiverEncoderLayer(nn.Module): + def __init__(self, config: EdgeTamVideoConfig): + super().__init__() + + self.cross_attention = EdgeTamVideoPerceiverAttention(config) + self.mlp = EdgeTamVideoPerceiverMLP(config) + self.dropout = nn.Dropout(config.perceiver_resampler_hidden_dropout) + + self.self_attention = EdgeTamVideoPerceiverAttention(config) + self.self_mlp = EdgeTamVideoPerceiverMLP(config) + + # Layer norms moved from attention classes to here + self.layer_norm_input = nn.LayerNorm(config.perceiver_resampler_hidden_size) + self.layer_norm_latents = nn.LayerNorm(config.perceiver_resampler_hidden_size) + self.layer_norm_self = nn.LayerNorm(config.perceiver_resampler_hidden_size) + + def forward( + self, + latents: torch.Tensor, + input_features: torch.Tensor, + positional_encoding: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + # Cross attention with layer norms + normalized_latents = self.layer_norm_latents(latents) + normalized_input = self.layer_norm_input(input_features) + cross_attention_output = self.cross_attention( + query=normalized_latents, + key=normalized_input, + value=normalized_input, + positional_encoding=positional_encoding, + ) + latents = latents + self.dropout(cross_attention_output) + + mlp_output = self.mlp(latents) + latents = latents + mlp_output + + # Self attention with layer norm + normalized_latents_self = self.layer_norm_self(latents) + self_attention_output = self.self_attention( + query=normalized_latents_self, key=normalized_latents_self, value=normalized_latents_self + ) + latents = latents + self_attention_output + + self_mlp_output = self.self_mlp(latents) + latents = latents + self_mlp_output + + return latents + + +def window_partition(hidden_state, window_size): + """ + Partition into non-overlapping windows with padding if needed. + + Args: + hidden_state (`torch.Tensor`): + Input tokens with [batch_size, height, width, num_channels]. + window_size (`int`): + Window size. + + Returns: + `tuple(torch.FloatTensor)` comprising various elements: + - windows: windows after partition with [batch_size * num_windows, window_size, window_size, num_channels]. + - (padded_height, padded_width): padded height and width before partition + """ + batch_size, height, width, num_channels = hidden_state.shape + + pad_height = (window_size - height % window_size) % window_size + pad_width = (window_size - width % window_size) % window_size + + # Noop in case pad_width == 0 and pad_height == 0. 
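+    # F.pad takes pad sizes for the last dimensions first: (channels, width, height) pairs here, so only the
+    # right/bottom sides of width and height are padded.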
+ hidden_state = nn.functional.pad(hidden_state, (0, 0, 0, pad_width, 0, pad_height)) + + padded_height, padded_width = height + pad_height, width + pad_width + + hidden_state = hidden_state.view( + batch_size, padded_height // window_size, window_size, padded_width // window_size, window_size, num_channels + ) + windows = hidden_state.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, num_channels) + return windows, (padded_height, padded_width) + + +class EdgeTamVideoPerceiverResampler(nn.Module): + def __init__(self, config: EdgeTamVideoConfig): + super().__init__() + self.config = config + self.hidden_size = config.perceiver_resampler_hidden_size + self.num_latents_1d = config.perceiver_resampler_num_latents + self.num_latents_2d = config.perceiver_resampler_num_latents_2d + self.num_layers = config.perceiver_resampler_num_layers + + if self.num_latents_1d > 0: + self.latents_1d = nn.Parameter(torch.randn(self.num_latents_1d, self.hidden_size)) + if self.num_latents_2d > 0: + self.latents_2d = nn.Parameter(torch.randn(self.num_latents_2d, self.hidden_size)) + + self.positional_encoding = EdgeTamVideoPositionEmbeddingSine( + num_pos_feats=self.hidden_size // 2, normalize=True + ) + + self.layers = nn.ModuleList([EdgeTamVideoPerceiverEncoderLayer(config) for _ in range(self.num_layers)]) + + self.layer_norm = nn.LayerNorm(self.hidden_size) + + def forward( + self, + hidden_states: torch.Tensor, + positional_encoding: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + output_latents = [] + output_positional_encodings = [] + + if self.num_latents_1d > 0: + latents_1d, pos_1d = self._forward_1d(hidden_states, positional_encoding) + output_latents.append(latents_1d) + output_positional_encodings.append(pos_1d) + + if self.num_latents_2d > 0: + latents_2d, pos_2d = self._forward_2d(hidden_states) + output_latents.append(latents_2d) + output_positional_encodings.append(pos_2d) + + combined_latents = torch.cat(output_latents, dim=1) + + combined_positional_encoding = None + if positional_encoding is not None and output_positional_encodings: + combined_positional_encoding = torch.cat(output_positional_encodings, dim=1) + + return combined_latents, combined_positional_encoding + + def _forward_1d( + self, + hidden_states: torch.Tensor, + positional_encoding: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + batch_size = hidden_states.shape[0] + + latents = self.latents_1d.unsqueeze(0).expand(batch_size, -1, -1) + flattened_features = hidden_states.permute(0, 2, 3, 1).flatten(1, 2) + + positional_features = None + if positional_encoding is not None: + positional_features = positional_encoding.permute(0, 2, 3, 1).flatten(1, 2) + + for layer in self.layers: + latents = layer(latents, flattened_features, positional_features) + + latents = self.layer_norm(latents) + + output_positional_encoding = None + if positional_encoding is not None: + output_positional_encoding = torch.zeros_like(latents) + + return latents, output_positional_encoding + + def _forward_2d(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + batch_size, channels, height, width = hidden_states.shape + + latents_2d = self.latents_2d.unsqueeze(0).expand(batch_size, -1, -1).view(-1, 1, channels) + + num_windows_per_dim = int(math.sqrt(self.num_latents_2d)) + window_size = height // num_windows_per_dim + + windowed_input = hidden_states.permute(0, 2, 3, 1) + windowed_features, _ = window_partition(windowed_input, 
window_size) + windowed_features = windowed_features.flatten(1, 2) + + for layer in self.layers: + latents_2d = layer(latents_2d, windowed_features, positional_encoding=None) + + latents_2d = latents_2d.view(batch_size, num_windows_per_dim, num_windows_per_dim, channels).permute( + 0, 3, 1, 2 + ) + + positional_encoding_2d = self.positional_encoding(latents_2d.shape, latents_2d.device, latents_2d.dtype).to( + dtype=hidden_states.dtype + ) + positional_encoding_2d = positional_encoding_2d.permute(0, 2, 3, 1).flatten(1, 2) + + latents_2d = latents_2d.permute(0, 2, 3, 1).flatten(1, 2) + latents_2d = self.layer_norm(latents_2d) + + return latents_2d, positional_encoding_2d + + +@dataclass +@auto_docstring(custom_intro="Base class for the EdgeTamVideo model's output.") +class EdgeTamVideoImageSegmentationOutput(ModelOutput): + r""" + iou_scores (`torch.FloatTensor` of shape `(batch_size, point_batch_size, num_masks)`): + The Intersection over Union (IoU) scores of the predicted masks. + pred_masks (`torch.FloatTensor` of shape `(batch_size, point_batch_size, num_masks, height, width)`): + The predicted low-resolution masks. This is an alias for `low_res_masks`. These masks need to be post-processed + by the processor to be brought to the original image size. + object_score_logits (`torch.FloatTensor` of shape `(batch_size, point_batch_size, 1)`): + Logits for the object score, indicating if an object is present. + image_embeddings (`tuple(torch.FloatTensor)`): + The features from the FPN, which are used by the mask decoder. This is a tuple of `torch.FloatTensor` where each + tensor has shape `(batch_size, channels, height, width)`. + vision_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of each stage) of shape `(batch_size, height, width, hidden_size)`. + Hidden-states of the vision model at the output of each stage. + vision_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. + Attentions weights of the vision model. + mask_decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. + Attentions weights of the mask decoder. + high_res_masks (`torch.FloatTensor` of shape `(batch_size, point_batch_size, num_masks, image_size, image_size)`, *optional*): + The predicted masks, upscaled to the original image size. Only used for EdgeTamVideoModel. + object_pointer (`torch.FloatTensor` of shape `(batch_size, point_batch_size, hidden_size)`, *optional*): + A tensor representing the object pointer, used for tracking in videos. Only used for EdgeTamVideoModel. + """ + + iou_scores: Optional[torch.FloatTensor] = None + pred_masks: Optional[torch.FloatTensor] = None + object_score_logits: Optional[torch.FloatTensor] = None + image_embeddings: tuple[torch.FloatTensor, ...] 
= None + vision_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None + vision_attentions: Optional[tuple[torch.FloatTensor, ...]] = None + mask_decoder_attentions: Optional[tuple[torch.FloatTensor, ...]] = None + + high_res_masks: Optional[torch.FloatTensor] = None + object_pointer: Optional[torch.FloatTensor] = None + + +@dataclass +@auto_docstring(custom_intro="Base class for the Sam2 model's output.") +class EdgeTamVideoSegmentationOutput(ModelOutput): + r""" + pred_masks (`torch.FloatTensor` of shape `(batch_size, num_masks, height, width)`): + The predicted masks stored at the model's resolution. + frame_idx (`int`): + The frame index of the video. + """ + + pred_masks: Optional[torch.FloatTensor] = None + frame_idx: Optional[int] = None + + +class EdgeTamVideoPositionalEmbedding(nn.Module): + def __init__(self, config: EdgeTamVideoPromptEncoderConfig): + super().__init__() + self.scale = config.scale + positional_embedding = self.scale * torch.randn((2, config.hidden_size // 2)) + self.register_buffer("positional_embedding", positional_embedding) + + def forward(self, input_coords, input_shape=None): + """Positionally encode points that are normalized to [0,1].""" + coordinates = input_coords.clone() + + if input_shape is not None: + coordinates[:, :, :, 0] = coordinates[:, :, :, 0] / input_shape[1] + coordinates[:, :, :, 1] = coordinates[:, :, :, 1] / input_shape[0] + coordinates.to(torch.float32) + + # assuming coords are in [0, 1]^2 square and have d_1 x ... x d_n x 2 shape + coordinates = 2 * coordinates - 1 + coordinates = coordinates.to(self.positional_embedding.dtype) + coordinates = coordinates @ self.positional_embedding + coordinates = 2 * np.pi * coordinates + # outputs d_1 x ... x d_n x channel shape + return torch.cat([torch.sin(coordinates), torch.cos(coordinates)], dim=-1) + + +class EdgeTamVideoMaskEmbedding(nn.Module): + def __init__(self, config: EdgeTamVideoPromptEncoderConfig): + super().__init__() + self.mask_input_channels = config.mask_input_channels // 4 + self.activation = ACT2FN[config.hidden_act] + self.conv1 = nn.Conv2d(1, self.mask_input_channels, kernel_size=2, stride=2) + self.conv2 = nn.Conv2d(self.mask_input_channels, config.mask_input_channels, kernel_size=2, stride=2) + self.conv3 = nn.Conv2d(config.mask_input_channels, config.hidden_size, kernel_size=1) + self.layer_norm1 = EdgeTamVideoLayerNorm( + self.mask_input_channels, eps=config.layer_norm_eps, data_format="channels_first" + ) + self.layer_norm2 = EdgeTamVideoLayerNorm( + self.mask_input_channels * 4, eps=config.layer_norm_eps, data_format="channels_first" + ) + + def forward(self, masks): + hidden_states = self.conv1(masks) + hidden_states = self.layer_norm1(hidden_states) + hidden_states = self.activation(hidden_states) + + hidden_states = self.conv2(hidden_states) + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.activation(hidden_states) + dense_embeddings = self.conv3(hidden_states) + return dense_embeddings + + +class EdgeTamVideoPromptEncoder(nn.Module): + def __init__(self, config: EdgeTamVideoPromptEncoderConfig): + super().__init__() + self.shared_embedding = EdgeTamVideoPositionalEmbedding(config) + self.mask_embed = EdgeTamVideoMaskEmbedding(config) + self.no_mask_embed = nn.Embedding(1, config.hidden_size) + + self.image_embedding_size = (config.image_size // config.patch_size, config.image_size // config.patch_size) + self.mask_input_size = (4 * config.image_size // config.patch_size, 4 * config.image_size // config.patch_size) + 
self.input_image_size = config.image_size + + self.point_embed = nn.Embedding(config.num_point_embeddings, config.hidden_size) + self.hidden_size = config.hidden_size + self.not_a_point_embed = nn.Embedding(1, config.hidden_size) + + def _embed_points(self, points: torch.Tensor, labels: torch.Tensor, pad: bool) -> torch.Tensor: + """Embeds point prompts.""" + points = points + 0.5 # Shift to center of pixel + if pad: + points = torch.nn.functional.pad(points, (0, 0, 0, 1), mode="constant", value=0) + labels = torch.nn.functional.pad(labels, (0, 1), mode="constant", value=-1) + input_shape = (self.input_image_size, self.input_image_size) + point_embedding = self.shared_embedding(points, input_shape) + + # torch.where and expanding the labels tensor is required by the ONNX export + point_embedding = torch.where(labels[..., None] == -1, self.not_a_point_embed.weight, point_embedding) + + # This is required for the ONNX export. The dtype, device need to be explicitly + # specified as otherwise torch.onnx.export interprets as double + point_embedding = torch.where( + labels[..., None] != -10, + point_embedding, + torch.zeros_like(point_embedding), + ) + + # Add point embeddings for labels >= 0 + point_embedding = point_embedding + self.point_embed(labels.clamp(min=0)) * (labels >= 0).unsqueeze(-1) + + return point_embedding + + def _embed_boxes(self, boxes: torch.Tensor) -> torch.Tensor: + """Embeds box prompts.""" + boxes += 0.5 # Shift to center of pixel + coords = boxes.view(*boxes.shape[:2], 2, 2) + # add padding point for consistency with the original implementation + coords = torch.nn.functional.pad(coords, (0, 0, 0, 1), mode="constant", value=0) + corner_embedding = self.shared_embedding(coords, (self.input_image_size, self.input_image_size)) + corner_embedding[:, :, 0, :] += self.point_embed.weight[2] + corner_embedding[:, :, 1, :] += self.point_embed.weight[3] + corner_embedding[:, :, 2, :] = self.not_a_point_embed.weight.expand_as(corner_embedding[:, :, 2, :]) + return corner_embedding + + def forward( + self, + input_points: Optional[tuple[torch.Tensor, torch.Tensor]], + input_labels: Optional[torch.Tensor], + input_boxes: Optional[torch.Tensor], + input_masks: Optional[torch.Tensor], + ) -> tuple[torch.Tensor, torch.Tensor]: + """ + Embeds different types of prompts, returning both sparse and dense embeddings. + + Args: + points (`torch.Tensor`, *optional*): + point coordinates and labels to embed. 
+ boxes (`torch.Tensor`, *optional*): + boxes to embed + masks (`torch.Tensor`, *optional*): + masks to embed + """ + sparse_embeddings = None + batch_size = 1 + if input_points is not None: + batch_size = input_points.shape[0] + if input_labels is None: + raise ValueError("If points are provided, labels must also be provided.") + point_embeddings = self._embed_points(input_points, input_labels, pad=(input_boxes is None)) + sparse_embeddings = point_embeddings + if input_boxes is not None: + batch_size = input_boxes.shape[0] + box_embeddings = self._embed_boxes(input_boxes) + if sparse_embeddings is None: + sparse_embeddings = box_embeddings + else: + sparse_embeddings = torch.cat([sparse_embeddings, box_embeddings], dim=2) + if input_masks is not None: + dense_embeddings = self.mask_embed(input_masks) + else: + dense_embeddings = self.no_mask_embed.weight.reshape(1, -1, 1, 1).expand( + batch_size, -1, self.image_embedding_size[0], self.image_embedding_size[1] + ) + + return sparse_embeddings, dense_embeddings + + +class EdgeTamVideoTwoWayTransformer(nn.Module): + def __init__(self, config: EdgeTamVideoMaskDecoderConfig): + super().__init__() + self.config = config + + self.num_hidden_layers = config.num_hidden_layers + self.layers = nn.ModuleList() + + for i in range(self.num_hidden_layers): + self.layers.append(EdgeTamVideoTwoWayAttentionBlock(config, skip_first_layer_pe=(i == 0))) + + self.final_attn_token_to_image = EdgeTamVideoAttention(config) + self.layer_norm_final_attn = nn.LayerNorm(config.hidden_size) + + def forward( + self, + point_embeddings: Tensor, + image_embeddings: Tensor, + image_positional_embeddings: Tensor, + attention_similarity: Tensor, + target_embedding=None, + **kwargs: Unpack[TransformersKwargs], + ) -> Union[tuple, BaseModelOutput]: + if image_embeddings is None: + raise ValueError("You have to specify an image_embedding") + + image_embeddings = image_embeddings.flatten(2).permute(0, 2, 1).unsqueeze(1) + image_positional_embeddings = image_positional_embeddings.flatten(2).permute(0, 2, 1).unsqueeze(1) + + # Prepare queries + queries = point_embeddings + keys = image_embeddings + + # Apply transformer blocks and final layernorm + for layer in self.layers: + if target_embedding is not None: + queries += target_embedding + + queries, keys, _ = layer( + queries=queries, + keys=keys, + query_point_embedding=point_embeddings, + key_point_embedding=image_positional_embeddings, + attention_similarity=attention_similarity, + **kwargs, + ) + # Apply the final attention layer from the points to the image + query = queries + point_embeddings + key = keys + image_positional_embeddings + + attn_out, _ = self.final_attn_token_to_image(query=query, key=key, value=keys) + + queries = queries + attn_out + queries = self.layer_norm_final_attn(queries) + return queries, keys + + +class EdgeTamVideoMaskDecoder(nn.Module): + def __init__(self, config: EdgeTamVideoMaskDecoderConfig): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + + self.num_multimask_outputs = config.num_multimask_outputs + self.num_mask_tokens = config.num_multimask_outputs + 1 + + self.iou_token = nn.Embedding(1, self.hidden_size) + self.mask_tokens = nn.Embedding(self.num_mask_tokens, self.hidden_size) + + self.transformer = EdgeTamVideoTwoWayTransformer(config) + + # should we create a new class for this? 
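+        # Upscaling path: two stride-2 transposed convolutions (4x total upsampling) whose outputs are fused
+        # with the high-resolution FPN features (feat_s1, feat_s0) in `forward`.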
+ self.upscale_conv1 = nn.ConvTranspose2d(self.hidden_size, self.hidden_size // 4, kernel_size=2, stride=2) + self.upscale_conv2 = nn.ConvTranspose2d(self.hidden_size // 4, self.hidden_size // 8, kernel_size=2, stride=2) + self.upscale_layer_norm = EdgeTamVideoLayerNorm(self.hidden_size // 4, data_format="channels_first") + self.activation = nn.GELU() + + mlps_list = [] + for _ in range(self.num_mask_tokens): + mlps_list += [EdgeTamVideoFeedForward(self.hidden_size, self.hidden_size, self.hidden_size // 8, 3)] + self.output_hypernetworks_mlps = nn.ModuleList(mlps_list) + self.iou_prediction_head = EdgeTamVideoFeedForward( + self.hidden_size, + config.iou_head_hidden_dim, + self.num_mask_tokens, + config.iou_head_depth, + sigmoid_output=True, + ) + + self.conv_s0 = nn.Conv2d(config.hidden_size, config.hidden_size // 8, kernel_size=1, stride=1) + self.conv_s1 = nn.Conv2d(config.hidden_size, config.hidden_size // 4, kernel_size=1, stride=1) + + self.obj_score_token = nn.Embedding(1, self.hidden_size) + self.pred_obj_score_head = EdgeTamVideoFeedForward(self.hidden_size, self.hidden_size, 1, 3) + + self.dynamic_multimask_via_stability = config.dynamic_multimask_via_stability + self.dynamic_multimask_stability_delta = config.dynamic_multimask_stability_delta + self.dynamic_multimask_stability_thresh = config.dynamic_multimask_stability_thresh + + def forward( + self, + image_embeddings: torch.Tensor, + image_positional_embeddings: torch.Tensor, + sparse_prompt_embeddings: torch.Tensor, + dense_prompt_embeddings: torch.Tensor, + multimask_output: bool, + high_resolution_features: list[torch.Tensor], + attention_similarity: Optional[torch.Tensor] = None, + target_embedding: Optional[torch.Tensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Predict masks given image and prompt embeddings. + + Args: + image_embeddings (`torch.Tensor`): + The embeddings from the image encoder. + image_positional_embeddings (`torch.Tensor`): + Positional encoding with the shape of image_embeddings. + sparse_prompt_embeddings (`torch.Tensor`): + The embeddings of the points and boxes. + dense_prompt_embeddings (`torch.Tensor`): + The embeddings of the mask inputs. + multimask_output (`bool`): + Whether to return multiple masks or a single mask. + high_resolution_features (`list[torch.Tensor]`, *optional*): + The high-resolution features from the vision encoder. + attention_similarity (`torch.Tensor`, *optional*): + The attention similarity tensor. + target_embedding (`torch.Tensor`, *optional*): + The target embedding. 
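+
+        Returns:
+            A 4-tuple of `(masks, iou_scores, mask_tokens, object_score_logits)`: the predicted mask logits,
+            their predicted IoU quality scores, the mask token embeddings for the selected mask slice, and the
+            object presence logits.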
+ """ + batch_size, num_channels, height, width = image_embeddings.shape + point_batch_size = sparse_prompt_embeddings.shape[1] + # Concatenate output tokens + output_tokens = torch.cat( + [ + self.obj_score_token.weight, + self.iou_token.weight, + self.mask_tokens.weight, + ], + dim=0, + ) + output_tokens = output_tokens.repeat(batch_size, point_batch_size, 1, 1) + + if sparse_prompt_embeddings.shape[0] != 0: + tokens = torch.cat((output_tokens, sparse_prompt_embeddings), dim=2) + else: + tokens = output_tokens + point_embeddings = tokens.to(self.iou_token.weight.dtype) + + # Expand per-image data in batch direction to be per-mask + image_embeddings = image_embeddings + dense_prompt_embeddings + image_embeddings = image_embeddings.repeat_interleave(point_batch_size, dim=0) + image_positional_embeddings = image_positional_embeddings.repeat_interleave(point_batch_size, 0) + # Run the transformer + point_embeddings, image_embeddings = self.transformer( + point_embeddings=point_embeddings, + image_embeddings=image_embeddings, + image_positional_embeddings=image_positional_embeddings, + attention_similarity=attention_similarity, + target_embedding=target_embedding, + **kwargs, + ) + iou_token_out = point_embeddings[:, :, 1, :] + mask_tokens_out = point_embeddings[:, :, 2 : (2 + self.num_mask_tokens), :] + + # Upscale mask embeddings and predict masks using the mask tokens + image_embeddings = image_embeddings.transpose(2, 3).view( + batch_size * point_batch_size, num_channels, height, width + ) + + feat_s0, feat_s1 = high_resolution_features + feat_s0 = feat_s0.repeat_interleave(point_batch_size, dim=0) + feat_s1 = feat_s1.repeat_interleave(point_batch_size, dim=0) + upscaled_embedding = self.upscale_conv1(image_embeddings) + feat_s1 + upscaled_embedding = self.activation(self.upscale_layer_norm(upscaled_embedding)) + upscaled_embedding = self.activation(self.upscale_conv2(upscaled_embedding) + feat_s0) + + hyper_in_list: list[torch.Tensor] = [] + for i in range(self.num_mask_tokens): + current_mlp = self.output_hypernetworks_mlps[i] + hyper_in_list += [current_mlp(mask_tokens_out[:, :, i, :])] + hyper_in = torch.stack(hyper_in_list, dim=2) + + _, num_channels, height, width = upscaled_embedding.shape + upscaled_embedding = upscaled_embedding.view(batch_size, point_batch_size, num_channels, height * width) + masks = (hyper_in @ upscaled_embedding).view(batch_size, point_batch_size, -1, height, width) + + # Generate mask quality predictions + iou_pred = self.iou_prediction_head(iou_token_out) + object_score_logits = self.pred_obj_score_head(point_embeddings[:, :, 0, :]) + + # Select the correct mask or masks for output + if multimask_output: + mask_slice = slice(1, None) + masks = masks[:, :, mask_slice, :, :] + iou_pred = iou_pred[:, :, mask_slice] + elif self.dynamic_multimask_via_stability and not self.training: + mask_slice = slice(0, 1) + masks, iou_pred = self._dynamic_multimask_via_stability(masks, iou_pred) + else: + mask_slice = slice(0, 1) + masks = masks[:, :, mask_slice, :, :] + iou_pred = iou_pred[:, :, mask_slice] + + sam_tokens_out = mask_tokens_out[:, :, mask_slice] # [b, 3, c] shape + + return masks, iou_pred, sam_tokens_out, object_score_logits + + def _get_stability_scores(self, mask_logits): + """ + Compute stability scores of the mask logits based on the IoU between upper and + lower thresholds. 
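+
+        Concretely, with `delta = dynamic_multimask_stability_delta`, the score per mask is
+        `area(logits > +delta) / area(logits > -delta)` over the spatial dimensions, and is set to 1.0 when the
+        denominator is zero.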
+ """ + mask_logits = mask_logits.flatten(-2) + stability_delta = self.dynamic_multimask_stability_delta + area_i = torch.sum(mask_logits > stability_delta, dim=-1).float() + area_u = torch.sum(mask_logits > -stability_delta, dim=-1).float() + stability_scores = torch.where(area_u > 0, area_i / area_u, 1.0) + return stability_scores + + def _dynamic_multimask_via_stability(self, all_mask_logits, all_iou_scores): + """ + When outputting a single mask, if the stability score from the current single-mask + output (based on output token 0) falls below a threshold, we instead select from + multi-mask outputs (based on output token 1~3) the mask with the highest predicted + IoU score. This is intended to ensure a valid mask for both clicking and tracking. + """ + # The best mask from multimask output tokens (1~3) + multimask_logits = all_mask_logits[:, :, 1:, :, :] + multimask_iou_scores = all_iou_scores[:, :, 1:] + best_scores_inds = torch.argmax(multimask_iou_scores, dim=-1) # [B, P] + best_scores_inds_expanded = best_scores_inds.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1) + best_scores_inds_expanded = best_scores_inds_expanded.expand( + -1, -1, 1, multimask_logits.size(-2), multimask_logits.size(-1) + ) + best_multimask_logits = torch.gather(multimask_logits, 2, best_scores_inds_expanded) # [B, P, 1, H, W] + best_multimask_iou_scores = torch.gather(multimask_iou_scores, 2, best_scores_inds.unsqueeze(-1)) # [B, P, 1] + + # The mask from singlemask output token 0 and its stability score + singlemask_logits = all_mask_logits[:, :, 0:1, :, :] + singlemask_iou_scores = all_iou_scores[:, :, 0:1] + stability_scores = self._get_stability_scores(singlemask_logits) + is_stable = stability_scores >= self.dynamic_multimask_stability_thresh + + # Dynamically fall back to best multimask output upon low stability scores. + mask_logits_out = torch.where( + is_stable[..., None, None].expand_as(singlemask_logits), + singlemask_logits, + best_multimask_logits, + ) + iou_scores_out = torch.where( + is_stable.expand_as(singlemask_iou_scores), + singlemask_iou_scores, + best_multimask_iou_scores, + ) + return mask_logits_out, iou_scores_out + + +# a large negative value as a placeholder score for missing objects +NO_OBJ_SCORE = -1024.0 + + +def get_1d_sine_pe(pos_inds, dim, temperature=10000): + """ + Get 1D sine positional embedding as in the original Transformer paper. 
+ """ + pe_dim = dim // 2 + dim_t = torch.arange(pe_dim, dtype=torch.float32, device=pos_inds.device) + dim_t = temperature ** (2 * (dim_t // 2) / pe_dim) + + pos_embed = pos_inds.unsqueeze(-1) / dim_t + pos_embed = torch.cat([pos_embed.sin(), pos_embed.cos()], dim=-1) + return pos_embed + + +@auto_docstring +class EdgeTamVideoModel(EdgeTamVideoPreTrainedModel): + _tied_weights_keys = ["prompt_encoder.shared_embedding.positional_embedding"] + # need to be ignored, as it's a buffer and will not be correctly detected as tied weight + _keys_to_ignore_on_load_missing = ["prompt_encoder.shared_embedding.positional_embedding"] + _can_record_outputs = {"mask_decoder_attentions": OutputRecorder(EdgeTamVideoTwoWayAttentionBlock, index=2)} + _keys_to_ignore_on_load_unexpected = [] + + def __init__(self, config: EdgeTamVideoConfig): + super().__init__(config) + self.shared_image_embedding = EdgeTamVideoPositionalEmbedding(config.prompt_encoder_config) + self.vision_encoder = AutoModel.from_config(config.vision_config) + self.prompt_encoder = EdgeTamVideoPromptEncoder(config.prompt_encoder_config) + # The module using it is not a PreTrainedModel subclass so we need this + config.mask_decoder_config._attn_implementation = config._attn_implementation + self.mask_decoder = EdgeTamVideoMaskDecoder(config.mask_decoder_config) + + self.num_feature_levels = config.vision_config.num_feature_levels + self.backbone_feature_sizes = config.vision_config.backbone_feature_sizes + # a single token to indicate no memory embedding from previous frames + self.hidden_dim = config.vision_config.fpn_hidden_size + self.no_memory_embedding = torch.nn.Parameter(torch.zeros(1, 1, self.hidden_dim)) + self.config = config + # For video sequence inference + self.image_size = config.image_size + self.memory_attention = EdgeTamVideoMemoryAttention(config) + self.memory_encoder = EdgeTamVideoMemoryEncoder(config) + self.no_memory_positional_encoding = torch.nn.Parameter( + torch.zeros(1, 1, config.vision_config.fpn_hidden_size) + ) + self.mem_dim = config.memory_encoder_output_channels + self.num_maskmem = config.num_maskmem # Number of memories accessible + # Temporal encoding of the memories + self.memory_temporal_positional_encoding = torch.nn.Parameter( + torch.zeros(self.num_maskmem, 1, 1, self.mem_dim) + ) + + self.no_object_pointer = torch.nn.Parameter(torch.zeros(1, self.hidden_dim)) + # A conv layer to downsample the mask prompt to stride 4 (the same stride as + # low-res SAM mask logits) and to change its scales from 0~1 to SAM logit scale, + # so that it can be fed into the SAM mask decoder to generate a pointer. 
+ self.mask_downsample = torch.nn.Conv2d(1, 1, kernel_size=4, stride=4) + # a feedforward layer on SAM output tokens to turn them into object pointers + self.object_pointer_proj = EdgeTamVideoFeedForward(self.hidden_dim, self.hidden_dim, self.hidden_dim, 3) + + if self.config.enable_temporal_pos_encoding_for_object_pointers: + # a linear projection on temporal positional encoding in object pointers to + # avoid potential interference with spatial positional encoding + self.temporal_positional_encoding_projection_layer = torch.nn.Linear(self.hidden_dim, self.mem_dim) + else: + self.temporal_positional_encoding_projection_layer = torch.nn.Identity() + + self.occlusion_spatial_embedding_parameter = None # compatibility with Sam2 + if config.enable_occlusion_spatial_embedding: + self.occlusion_spatial_embedding_parameter = torch.nn.Parameter(torch.zeros(1, self.mem_dim)) + self.spatial_perceiver = EdgeTamVideoPerceiverResampler(config) + + self.post_init() + + def _tie_weights(self): + self.prompt_encoder.shared_embedding.positional_embedding.data = ( + self.shared_image_embedding.positional_embedding.data + ) + + def get_input_embeddings(self): + return self.vision_encoder.get_input_embeddings() + + def get_image_wide_positional_embeddings(self) -> torch.Tensor: + size = self.prompt_encoder.image_embedding_size + target_device = self.shared_image_embedding.positional_embedding.device + target_dtype = self.shared_image_embedding.positional_embedding.dtype + grid = torch.ones(size, device=target_device, dtype=target_dtype) + y_embed = grid.cumsum(dim=0) - 0.5 + x_embed = grid.cumsum(dim=1) - 0.5 + y_embed = y_embed / size[0] + x_embed = x_embed / size[1] + + positional_embedding = self.shared_image_embedding(torch.stack([x_embed, y_embed], dim=-1)) + return positional_embedding.permute(2, 0, 1).unsqueeze(0) # channel x height x width + + @torch.no_grad() + def get_image_embeddings( + self, + pixel_values: torch.FloatTensor, + **kwargs: Unpack[TransformersKwargs], + ) -> list[torch.Tensor]: + r""" + Returns the image embeddings by passing the pixel values through the vision encoder. + + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Input pixel values + """ + batch_size = pixel_values.shape[0] + feature_maps, _, _, _ = self.get_image_features(pixel_values, **kwargs) + + # add no memory embedding to the last feature map + feature_maps[-1] = feature_maps[-1] + self.no_memory_embedding + + # reshape feature maps to the same shape as the backbone feature sizes + image_embeddings = [ + feat.permute(1, 2, 0).view(batch_size, -1, *feat_size) + for feat, feat_size in zip(feature_maps, self.backbone_feature_sizes) + ] + + return image_embeddings + + @torch.no_grad() + def get_prompt_embeddings( + self, + input_points: Optional[torch.FloatTensor] = None, + input_labels: Optional[torch.LongTensor] = None, + input_boxes: Optional[torch.FloatTensor] = None, + input_masks: Optional[torch.LongTensor] = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + r""" + Returns the prompt embeddings by passing the input points, labels, boxes and masks through the prompt encoder. + + Args: + input_points (`torch.FloatTensor` of shape `(batch_size, point_batch_size, num_points_per_image, 2)`): + Optional input points for the prompt encoder. The padding of the point is automatically done by the + processor. `point_batch_size` refers to the number of masks that we want the model to predict per + point. The model will output `point_batch_size` times 3 masks in total. 
+ input_labels (`torch.LongTensor` of shape `(batch_size, point_batch_size, num_points_per_image)`): + Optional input labels for the prompt encoder. The padding of the labels is automatically done by the + processor, or can be fed by the user. + input_boxes (`torch.FloatTensor` of shape `(batch_size, num_boxes_per_image, 4)`): + Optional input boxes for the prompt encoder. The padding of the boxes is automatically done by the + processor. users can also pass manually the input boxes. + input_masks (`torch.LongTensor` of shape `(batch_size, image_size, image_size)`): + Optional input masks for the prompt encoder. + """ + prompt_output = self.prompt_encoder( + input_points=input_points, + input_labels=input_labels, + input_boxes=input_boxes, + input_masks=input_masks, + ) + return prompt_output + + @torch.inference_mode() + @auto_docstring(custom_intro="Propagate the objects through a streamed video frame.") + def forward( + self, + inference_session: EdgeTamVideoInferenceSession, + frame_idx: Optional[int] = None, + frame: Optional[torch.Tensor] = None, + reverse: bool = False, + ) -> EdgeTamVideoSegmentationOutput: + r""" + inference_session (`EdgeTamVideoInferenceSession`): + The video inference session object. + frame_idx (`int`, *optional*): + The index of the frame on which to run inference. No need to provide when inferring + on a new streamed frame. + frame (`torch.Tensor`, *optional*): + The frame to process. Provide when streaming. + reverse (`bool`, *optional*, defaults to `False`): + Whether to propagate in reverse. + """ + if frame is not None: + frame_idx = inference_session.add_new_frame(frame, frame_idx) + + if frame is not None and inference_session.get_obj_num() == 0: + raise ValueError("No objects are provided for tracking; please add inputs first.") + + num_objects = inference_session.get_obj_num() + pred_masks_per_obj = [None] * num_objects + # Note: We avoid batched inference here because per-object inputs (clicks/masks) + # can differ across objects. + for obj_idx in range(num_objects): + obj_id = inference_session.obj_idx_to_id(obj_idx) + has_new_inputs = obj_id in inference_session.obj_with_new_inputs + has_cond_output = frame_idx in inference_session.output_dict_per_obj[obj_idx]["cond_frame_outputs"] + # If this object has no new inputs and this frame already has a + # conditioning output, reuse the cached masks instead of recomputing. 
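+            # (e.g. calling forward() again on a frame the user has already prompted returns the stored
+            # conditioning-frame prediction rather than re-running memory attention and the SAM head)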
+ if (not has_new_inputs) and has_cond_output: + pred_masks = inference_session.get_output(obj_idx, frame_idx, "pred_masks", is_conditioning_frame=True) + is_init_cond_frame = True + else: + # Defaults when there are no new inputs + is_init_cond_frame = False + point_inputs = None + mask_inputs = None + + if has_new_inputs: + is_init_cond_frame = frame_idx not in inference_session.frames_tracked_per_obj[obj_idx] + if is_init_cond_frame: + reverse = False + point_inputs = inference_session.point_inputs_per_obj[obj_idx].get(frame_idx, None) + mask_inputs = inference_session.mask_inputs_per_obj[obj_idx].get(frame_idx, None) + if point_inputs is not None or mask_inputs is not None: + inference_session.obj_with_new_inputs.remove(obj_id) + + current_out = self._run_single_frame_inference( + inference_session=inference_session, + obj_idx=obj_idx, + frame_idx=frame_idx, + batch_size=1, # run on the slice of a single object + is_init_cond_frame=is_init_cond_frame, + point_inputs=point_inputs, + mask_inputs=mask_inputs, + reverse=reverse, + run_mem_encoder=True, + streaming=frame is not None, + ) + inference_session.store_output( + obj_idx, frame_idx, output_value=current_out, is_conditioning_frame=is_init_cond_frame + ) + pred_masks = current_out["pred_masks"] + + pred_masks_per_obj[obj_idx] = pred_masks + if not is_init_cond_frame: + # only for tracked frames, not for initial conditioning frames + inference_session.frames_tracked_per_obj[obj_idx][frame_idx] = {"reverse": reverse} + + # Resize the output mask to the original video resolution (we directly use + # the mask scores on GPU for output to avoid any CPU conversion in between) + if len(pred_masks_per_obj) > 1: + all_pred_masks = torch.cat(pred_masks_per_obj, dim=0) + else: + all_pred_masks = pred_masks_per_obj[0] + + return EdgeTamVideoSegmentationOutput(pred_masks=all_pred_masks, frame_idx=frame_idx) + + def get_image_features( + self, + pixel_values: torch.FloatTensor, + **kwargs: Unpack[TransformersKwargs], + ) -> tuple[ + list[torch.Tensor], + list[torch.Tensor], + Optional[tuple[torch.FloatTensor, ...]], + Optional[tuple[torch.FloatTensor, ...]], + ]: + r""" + Extract and preprocess image features using the vision encoder. + + Args: + pixel_values (`torch.FloatTensor`): + Input pixel values of shape `(batch_size, num_channels, height, width)`. + + Returns: + `tuple`: A tuple containing: + - feature_maps (`list[torch.Tensor]`): List of feature maps from different levels. + - feature_maps_position_embeddings (`list[torch.Tensor]`): List of positional embeddings for each feature level. + - vision_hidden_states (`tuple[torch.FloatTensor]`, *optional*): Hidden states from the vision encoder. + - vision_attentions (`tuple[torch.FloatTensor]`, *optional*): Attention weights from the vision encoder. 
+ """ + vision_outputs: EdgeTamVideoVisionEncoderOutput = self.vision_encoder( + pixel_values, + **kwargs, + ) + + feature_maps = vision_outputs.fpn_hidden_states + feature_maps_position_embeddings = vision_outputs.fpn_position_encoding + + # precompute projected level 0 and level 1 features in SAM decoder + # to avoid running it again on every SAM click + feature_maps = list(feature_maps) + feature_maps[0] = self.mask_decoder.conv_s0(feature_maps[0]) + feature_maps[1] = self.mask_decoder.conv_s1(feature_maps[1]) + + # flatten NxCxHxW to HWxNxC + feature_maps = [feature_map.flatten(2).permute(2, 0, 1) for feature_map in feature_maps] + feature_maps_position_embeddings = [ + feature_map_position_embedding.flatten(2).permute(2, 0, 1) + for feature_map_position_embedding in feature_maps_position_embeddings + ] + + return feature_maps, feature_maps_position_embeddings, vision_outputs.hidden_states, vision_outputs.attentions + + def _prepare_vision_features( + self, + inference_session: EdgeTamVideoInferenceSession, + frame_idx: int, + batch_size: int, + ) -> tuple[torch.Tensor, list[torch.Tensor]]: + """Prepare vision features for a frame.""" + + # Check if features are cached + if cached_features := inference_session.cache.get_vision_features(frame_idx): + vision_feats = cached_features["vision_feats"] + vision_pos_embeds = cached_features["vision_pos_embeds"] + else: + # Compute features using image encoder + image_batch = inference_session.get_frame(frame_idx).unsqueeze(0) # Add batch dimension + vision_feats, vision_pos_embeds, _, _ = self.get_image_features(image_batch) + # Cache features + inference_session.cache.cache_vision_features( + frame_idx, {"vision_feats": vision_feats, "vision_pos_embeds": vision_pos_embeds} + ) + + # Expand to batch size if needed + if batch_size > 1: + vision_feats = vision_feats.expand(batch_size, -1, -1, -1) + vision_pos_embeds = [pe.expand(batch_size, -1, -1, -1) for pe in vision_pos_embeds] + + return vision_feats, vision_pos_embeds + + def _single_frame_forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + input_points: Optional[torch.FloatTensor] = None, + input_labels: Optional[torch.LongTensor] = None, + input_boxes: Optional[torch.FloatTensor] = None, + input_masks: Optional[torch.LongTensor] = None, + image_embeddings: Optional[torch.FloatTensor] = None, + multimask_output: bool = True, + attention_similarity: Optional[torch.FloatTensor] = None, + target_embedding: Optional[torch.FloatTensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> EdgeTamVideoImageSegmentationOutput: + """ + input_points (`torch.FloatTensor` of shape `(batch_size, num_points, 2)`): + Input 2D spatial points, this is used by the prompt encoder to encode the prompt. Generally yields to much + better results. The points can be obtained by passing a list of list of list to the processor that will + create corresponding `torch` tensors of dimension 4. The first dimension is the image batch size, the + second dimension is the point batch size (i.e. how many segmentation masks do we want the model to predict + per input point), the third dimension is the number of points per segmentation mask (it is possible to pass + multiple points for a single mask), and the last dimension is the x (vertical) and y (horizontal) + coordinates of the point. 
If a different number of points is passed either for each image, or for each + mask, the processor will create "PAD" points that will correspond to the (0, 0) coordinate, and the + computation of the embedding will be skipped for these points using the labels. + input_labels (`torch.LongTensor` of shape `(batch_size, point_batch_size, num_points)`): + Input labels for the points, this is used by the prompt encoder to encode the prompt. According to the + official implementation, there are 3 types of labels + + - `1`: the point is a point that contains the object of interest + - `0`: the point is a point that does not contain the object of interest + - `-1`: the point corresponds to the background + + We added the label: + + - `-10`: the point is a padding point, thus should be ignored by the prompt encoder + + The padding labels should be automatically done by the processor. + input_boxes (`torch.FloatTensor` of shape `(batch_size, num_boxes, 4)`): + Input boxes for the points, this is used by the prompt encoder to encode the prompt. Generally yields to + much better generated masks. The boxes can be obtained by passing a list of list of list to the processor, + that will generate a `torch` tensor, with each dimension corresponding respectively to the image batch + size, the number of boxes per image and the coordinates of the top left and bottom right point of the box. + In the order (`x1`, `y1`, `x2`, `y2`): + + - `x1`: the x coordinate of the top left point of the input box + - `y1`: the y coordinate of the top left point of the input box + - `x2`: the x coordinate of the bottom right point of the input box + - `y2`: the y coordinate of the bottom right point of the input box + input_masks (`torch.FloatTensor` of shape `(batch_size, image_size, image_size)`): + SAM model also accepts segmentation masks as input. The mask will be embedded by the prompt encoder to + generate a corresponding embedding, that will be fed later on to the mask decoder. These masks needs to be + manually fed by the user, and they need to be of shape (`batch_size`, `image_size`, `image_size`). + image_embeddings (`torch.FloatTensor` of shape `(batch_size, output_channels, window_size, window_size)`): + Image embeddings, this is used by the mask decoder to generate masks and iou scores. For more memory + efficient computation, users can first retrieve the image embeddings using the `get_image_embeddings` + method, and then feed them to the `forward` method instead of feeding the `pixel_values`. + multimask_output (`bool`, *optional*): + In the original implementation and paper, the model always outputs 3 masks per image (or per point / per + bounding box if relevant). However, it is possible to just output a single mask, that corresponds to the + "best" mask, by specifying `multimask_output=False`. + attention_similarity (`torch.FloatTensor`, *optional*): + Attention similarity tensor, to be provided to the mask decoder for target-guided attention in case the + model is used for personalization as introduced in [PerSAM](https://huggingface.co/papers/2305.03048). + target_embedding (`torch.FloatTensor`, *optional*): + Embedding of the target concept, to be provided to the mask decoder for target-semantic prompting in case + the model is used for personalization as introduced in [PerSAM](https://huggingface.co/papers/2305.03048). 
+ """ + if not ((pixel_values is None) ^ (image_embeddings is None)): + raise ValueError("Exactly one of pixel_values or image_embeddings must be provided.") + if input_points is not None and input_boxes is not None: + if input_points.shape[1] != input_boxes.shape[1]: + raise ValueError( + f"You should provide as many bounding boxes as input points per box. Got {input_points.shape[1]} and {input_boxes.shape[1]}." + ) + elif input_points is not None: + num_objects = input_points.shape[1] + elif input_boxes is not None: + num_objects = input_boxes.shape[1] + elif input_masks is not None: + num_objects = input_masks.shape[1] + else: + num_objects = 1 + + image_positional_embeddings = self.get_image_wide_positional_embeddings() + # repeat with batch size + batch_size = pixel_values.shape[0] if pixel_values is not None else image_embeddings[-1].shape[0] + image_positional_embeddings = image_positional_embeddings.repeat(batch_size, 1, 1, 1) + + vision_attentions = None + vision_hidden_states = None + + if pixel_values is not None: + feature_maps, _, vision_hidden_states, vision_attentions = self.get_image_features( + pixel_values, + **kwargs, + ) + + # add no memory embedding to the last feature map + feature_maps[-1] = feature_maps[-1] + self.no_memory_embedding + + # reshape feature maps to the same shape as the backbone feature sizes + image_embeddings = [ + feat.permute(1, 2, 0).view(batch_size, -1, *feat_size) + for feat, feat_size in zip(feature_maps, self.backbone_feature_sizes) + ] + + if input_points is not None and input_labels is None: + input_labels = torch.ones_like(input_points[:, :, :, 0], dtype=torch.int, device=input_points.device) + + if input_points is None and input_boxes is None: + # If no points are provide, pad with an empty point (with label -1) + input_points = torch.zeros( + batch_size, 1, 1, 2, dtype=image_embeddings[-1].dtype, device=image_embeddings[-1].device + ) + input_labels = -torch.ones(batch_size, 1, 1, dtype=torch.int32, device=image_embeddings[-1].device) + + if input_masks is not None: + # If mask_inputs is provided, downsize it into low-res mask input if needed + # and feed it as a dense mask prompt into the SAM mask encoder + if input_masks.shape[-2:] != self.prompt_encoder.mask_input_size: + input_masks = F.interpolate( + input_masks.float(), + size=self.prompt_encoder.mask_input_size, + align_corners=False, + mode="bilinear", + antialias=True, # use antialias for downsampling + ).to(input_masks.dtype) + + sparse_embeddings, dense_embeddings = self.prompt_encoder( + input_points=input_points, + input_labels=input_labels, + input_boxes=input_boxes, + input_masks=input_masks, + ) + low_res_multimasks, iou_scores, sam_output_tokens, object_score_logits = self.mask_decoder( + image_embeddings=image_embeddings[-1], + image_positional_embeddings=image_positional_embeddings, + sparse_prompt_embeddings=sparse_embeddings, + dense_prompt_embeddings=dense_embeddings, + multimask_output=multimask_output, + high_resolution_features=image_embeddings[:-1], + attention_similarity=attention_similarity, + target_embedding=target_embedding, + **kwargs, + ) + + is_obj_appearing = object_score_logits > 0 + # Mask used for spatial memories is always a *hard* choice between obj and no obj, + # consistent with the actual mask prediction + low_res_multimasks = torch.where( + is_obj_appearing[:, None, None], + low_res_multimasks, + NO_OBJ_SCORE, + ) + + # convert masks from possibly bfloat16 (or float16) to float32 + # (older PyTorch versions before 2.1 don't support 
`interpolate` on bf16) + high_res_multimasks = ( + F.interpolate( + low_res_multimasks.squeeze(1).float(), + size=(self.image_size, self.image_size), + mode="bilinear", + align_corners=False, + ) + .unsqueeze(1) + .to(low_res_multimasks.dtype) + ) + sam_output_token = sam_output_tokens[:, :, 0] + if multimask_output: + # take the best mask prediction (with the highest IoU estimation) + best_iou_inds = torch.argmax(iou_scores, dim=-1) + batch_inds = torch.arange(batch_size, device=high_res_multimasks.device) + object_batch_inds = torch.arange(num_objects, device=high_res_multimasks.device) + low_res_masks = low_res_multimasks[batch_inds, object_batch_inds, best_iou_inds] + high_res_masks = high_res_multimasks[batch_inds, object_batch_inds, best_iou_inds] + if sam_output_tokens.size(2) > 1: + sam_output_token = sam_output_tokens[batch_inds, object_batch_inds, best_iou_inds] + else: + low_res_masks, high_res_masks = low_res_multimasks[:, :, 0], high_res_multimasks[:, :, 0] + + # Extract object pointer from the SAM output token (with occlusion handling) + object_pointer = self.object_pointer_proj(sam_output_token) + lambda_is_obj_appearing = is_obj_appearing.to(object_pointer.dtype) + + object_pointer = lambda_is_obj_appearing * object_pointer + object_pointer = object_pointer + (1 - lambda_is_obj_appearing) * self.no_object_pointer + + return EdgeTamVideoImageSegmentationOutput( + iou_scores=iou_scores, + pred_masks=low_res_masks, + high_res_masks=high_res_masks, + object_pointer=object_pointer, + object_score_logits=object_score_logits, + image_embeddings=image_embeddings, + vision_hidden_states=vision_hidden_states, + vision_attentions=vision_attentions, + ) + + def _use_mask_as_output( + self, + backbone_features: torch.Tensor, + high_res_features: list[torch.Tensor], + mask_inputs: torch.Tensor, + ) -> EdgeTamVideoImageSegmentationOutput: + """ + Directly turn binary `mask_inputs` into a output mask logits without using SAM. + (same input and output shapes as in forward above). + """ + # Use -10/+20 as logits for neg/pos pixels (very close to 0/1 in prob after sigmoid). + out_scale, out_bias = 20.0, -10.0 # sigmoid(-10.0)=4.5398e-05 + mask_inputs_float = mask_inputs.to(backbone_features[0].dtype) + high_res_masks = mask_inputs_float * out_scale + out_bias + low_res_masks = F.interpolate( + high_res_masks.float(), + size=(high_res_masks.size(-2) // 4, high_res_masks.size(-1) // 4), + align_corners=False, + mode="bilinear", + antialias=True, # use antialias for downsampling + ).to(backbone_features[0].dtype) + # a dummy IoU prediction of all 1's under mask input + iou_scores = mask_inputs.new_ones(mask_inputs.size(0), 1).to(backbone_features[0].dtype) + # produce an object pointer using the SAM decoder from the mask input + object_pointer = self._single_frame_forward( + input_masks=self.mask_downsample(mask_inputs_float.to(backbone_features[0].dtype)), + image_embeddings=high_res_features + [backbone_features], + ).object_pointer + # In this method, we are treating mask_input as output, e.g. using it directly to create spatial mem; + # Below, we follow the same design axiom to use mask_input to decide if obj appears or not instead of relying + # on the object_scores from the SAM decoder. 
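+        # e.g. any positive pixel in the input mask yields object_score_logits = 20.0 * 1 - 10.0 = +10.0
+        # (sigmoid ~ 0.99995), while an empty mask yields -10.0 (sigmoid ~ 4.54e-05), reusing the
+        # out_scale/out_bias convention defined above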
+ is_obj_appearing = torch.any(mask_inputs.flatten(1).float() > 0.0, dim=1) + is_obj_appearing = is_obj_appearing[..., None] + lambda_is_obj_appearing = is_obj_appearing.to(backbone_features[0].dtype) + object_score_logits = out_scale * lambda_is_obj_appearing + out_bias + object_pointer = lambda_is_obj_appearing * object_pointer + object_pointer = object_pointer + (1 - lambda_is_obj_appearing) * self.no_object_pointer + return EdgeTamVideoImageSegmentationOutput( + iou_scores=iou_scores, + pred_masks=low_res_masks, + high_res_masks=high_res_masks, + object_pointer=object_pointer, + object_score_logits=object_score_logits, + image_embeddings=high_res_features + [backbone_features], + ) + + def _gather_memory_frame_outputs( + self, + inference_session: EdgeTamVideoInferenceSession, + obj_idx: int, + frame_idx: int, + track_in_reverse_time: bool = False, + ) -> list[tuple[int, dict]]: + """ + Get memory frames from conditioning and non-conditioning outputs. + + Returns: + List of (relative_temporal_offset, output_data) tuples. + """ + temporal_positions_and_previous_outputs = [] + + # Add conditioning frame outputs (no limit here, as is the case in the original checkpoints) + conditioning_outputs = inference_session.output_dict_per_obj[obj_idx]["cond_frame_outputs"] + if not conditioning_outputs: + raise ValueError( + "maskmem_features in conditioning outputs cannot be empty when not is_initial_conditioning_frame" + ) + + # Store (temporal_position, output_data) tuples + temporal_positions_and_previous_outputs = [(0, out) for out in conditioning_outputs.values()] + + # Add non-conditioning memory frames (up to self.num_maskmem - 1) + # These are typically frames tracked by the model without direct user input. + # Frames are selected with a stride, prioritizing the most recent ones. Here we only support stride = 1 for simplicity. + for relative_temporal_offset in range(self.num_maskmem - 1, 0, -1): + # relative_temporal_offset: how many frames before (or after if reversing) the current frame + if not track_in_reverse_time: + previous_frame_idx = frame_idx - relative_temporal_offset + else: + previous_frame_idx = frame_idx + relative_temporal_offset + + # check if the output is already stored without using get_output to avoid unnecessary memory transfers between CPU and GPU + output_data = inference_session.output_dict_per_obj[obj_idx]["non_cond_frame_outputs"].get( + previous_frame_idx, None + ) + + temporal_positions_and_previous_outputs.append((relative_temporal_offset, output_data)) + + return temporal_positions_and_previous_outputs + + def _build_memory_attention_inputs( + self, + temporal_positions_and_previous_outputs: list[tuple[int, dict]], + device: torch.device, + ) -> tuple[list[torch.Tensor], list[torch.Tensor]]: + """ + Concatenate memory features and positional embeddings from previous frames. + + Returns: + Tuple of (memories_to_concatenate, memory_positional_embeddings_to_concatenate). 
+ """ + memories_to_concatenate = [] + memory_positional_embeddings_to_concatenate = [] + + for relative_temporal_offset, prev_output_data in temporal_positions_and_previous_outputs: + if prev_output_data is None: + continue # Skip if no output data for this temporal position (e.g., padding frames) + + # Load memory features (potentially from CPU to GPU) + # Features are flattened: (Batch, Channels, H, W) -> (H*W, Batch, Channels) + memory_features = prev_output_data["maskmem_features"].to(device, non_blocking=True) + memories_to_concatenate.append(memory_features.permute(1, 0, 2)) + + # Spatial positional encoding (potentially from CPU to GPU) + spatial_memory_pos_embed = prev_output_data["maskmem_pos_enc"].to(device, non_blocking=True) + spatial_memory_pos_embed = spatial_memory_pos_embed.squeeze(1).permute(1, 0, 2) + + # Add temporal positional encoding + # self.memory_temporal_positional_encoding shape: (NumMaskMem, 1, 1, MemDim) + combined_memory_pos_embed = ( + spatial_memory_pos_embed + self.memory_temporal_positional_encoding[relative_temporal_offset - 1] + ) + memory_positional_embeddings_to_concatenate.append(combined_memory_pos_embed) + + return memories_to_concatenate, memory_positional_embeddings_to_concatenate + + def _get_object_pointers( + self, + inference_session: EdgeTamVideoInferenceSession, + obj_idx: int, + frame_idx: int, + num_total_frames: int, + device: torch.device, + track_in_reverse_time: bool = False, + streaming: bool = False, + ) -> tuple[list[int], list[torch.Tensor], int]: + """ + Get object pointers and their positional embeddings from past frames. + + Returns: + Tuple of (temporal_offsets, pointer_tokens, max_object_pointers_to_use). + """ + temporal_position_sign_multiplier = -1 if track_in_reverse_time else 1 + + # Determine max object pointers to use + if streaming: + max_object_pointers_to_use = self.config.max_object_pointers_in_encoder + else: + max_object_pointers_to_use = min(num_total_frames, self.config.max_object_pointers_in_encoder) + + temporal_offsets: list[int] = [] + pointer_tokens: list[torch.Tensor] = [] + + # Add object pointers from selected conditioning frames + # Optionally, only include pointers from past frames during evaluation + conditioning_outputs = inference_session.output_dict_per_obj[obj_idx]["cond_frame_outputs"] + eligible_conditioning_outputs = conditioning_outputs + if not self.training: + eligible_conditioning_outputs = { + temporal_idx: out + for temporal_idx, out in conditioning_outputs.items() + if (temporal_idx >= frame_idx if track_in_reverse_time else temporal_idx <= frame_idx) + } + + for temporal_idx, out_data in eligible_conditioning_outputs.items(): + temporal_difference = (frame_idx - temporal_idx) * temporal_position_sign_multiplier + temporal_offsets.append(temporal_difference) + pointer_tokens.append(out_data["object_pointer"].to(device)) + + # Add object pointers from non-conditioning frames (up to max_object_pointers_to_use - 1) + for t_diff_offset in range(1, max_object_pointers_to_use): + ref_frame_idx = frame_idx + t_diff_offset if track_in_reverse_time else frame_idx - t_diff_offset + if ref_frame_idx < 0 or ( + not streaming and num_total_frames is not None and ref_frame_idx >= num_total_frames + ): + break # Stop if frame index is out of bounds + + # check if the output is already stored without using get_output to avoid unnecessary memory transfers between CPU and GPU + out_data = inference_session.output_dict_per_obj[obj_idx]["non_cond_frame_outputs"].get( + ref_frame_idx, None + ) + if out_data 
is not None: + temporal_offsets.append(t_diff_offset) + pointer_tokens.append(out_data["object_pointer"].to(device)) + + return temporal_offsets, pointer_tokens, max_object_pointers_to_use + + def _process_object_pointers( + self, + temporal_offsets: list[int], + pointer_tokens: list[torch.Tensor], + max_object_pointers_to_use: int, + batch_size: int, + num_channels: int, + device: torch.device, + ) -> tuple[torch.Tensor, torch.Tensor]: + """ + Process object pointers and compute their positional embeddings. + + Returns: + Tuple of (object_pointers, object_pointers_pos_embed). + """ + if not pointer_tokens: + return None, None + + # Stack object pointers: List of (Batch, Channels) -> (SeqLen_ptr, Batch, Channels) + object_pointers = torch.stack(pointer_tokens, dim=0) + + if self.config.enable_temporal_pos_encoding_for_object_pointers: + max_temporal_diff = float(max_object_pointers_to_use - 1) + # Determine dimensionality for temporal positional encoding of pointers + pointer_tpos_dim = num_channels + + # Normalize temporal differences before sine PE calculation + normalized_temporal_diffs = ( + torch.tensor(temporal_offsets, device=device, dtype=torch.float32) / max_temporal_diff + ) + sine_pe = get_1d_sine_pe(normalized_temporal_diffs, dim=pointer_tpos_dim).to(object_pointers.dtype) + projected_sine_pe = self.temporal_positional_encoding_projection_layer(sine_pe) + object_pointers_pos_embed = projected_sine_pe.unsqueeze(1).expand(-1, batch_size, self.mem_dim) + else: + object_pointers_pos_embed = object_pointers.new_zeros( + len(temporal_offsets), batch_size, self.mem_dim, dtype=object_pointers.dtype + ) + + if self.mem_dim < num_channels: + # If memory dimension is smaller, reshape/split pointers and repeat positional encoding + num_splits = num_channels // self.mem_dim + object_pointers = object_pointers.reshape(-1, batch_size, num_splits, self.mem_dim) + object_pointers = object_pointers.permute(0, 2, 1, 3).flatten( + 0, 1 + ) # (SeqLen_ptr*num_splits, Batch, MemDim) + object_pointers_pos_embed = object_pointers_pos_embed.repeat_interleave(num_splits, dim=0) + + return object_pointers, object_pointers_pos_embed + + def _prepare_memory_conditioned_features( + self, + inference_session: EdgeTamVideoInferenceSession, + frame_idx: int, + obj_idx: int, + is_initial_conditioning_frame: bool, + current_vision_features: list[torch.Tensor], + current_vision_positional_embeddings: list[torch.Tensor], + num_total_frames: int, + track_in_reverse_time: bool = False, + streaming: bool = False, + ) -> torch.Tensor: + """ + Fuse current frame's visual features with memory from previous frames for enhanced object tracking. + + This method conditions the current frame's visual features on temporal memory from previous frames, + enabling consistent object tracking across video sequences. For initial conditioning frames, it uses + no-memory embeddings. For subsequent frames, it retrieves and integrates memory features from both + conditioning frames (user interactions) and non-conditioning frames (tracked results) via cross-attention. + + Args: + inference_session (`EdgeTamVideoInferenceSession`): + The video inference session object. + frame_idx (`int`): + Index of the current frame being processed. + obj_idx (`int`): + Index of the object being processed. + is_initial_conditioning_frame (`bool`): + Whether this is an initial conditioning frame with user inputs (True) or a subsequent + tracking frame (False). 
+ current_vision_features (`torch.Tensor`): + Highest-level vision features of shape `(seq_len, batch_size, channels)`. + current_vision_positional_embeddings (`torch.Tensor`): + Positional embedding tensors corresponding to the highest-level vision features. + num_total_frames (`int`): + Total number of frames in the video sequence. + track_in_reverse_time (`bool`, *optional*, defaults to `False`): + Whether tracking is performed in reverse temporal order. + streaming (`bool`, *optional*, defaults to `False`): + Whether this is streaming inference mode. + + Returns: + `torch.Tensor`: Memory-conditioned feature tensor of shape `(batch_size, channels, height, width)` + suitable for input to the SAM decoder. + """ + # Get dimensions from the highest-level (lowest-resolution) feature map + batch_size = current_vision_features.size(1) + num_channels = self.hidden_dim + height, width = self.backbone_feature_sizes[-1] + device = current_vision_features.device + + # If memory is disabled (e.g., for single image SAM), return current features directly. + if self.num_maskmem == 0: + # Permute (SeqLen, Batch, Channels) -> (Batch, Channels, SeqLen) then view as (Batch, Channels, Height, Width) + # Assuming SeqLen = Height * Width for the last feature map + current_feature_map = current_vision_features.permute(1, 2, 0).view( + batch_size, num_channels, height, width + ) + return current_feature_map + + # Step 1: Handle initial conditioning frames + if is_initial_conditioning_frame: + # For initial conditioning frames, no prior memory is used directly in this block. + # If configured, directly add a learnable "no memory" embedding. + # current_vision_features has shape (SeqLen, Batch, Channels) + conditioned_feature_map_flat = current_vision_features + self.no_memory_embedding + # Reshape to (Batch, Channels, Height, Width) + conditioned_feature_map = conditioned_feature_map_flat.permute(1, 2, 0).view( + batch_size, num_channels, height, width + ) + return conditioned_feature_map + + # Step 2: Get memory frames and concatenate their features + temporal_positions_and_previous_outputs = self._gather_memory_frame_outputs( + inference_session, obj_idx, frame_idx, track_in_reverse_time + ) + + memories_to_concatenate, memory_positional_embeddings_to_concatenate = self._build_memory_attention_inputs( + temporal_positions_and_previous_outputs, device + ) + num_spatial_memory_tokens = len(memories_to_concatenate) + + # Step 3: Get and process object pointers + temporal_offsets, pointer_tokens, max_object_pointers_to_use = self._get_object_pointers( + inference_session, obj_idx, frame_idx, num_total_frames, device, track_in_reverse_time, streaming + ) + + num_object_pointer_tokens = 0 + if pointer_tokens: + object_pointers, object_pointers_pos_embed = self._process_object_pointers( + temporal_offsets, pointer_tokens, max_object_pointers_to_use, batch_size, num_channels, device + ) + + if object_pointers is not None: + memories_to_concatenate.append(object_pointers) + memory_positional_embeddings_to_concatenate.append(object_pointers_pos_embed) + num_object_pointer_tokens = object_pointers.shape[0] + + # Step 4: Concatenate all retrieved memories and their positional embeddings + combined_memory = torch.cat(memories_to_concatenate, dim=0) + combined_memory_positional_embeddings = torch.cat(memory_positional_embeddings_to_concatenate, dim=0) + + # Step 5: Forward through the memory attention mechanism + conditioned_feature_map_flat = self.memory_attention( + current_vision_features=current_vision_features, + 
current_vision_position_embeddings=current_vision_positional_embeddings, + memory=combined_memory, + memory_posision_embeddings=combined_memory_positional_embeddings, # Corrected typo from API + num_object_pointer_tokens=num_object_pointer_tokens, + num_spatial_memory_tokens=num_spatial_memory_tokens, + ) + + # Reshape from (Batch, H*W, Channels) to (Batch, Channels, Height, Width) + conditioned_feature_map = ( + conditioned_feature_map_flat.squeeze(1).permute(0, 2, 1).view(batch_size, num_channels, height, width) + ) + return conditioned_feature_map + + def _use_multimask(self, is_init_cond_frame: bool, point_inputs: Optional[dict]) -> bool: + """Whether to use multimask output in the SAM head.""" + num_pts = 0 if point_inputs is None else point_inputs["point_labels"].size(2) + multimask_output = ( + self.config.multimask_output_in_sam + and (is_init_cond_frame or self.config.multimask_output_for_tracking) + and (self.config.multimask_min_pt_num <= num_pts <= self.config.multimask_max_pt_num) + ) + return multimask_output + + def _run_single_frame_inference( + self, + inference_session: EdgeTamVideoInferenceSession, + frame_idx: int, + obj_idx: int, + batch_size: int, + is_init_cond_frame: bool, + point_inputs: Optional[torch.Tensor], + mask_inputs: Optional[torch.Tensor], + reverse: bool, + run_mem_encoder: bool, + prev_sam_mask_logits: Optional[torch.Tensor] = None, + streaming: bool = False, + ) -> dict[str, Any]: + """ + Perform a single tracking step for video object segmentation. + + Args: + inference_session (`EdgeTamVideoInferenceSession`): + The video inference session object. + frame_idx (`int`): + Index of the current frame. + obj_idx (`int`): + Index of the current object. + batch_size (`int`): + Batch size of the current frame. + is_init_cond_frame (`bool`): + Whether this is an initial conditioning frame with user inputs. + point_inputs (`dict`, *optional*): + Point prompt inputs for the current frame. + mask_inputs (`torch.Tensor`, *optional*): + Mask prompt inputs for the current frame. + reverse (`bool`, *optional*, defaults to `False`): + Whether to track in reverse time order. + run_mem_encoder (`bool`, *optional*, defaults to `True`): + Whether to run the memory encoder on predicted masks. + prev_sam_mask_logits (`torch.Tensor`, *optional*): + Previously predicted SAM mask logits that can be fed with new clicks. + streaming (`bool`, *optional*, defaults to `False`): + Whether this is streaming inference. + + Returns: + `dict`: Dictionary containing the tracking results for the current frame, including: + - pred_masks: Predicted low-resolution masks. + - object_pointer: Object pointer for memory. + - object_score_logits: Object score logits (inference only). + - maskmem_features: Memory features for future frames. + - maskmem_pos_enc: Memory positional encodings. 
+ """ + # Retrieve correct image features + current_vision_feats, current_vision_pos_embeds = self._prepare_vision_features( + inference_session, frame_idx, batch_size + ) + # point and mask should not appear as input simultaneously on the same frame + if point_inputs is not None and mask_inputs is not None: + raise ValueError( + "point_inputs and mask_inputs should not appear as input simultaneously on the same frame" + ) + # High-resolution feature maps for the SAM head, reshape (HW)BC => BCHW + if len(current_vision_feats) > 1: + high_res_features = [ + x.permute(1, 2, 0).view(x.size(1), x.size(2), *s) + for x, s in zip(current_vision_feats[:-1], self.backbone_feature_sizes[:-1]) + ] + else: + high_res_features = None + if mask_inputs is not None: + # We directly output the mask input (see it as a GT mask) without using a SAM prompt encoder + mask decoder. + pix_feat = current_vision_feats[-1].permute(1, 2, 0) + pix_feat = pix_feat.view(-1, self.hidden_dim, *self.backbone_feature_sizes[-1]) + sam_outputs = self._use_mask_as_output(pix_feat, high_res_features, mask_inputs) + else: + # fused the visual feature with previous memory features in the memory bank + pix_feat = self._prepare_memory_conditioned_features( + inference_session=inference_session, + frame_idx=frame_idx, + obj_idx=obj_idx, + is_initial_conditioning_frame=is_init_cond_frame, + current_vision_features=current_vision_feats[-1], + current_vision_positional_embeddings=current_vision_pos_embeds[-1], + num_total_frames=inference_session.num_frames, + track_in_reverse_time=reverse, + streaming=streaming, + ) + # apply SAM-style segmentation head + # here we might feed previously predicted low-res SAM mask logits into the SAM mask decoder, + # e.g. in demo where such logits come from earlier interaction instead of correction sampling + # (in this case, any `mask_inputs` shouldn't reach here as they are sent to _use_mask_as_output instead) + if prev_sam_mask_logits is not None: + mask_inputs = prev_sam_mask_logits + multimask_output = self._use_multimask(is_init_cond_frame, point_inputs) + sam_outputs = self._single_frame_forward( + pixel_values=None, # Vision features already computed + input_points=point_inputs["point_coords"] if point_inputs is not None else None, + input_labels=point_inputs["point_labels"] if point_inputs is not None else None, + input_masks=mask_inputs, + image_embeddings=high_res_features + [pix_feat], + multimask_output=multimask_output, + ) + + # Finally run the memory encoder on the predicted mask to encode + # it into a new memory feature (which will be used to condition vision features in future frames) + maskmem_features = None + maskmem_pos_enc = None + if run_mem_encoder and self.num_maskmem > 0: + maskmem_features, maskmem_pos_enc = self._encode_new_memory( + current_vision_feats=current_vision_feats[-1], + pred_masks_high_res=sam_outputs.high_res_masks, + object_score_logits=sam_outputs.object_score_logits, + is_mask_from_pts=(point_inputs is not None or mask_inputs is not None), + ) + + current_out = { + "pred_masks": sam_outputs.pred_masks, + "object_pointer": sam_outputs.object_pointer, + "maskmem_features": maskmem_features if maskmem_features is not None else None, + "maskmem_pos_enc": maskmem_pos_enc, + } + if not self.training: + current_out["object_score_logits"] = sam_outputs.object_score_logits + + return current_out + + def _encode_new_memory( + self, + current_vision_feats: torch.Tensor, + pred_masks_high_res: torch.Tensor, + object_score_logits: torch.Tensor, + is_mask_from_pts: 
bool, + ) -> tuple[torch.Tensor, list[torch.Tensor]]: + """Encode the current image and its prediction into a memory feature.""" + batch_size = current_vision_feats.size(1) # batch size on this frame + channels = self.hidden_dim + height, width = self.backbone_feature_sizes[-1] # top-level (lowest-resolution) feature size + # top-level feature, (HW)BC => BCHW + pix_feat = current_vision_feats.permute(1, 2, 0).view(batch_size, channels, height, width) + if is_mask_from_pts and not self.training: + # binarize the mask logits + mask_for_mem = (pred_masks_high_res > 0).to(pred_masks_high_res.dtype) + else: + # apply sigmoid on the raw mask logits to turn them into range (0, 1) + mask_for_mem = torch.sigmoid(pred_masks_high_res) + # apply scale and bias terms to the sigmoid probabilities + mask_for_mem = mask_for_mem * self.config.sigmoid_scale_for_mem_enc + mask_for_mem = mask_for_mem + self.config.sigmoid_bias_for_mem_enc + + maskmem_features, maskmem_pos_enc = self.memory_encoder( + pix_feat, + mask_for_mem, + ) + # add a no-object embedding to the spatial memory to indicate that the frame + # is predicted to be occluded (i.e. no object is appearing in the frame) + if self.occlusion_spatial_embedding_parameter is not None: + is_obj_appearing = (object_score_logits > 0).float() + maskmem_features += (1 - is_obj_appearing[..., None]) * self.occlusion_spatial_embedding_parameter[ + ..., None, None + ].expand(*maskmem_features.shape) + + maskmem_pos_enc = maskmem_pos_enc.to(pred_masks_high_res.dtype) + maskmem_features, maskmem_pos_enc = self.spatial_perceiver(maskmem_features, maskmem_pos_enc) + maskmem_features = maskmem_features.to(pred_masks_high_res.dtype) + maskmem_pos_enc = maskmem_pos_enc.to(pred_masks_high_res.dtype) + + return maskmem_features, maskmem_pos_enc + + @torch.inference_mode() + @auto_docstring( + custom_intro=""" + Propagate the objects through the video frames. Used when initializing an inference session with a whole video. + Yields EdgeTamVideoSegmentationOutput for each frame. + """ + ) + def propagate_in_video_iterator( + self, + inference_session: EdgeTamVideoInferenceSession, + start_frame_idx: Optional[int] = None, + max_frame_num_to_track: Optional[int] = None, + reverse: bool = False, + ) -> Iterator[EdgeTamVideoSegmentationOutput]: + r""" + inference_session (`EdgeTamVideoInferenceSession`): + The video inference session object. + start_frame_idx (`int`, *optional*): + The starting frame index for propagation. + Need to be provided if `forward` hasn't been called on new inputs yet. + If not provided, the starting frame index will be the earliest frame with input points. + max_frame_num_to_track (`int`, *optional*): + The maximum number of frames to track. + reverse (`bool`, *optional*, defaults to `False`): + Whether to propagate in reverse. + """ + num_frames = inference_session.num_frames + + # set start index, end index, and processing order + if start_frame_idx is None: + # default: start from the earliest frame with input points + frames_with_inputs = [ + frame_idx + for obj_output_dict in inference_session.output_dict_per_obj.values() + for frame_idx in obj_output_dict["cond_frame_outputs"] + ] + if not frames_with_inputs: + raise ValueError( + "Cannot determine the starting frame index; please specify it manually, or run inference on a frame with inputs first." 
+ ) + start_frame_idx = min(frames_with_inputs) + if max_frame_num_to_track is None: + # default: track all the frames in the video + max_frame_num_to_track = num_frames + if reverse: + end_frame_idx = max(start_frame_idx - max_frame_num_to_track, 0) + if start_frame_idx > 0: + processing_order = range(start_frame_idx, end_frame_idx - 1, -1) + else: + processing_order = [] # skip reverse tracking if starting from frame 0 + else: + end_frame_idx = min(start_frame_idx + max_frame_num_to_track, num_frames - 1) + processing_order = range(start_frame_idx, end_frame_idx + 1) + + for frame_idx in tqdm(processing_order, desc="propagate in video"): + edgetam_video_output = self(inference_session, frame_idx=frame_idx, reverse=reverse) + yield edgetam_video_output + + +__all__ = ["EdgeTamVideoModel", "EdgeTamVideoInferenceSession", "EdgeTamVideoPreTrainedModel"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/edgetam_video/modular_edgetam_video.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/edgetam_video/modular_edgetam_video.py new file mode 100644 index 0000000000000000000000000000000000000000..b520cd5a756bb18b99fb29dccce72a3d8fd47a64 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/edgetam_video/modular_edgetam_video.py @@ -0,0 +1,1243 @@ +# coding=utf-8 +# Copyright 2025 the HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
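The streaming `forward` and `propagate_in_video_iterator` entry points defined above are typically driven as in the rough sketch below; constructing the `EdgeTamVideoInferenceSession` and registering prompts on it are not shown in this diff, so those steps are only assumed here, and `frame_source` is a hypothetical iterator of frame tensors.

```python
# Minimal driver sketch (assumes `model` is an EdgeTamVideoModel and `inference_session`
# already holds video frames plus at least one point/box/mask prompt).

# Whole-video propagation: yields one EdgeTamVideoSegmentationOutput per processed frame.
video_masks = {}
for output in model.propagate_in_video_iterator(inference_session, reverse=False):
    video_masks[output.frame_idx] = output.pred_masks  # low-resolution mask logits per object

# Streaming variant: pass frames one at a time; forward() adds the frame to the session
# and tracks the registered objects on it.
# for frame in frame_source:  # hypothetical stream of preprocessed frame tensors
#     output = model(inference_session, frame=frame)
```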
+ +import math +from typing import Callable, Optional + +import torch +import torch.nn as nn +import torch.utils.checkpoint +from torch import Tensor + +from transformers.models.sam2.modeling_sam2 import ( + eager_attention_forward, + window_partition, +) +from transformers.utils.generic import OutputRecorder + +from ...activations import ACT2FN +from ...configuration_utils import PretrainedConfig +from ...modeling_flash_attention_utils import FlashAttentionKwargs +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS +from ...processing_utils import Unpack +from ...pytorch_utils import compile_compatible_method_lru_cache +from ...utils import ( + auto_docstring, +) +from ..auto import CONFIG_MAPPING, AutoConfig +from ..sam2_video.configuration_sam2_video import ( + Sam2VideoConfig, + Sam2VideoMaskDecoderConfig, + Sam2VideoPromptEncoderConfig, +) +from ..sam2_video.modeling_sam2_video import ( + Sam2VideoAttention, + Sam2VideoFeedForward, + Sam2VideoInferenceSession, + Sam2VideoLayerNorm, + Sam2VideoMemoryAttention, + Sam2VideoMemoryEncoder, + Sam2VideoMemoryFuserCXBlock, + Sam2VideoModel, + Sam2VideoPositionEmbeddingSine, + Sam2VideoPreTrainedModel, + Sam2VideoTwoWayAttentionBlock, + Sam2VideoVisionEncoderOutput, + Sam2VideoVisionRotaryEmbedding, + rotate_pairwise, +) + + +class EdgeTamVideoPromptEncoderConfig(Sam2VideoPromptEncoderConfig): + pass + + +class EdgeTamVideoMaskDecoderConfig(Sam2VideoMaskDecoderConfig): + pass + + +class EdgeTamVideoConfig(Sam2VideoConfig): + r""" + [`EdgeTamVideoConfig`] is the configuration class to store the configuration of a [`EdgeTamVideoModel`]. It is used to instantiate a + EDGETAM model according to the specified arguments, defining the memory attention, memory encoder, and image encoder + configs. Instantiating a configuration defaults will yield a similar configuration to that of the SAM 2.1 Hiera-tiny + [facebook/EdgeTAM](https://huggingface.co/facebook/EdgeTAM) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vision_config (Union[`dict`, `EdgeTamVideoVisionConfig`], *optional*): + Dictionary of configuration options used to initialize [`EdgeTamVideoVisionConfig`]. + prompt_encoder_config (Union[`dict`, `EdgeTamVideoPromptEncoderConfig`], *optional*): + Dictionary of configuration options used to initialize [`EdgeTamVideoPromptEncoderConfig`]. + mask_decoder_config (Union[`dict`, `EdgeTamVideoMaskDecoderConfig`], *optional*): + Dictionary of configuration options used to initialize [`EdgeTamMaskDecoderConfig`]. + initializer_range (`float`, *optional*, defaults to 0.02): + Standard deviation for parameter initialization. + num_maskmem (`int`, *optional*, defaults to 7): + The number of memory slots for the mask memory. + image_size (`int`, *optional*, defaults to 1024): + The size of the input images. + sigmoid_scale_for_mem_enc (`float`, *optional*, defaults to 20.0): + Scale factor for the sigmoid function in the memory encoder. + sigmoid_bias_for_mem_enc (`float`, *optional*, defaults to -10.0): + Bias for the sigmoid function in the memory encoder. + enable_occlusion_spatial_embedding (`bool`, *optional*, defaults to `True`): + Whether to enable spatial embedding for occlusions. + multimask_output_in_sam (`bool`, *optional*, defaults to `True`): + Whether to output multiple masks from the SAM head. 
+ multimask_min_pt_num (`int`, *optional*, defaults to 0): + The minimum number of points to trigger multimask output. + multimask_max_pt_num (`int`, *optional*, defaults to 1): + The maximum number of points to trigger multimask output. + multimask_output_for_tracking (`bool`, *optional*, defaults to `True`): + Whether to use multimask output for tracking. + max_object_pointers_in_encoder (`int`, *optional*, defaults to 16): + The maximum number of object pointers in the encoder. + enable_temporal_pos_encoding_for_object_pointers (`bool`, *optional*, defaults to `True`): + Whether to enable temporal positional encoding for object pointers. + memory_attention_hidden_size (`int`, *optional*, defaults to 256): + Dimensionality of the memory attention hidden states. + memory_attention_num_layers (`int`, *optional*, defaults to 2): + The number of layers in the memory attention module. + memory_attention_num_attention_heads (`int`, *optional*, defaults to 1): + Number of attention heads for each attention layer in the memory attention. + memory_attention_downsample_rate (`int`, *optional*, defaults to 1): + The downsample rate for the attention layers. + memory_attention_mlp_hidden_size (`int`, *optional*, defaults to 2048): + The dimension of the feedforward network in the memory attention module. + memory_attention_mlp_hidden_act (`str`, *optional*, defaults to `"relu"`): + The non-linear activation function in the feedforward network in the memory attention module. + memory_attention_dropout (`float`, *optional*, defaults to 0.1): + The dropout rate for the memory attention module. + memory_attention_rope_theta (`float`, *optional*, defaults to 10000): + The Rope theta parameter. + memory_attention_rope_feat_sizes (`Tuple[int, int]`, *optional*, defaults to `[64, 64]`): + The feature sizes for the Rope positional encoding. + memory_attention_rope_k_sizes (`List[int]`, *optional*, defaults to `[16, 16]`): + The key feature sizes for the RoPE positional encoding in memory attention. + memory_attention_rope_dropout (`float`, *optional*, defaults to 0.1): + The dropout rate for the Rope positional encoding. + perceiver_resampler_num_latents (`int`, *optional*, defaults to 256): + The number of 1D latent tokens in the perceiver resampler. + perceiver_resampler_num_latents_2d (`int`, *optional*, defaults to 256): + The number of 2D latent tokens in the perceiver resampler. + perceiver_resampler_hidden_size (`int`, *optional*, defaults to 64): + The hidden size of the perceiver resampler. + perceiver_resampler_mlp_intermediate_size (`int`, *optional*, defaults to 256): + The intermediate size of the feedforward network in the perceiver resampler. + perceiver_resampler_num_attention_heads (`int`, *optional*, defaults to 1): + The number of attention heads in the perceiver resampler. + perceiver_resampler_attention_head_dim (`int`, *optional*, defaults to 64): + The dimension of each attention head in the perceiver resampler. + perceiver_resampler_num_layers (`int`, *optional*, defaults to 2): + The number of layers in the perceiver resampler. + perceiver_resampler_hidden_dropout (`float`, *optional*, defaults to 0.0): + The dropout rate for the hidden layers in the perceiver resampler. + perceiver_resampler_attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout rate for the attention layers in the perceiver resampler. + memory_encoder_hidden_size (`int`, *optional*, defaults to 256): + Dimensionality of the memory encoder hidden states. 
+ memory_encoder_output_channels (`int`, *optional*, defaults to 64): + The number of output channels for the memory encoder. + mask_downsampler_embed_dim (`int`, *optional*, defaults to 256): + The dimension of the mask downsampler embedding. + memory_fuser_intermediate_dim (`int`, *optional*, defaults to 1024): + The intermediate dimension of the memory fuser feedforward network. + mask_downsampler_kernel_size (`int`, *optional*, defaults to 3): + The kernel size for the mask downsampler. + mask_downsampler_stride (`int`, *optional*, defaults to 2): + The stride for the mask downsampler. + mask_downsampler_padding (`int`, *optional*, defaults to 1): + The padding for the mask downsampler. + mask_downsampler_total_stride (`int`, *optional*, defaults to 16): + The total stride for the mask downsampler. + mask_downsampler_hidden_act (`str`, *optional*, defaults to `"gelu"`): + The non-linear activation function in the mask downsampler. + memory_fuser_num_layers (`int`, *optional*, defaults to 2): + The number of layers in the memory fuser. + memory_fuser_embed_dim (`int`, *optional*, defaults to 256): + The dimension of the memory fuser embedding. + memory_fuser_kernel_size (`int`, *optional*, defaults to 7): + The kernel size for the memory fuser. + memory_fuser_padding (`int`, *optional*, defaults to 3): + The padding for the memory fuser. + memory_fuser_layer_scale_init_value (`float`, *optional*, defaults to 1e-06): + The initial value for the layer scale in the memory fuser. + memory_fuser_hidden_act (`str`, *optional*, defaults to `"gelu"`): + The non-linear activation function in the memory fuser. + + Example: + + ```python + >>> from transformers import ( + ... EdgeTamVisionConfig, + ... EdgeTamVideoPromptEncoderConfig, + ... EdgeTamVideoMaskDecoderConfig, + ... EdgeTamVideoModel, + ... EdgeTamVideoConfig, + ... 
) + + >>> # Initializing a EdgeTamVideoConfig with `"facebook/edgetam.1_hiera_tiny"` style configuration + >>> configuration = EdgeTamVideoConfig() + + >>> # Initializing a EdgeTamVideoModel (with random weights) from the `"facebook/edgetam.1_hiera_tiny"` style configuration + >>> model = EdgeTamVideoModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + + >>> # We can also initialize a EdgeTamConfig from a EdgeTamVisionConfig, EdgeTamPromptEncoderConfig, and EdgeTamMaskDecoderConfig + + >>> # Initializing EDGETAM vision encoder, memory attention, and memory encoder configurations + >>> vision_config = EdgeTamVisionConfig() + >>> prompt_encoder_config = EdgeTamVideoPromptEncoderConfig() + >>> mask_decoder_config = EdgeTamVideoMaskDecoderConfig() + + >>> config = EdgeTamVideoConfig(vision_config, prompt_encoder_config, mask_decoder_config) + ```""" + + model_type = "edgetam_video" + sub_configs = { + "vision_config": AutoConfig, + "prompt_encoder_config": EdgeTamVideoPromptEncoderConfig, + "mask_decoder_config": EdgeTamVideoMaskDecoderConfig, + } + + def __init__( + self, + vision_config=None, + prompt_encoder_config=None, + mask_decoder_config=None, + initializer_range=0.02, + num_maskmem=7, + image_size=1024, + sigmoid_scale_for_mem_enc=20.0, + sigmoid_bias_for_mem_enc=-10.0, + enable_occlusion_spatial_embedding=True, + multimask_output_in_sam=True, + multimask_min_pt_num=0, + multimask_max_pt_num=1, + multimask_output_for_tracking=True, + max_object_pointers_in_encoder=16, + enable_temporal_pos_encoding_for_object_pointers=True, + # memory attention + memory_attention_hidden_size=256, + memory_attention_num_layers=2, + memory_attention_num_attention_heads=1, + memory_attention_downsample_rate=1, + memory_attention_mlp_hidden_size=2048, + memory_attention_mlp_hidden_act="relu", + memory_attention_dropout=0.1, + memory_attention_rope_theta=10000, + memory_attention_rope_feat_sizes=None, + memory_attention_rope_k_sizes=None, + memory_attention_rope_dropout=0.1, + # spatial perceiver resampler + perceiver_resampler_num_latents=256, + perceiver_resampler_num_latents_2d=256, + perceiver_resampler_hidden_size=64, + perceiver_resampler_mlp_intermediate_size=256, + perceiver_resampler_num_attention_heads=1, + perceiver_resampler_attention_head_dim=64, + perceiver_resampler_num_layers=2, + perceiver_resampler_hidden_dropout=0.0, + perceiver_resampler_attention_dropout=0.0, + # memory encoder + memory_encoder_hidden_size=256, + memory_encoder_output_channels=64, + mask_downsampler_embed_dim=256, + memory_fuser_intermediate_dim=1024, + mask_downsampler_kernel_size=3, + mask_downsampler_stride=2, + mask_downsampler_padding=1, + mask_downsampler_total_stride=16, + mask_downsampler_hidden_act="gelu", + memory_fuser_num_layers=2, + memory_fuser_embed_dim=256, + memory_fuser_kernel_size=7, + memory_fuser_padding=3, + memory_fuser_layer_scale_init_value=1e-6, + memory_fuser_hidden_act="gelu", + **kwargs, + ): + PretrainedConfig.__init__(**kwargs) + vision_config = vision_config if vision_config is not None else {} + prompt_encoder_config = prompt_encoder_config if prompt_encoder_config is not None else {} + mask_decoder_config = mask_decoder_config if mask_decoder_config is not None else {} + memory_attention_rope_feat_sizes = ( + [64, 64] if memory_attention_rope_feat_sizes is None else memory_attention_rope_feat_sizes + ) + memory_attention_rope_k_sizes = ( + [16, 16] if memory_attention_rope_k_sizes is None else memory_attention_rope_k_sizes + ) + + if 
isinstance(vision_config, dict): + vision_config["model_type"] = vision_config.get("model_type", "sam2_vision_model") + vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) + if isinstance(prompt_encoder_config, EdgeTamVideoPromptEncoderConfig): + prompt_encoder_config = prompt_encoder_config.to_dict() + if isinstance(mask_decoder_config, EdgeTamVideoMaskDecoderConfig): + mask_decoder_config = mask_decoder_config.to_dict() + + self.vision_config = vision_config + self.prompt_encoder_config = EdgeTamVideoPromptEncoderConfig(**prompt_encoder_config) + self.mask_decoder_config = EdgeTamVideoMaskDecoderConfig(**mask_decoder_config) + + self.initializer_range = initializer_range + self.num_maskmem = num_maskmem # default 1 input frame + 6 previous frames + self.image_size = image_size + self.sigmoid_scale_for_mem_enc = sigmoid_scale_for_mem_enc # scale factor for mask sigmoid prob + self.sigmoid_bias_for_mem_enc = sigmoid_bias_for_mem_enc # bias factor for mask sigmoid prob + self.enable_occlusion_spatial_embedding = enable_occlusion_spatial_embedding + self.multimask_output_in_sam = multimask_output_in_sam + self.multimask_min_pt_num = multimask_min_pt_num + self.multimask_max_pt_num = multimask_max_pt_num + self.multimask_output_for_tracking = multimask_output_for_tracking + self.max_object_pointers_in_encoder = max_object_pointers_in_encoder + self.enable_temporal_pos_encoding_for_object_pointers = enable_temporal_pos_encoding_for_object_pointers + + # memory attention + self.memory_attention_hidden_size = memory_attention_hidden_size + self.memory_attention_num_layers = memory_attention_num_layers + self.memory_attention_num_attention_heads = memory_attention_num_attention_heads + self.memory_attention_downsample_rate = memory_attention_downsample_rate + self.memory_attention_mlp_hidden_size = memory_attention_mlp_hidden_size + self.memory_attention_mlp_hidden_act = memory_attention_mlp_hidden_act + self.memory_attention_dropout = memory_attention_dropout + self.memory_attention_rope_theta = memory_attention_rope_theta + self.memory_attention_rope_feat_sizes = memory_attention_rope_feat_sizes + self.memory_attention_rope_k_sizes = memory_attention_rope_k_sizes + self.memory_attention_rope_dropout = memory_attention_rope_dropout + + # spatial perceiver resampler + self.perceiver_resampler_num_latents = perceiver_resampler_num_latents + self.perceiver_resampler_num_latents_2d = perceiver_resampler_num_latents_2d + self.perceiver_resampler_hidden_size = perceiver_resampler_hidden_size + self.perceiver_resampler_mlp_intermediate_size = perceiver_resampler_mlp_intermediate_size + self.perceiver_resampler_attention_head_dim = perceiver_resampler_attention_head_dim + self.perceiver_resampler_num_attention_heads = perceiver_resampler_num_attention_heads + self.perceiver_resampler_num_layers = perceiver_resampler_num_layers + self.perceiver_resampler_hidden_dropout = perceiver_resampler_hidden_dropout + self.perceiver_resampler_attention_dropout = perceiver_resampler_attention_dropout + + # memory encoder + self.memory_encoder_hidden_size = memory_encoder_hidden_size + self.memory_encoder_output_channels = memory_encoder_output_channels + self.mask_downsampler_embed_dim = mask_downsampler_embed_dim + self.mask_downsampler_kernel_size = mask_downsampler_kernel_size + self.mask_downsampler_stride = mask_downsampler_stride + self.mask_downsampler_padding = mask_downsampler_padding + self.mask_downsampler_total_stride = mask_downsampler_total_stride + 
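+        # The mask downsampler is expected to reach `mask_downsampler_total_stride` by stacking
+        # convolutions of stride `mask_downsampler_stride` (2**4 == 16 with the defaults), so the two
+        # values should be kept consistent if either is overridden.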
self.mask_downsampler_hidden_act = mask_downsampler_hidden_act + self.memory_fuser_num_layers = memory_fuser_num_layers + self.memory_fuser_embed_dim = memory_fuser_embed_dim + self.memory_fuser_intermediate_dim = memory_fuser_intermediate_dim + self.memory_fuser_kernel_size = memory_fuser_kernel_size + self.memory_fuser_padding = memory_fuser_padding + self.memory_fuser_layer_scale_init_value = memory_fuser_layer_scale_init_value + self.memory_fuser_hidden_act = memory_fuser_hidden_act + + +class EdgeTamVideoLayerNorm(Sam2VideoLayerNorm): + pass + + +class EdgeTamVideoMemoryFuserCXBlock(Sam2VideoMemoryFuserCXBlock): + pass + + +class EdgeTamVideoVisionEncoderOutput(Sam2VideoVisionEncoderOutput): + pass + + +class EdgeTamVideoVisionRotaryEmbedding(Sam2VideoVisionRotaryEmbedding): + def __init__(self, config: EdgeTamVideoConfig, end_x: Optional[int] = None, end_y: Optional[int] = None): + nn.Module.__init__() + dim = config.memory_attention_hidden_size // ( + config.memory_attention_downsample_rate * config.memory_attention_num_attention_heads + ) + # Ensure even dimension for proper axial splitting + if dim % 4 != 0: + raise ValueError("Dimension must be divisible by 4 for axial RoPE") + end_x, end_y = config.memory_attention_rope_feat_sizes if end_x is None else (end_x, end_y) + freqs = 1.0 / (config.memory_attention_rope_theta ** (torch.arange(0, dim, 4)[: (dim // 4)].float() / dim)) + + # Generate 2D position indices for axial rotary embedding + flattened_indices = torch.arange(end_x * end_y, dtype=torch.long) + x_positions = flattened_indices % end_x + y_positions = torch.div(flattened_indices, end_x, rounding_mode="floor") + freqs_x = torch.outer(x_positions, freqs).float() + freqs_y = torch.outer(y_positions, freqs).float() + inv_freq = torch.cat([freqs_x, freqs_y], dim=-1) + inv_freq = inv_freq.repeat_interleave(2, dim=-1) + # directly register the cos and sin embeddings as we have a fixed feature shape + self.register_buffer("rope_embeddings_cos", inv_freq.cos(), persistent=False) + self.register_buffer("rope_embeddings_sin", inv_freq.sin(), persistent=False) + + +class EdgeTamVideoAttention(Sam2VideoAttention): + pass + + +def apply_rotary_pos_emb_2d_self_attn( + q: torch.Tensor, + k: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, +) -> tuple[torch.Tensor, torch.Tensor]: + """ + Apply rotary position embedding to query and key tensors for self-attention. + + Args: + q: Query tensor of shape (..., seq_len, head_dim) + k: Key tensor of shape (..., seq_len, head_dim) + cos: Cosine position embedding of shape (seq_len, head_dim) + sin: Sine position embedding of shape (seq_len, head_dim) + + Returns: + Rotated (q, k) tensors + """ + # Apply RoPE to queries + q_embed = q.float() # force upscale to float32 as in the original implementation + q_embed = (q_embed * cos) + (rotate_pairwise(q_embed) * sin) + + # Apply RoPE to keys (same embeddings as queries for self-attention) + k_embed = k.float() # force upscale to float32 as in the original implementation + k_embed = (k_embed * cos) + (rotate_pairwise(k_embed) * sin) + + return q_embed.type_as(q), k_embed.type_as(k) + + +def apply_rotary_pos_emb_2d_cross_attn( + q: torch.Tensor, + k: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + cos_k: torch.Tensor, + sin_k: torch.Tensor, + num_k_exclude_rope: int = 0, + repeat_freqs_k: int = 1, +) -> tuple[torch.Tensor, torch.Tensor]: + """ + Apply rotary position embedding to query and key tensors for cross-attention. 
+ + Args: + q: Query tensor of shape (..., seq_len, head_dim) + k: Key tensor of shape (..., seq_len, head_dim) + cos: Cosine position embedding of shape (seq_len, head_dim) + sin: Sine position embedding of shape (seq_len, head_dim) + cos_k: Cosine position embedding for keys of shape (seq_len, head_dim) + sin_k: Sine position embedding for keys of shape (seq_len, head_dim) + num_k_exclude_rope: Number of tokens at end of k to exclude from RoPE (e.g., object pointer tokens) + repeat_freqs_k: Frequency repetition for keys in cross-attention (e.g., for spatial memory tokens) + + Returns: + Rotated (q, k) tensors + """ + # Apply RoPE to queries (always straightforward) + q_embed = q.float() + q_embed = (q_embed * cos) + (rotate_pairwise(q_embed) * sin) + + # Split keys: RoPE tokens and excluded tokens (e.g., object pointers) + num_total_k_tokens = k.shape[-2] + k_for_rope = k[..., : num_total_k_tokens - num_k_exclude_rope, :] + k_excluded = k[..., num_total_k_tokens - num_k_exclude_rope :, :] + + # Early return if no keys need RoPE + if k_for_rope.shape[-2] == 0: + return q_embed.type_as(q), k_excluded + + batch_size, num_heads, k_seq_len, channels_per_head = k_for_rope.shape + + # Handle temporal/spatial token structure for memory + # Keys have temporal + spatial structure, only spatial tokens get RoPE + tokens_per_group = k_seq_len // repeat_freqs_k + spatial_tokens = cos_k.shape[-2] + temporal_tokens = tokens_per_group - spatial_tokens + + # Reshape and separate temporal/spatial tokens + k_grouped = k_for_rope.view(batch_size, num_heads, repeat_freqs_k, tokens_per_group, channels_per_head) + k_temporal = k_grouped[..., :temporal_tokens, :].reshape(batch_size, num_heads, -1, channels_per_head) + k_spatial = k_grouped[..., temporal_tokens:, :].reshape(batch_size, num_heads, -1, channels_per_head) + + # Only apply RoPE to spatial tokens + k_rope_input = k_spatial + + # Prepare position embeddings for repeated groups + if repeat_freqs_k > 1: + cos_k = cos_k.repeat(1, 1, repeat_freqs_k, 1) + sin_k = sin_k.repeat(1, 1, repeat_freqs_k, 1) + + # Apply RoPE to spatial tokens + k_spatial_embed = k_rope_input.float() + k_spatial_embed = (k_spatial_embed * cos_k) + (rotate_pairwise(k_spatial_embed) * sin_k) + + # Reconstruct: temporal + spatial tokens back to original structure + k_spatial_reshaped = k_spatial_embed.view(batch_size, num_heads, repeat_freqs_k, -1, channels_per_head) + k_temporal_reshaped = k_temporal.view(batch_size, num_heads, repeat_freqs_k, -1, channels_per_head) + k_final = torch.cat([k_temporal_reshaped, k_spatial_reshaped], dim=3) + k_final = k_final.view(batch_size, num_heads, k_seq_len, channels_per_head) + + # Combine RoPE-processed keys with excluded tokens + k_embed = torch.cat([k_final.type_as(k), k_excluded], dim=-2) + return q_embed.type_as(q), k_embed + + +class EdgeTamVideoRoPESelfAttention(nn.Module): + """Self-attention with rotary position encoding.""" + + def __init__(self, config: EdgeTamVideoConfig): + super().__init__() + self.config = config + self.hidden_size = config.memory_attention_hidden_size + self.internal_dim = self.hidden_size // config.memory_attention_downsample_rate + self.num_attention_heads = config.memory_attention_num_attention_heads + self.head_dim = self.internal_dim // config.memory_attention_num_attention_heads + self.scaling = self.head_dim**-0.5 + self.is_causal = False + + self.q_proj = nn.Linear(self.hidden_size, self.internal_dim) + self.k_proj = nn.Linear(self.hidden_size, self.internal_dim) + self.v_proj = nn.Linear(self.hidden_size, 
self.internal_dim) + self.o_proj = nn.Linear(self.internal_dim, self.hidden_size) + self.dropout_p = config.memory_attention_rope_dropout + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + **kwargs: Unpack[FlashAttentionKwargs], + ) -> Tensor: + # Input projections + batch_size, point_batch_size = query.shape[:2] + new_shape = (batch_size * point_batch_size, -1, self.num_attention_heads, self.head_dim) + + query = self.q_proj(query).view(*new_shape).transpose(1, 2) + key = self.k_proj(key).view(*new_shape).transpose(1, 2) + value = self.v_proj(value).view(*new_shape).transpose(1, 2) + + cos, sin = position_embeddings + # Apply rotary position encoding for self-attention + query, key = apply_rotary_pos_emb_2d_self_attn(query, key, cos=cos, sin=sin) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query, + key, + value, + attention_mask=None, + dropout=0.0 if not self.training else self.dropout_p, + scaling=self.scaling, + is_causal=self.is_causal, + **kwargs, + ) + attn_output = attn_output.reshape( + batch_size, point_batch_size, -1, self.num_attention_heads * self.head_dim + ).contiguous() + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights + + +class EdgeTamVideoRoPECrossAttention(nn.Module): + """Cross-attention with rotary position encoding.""" + + def __init__(self, config: EdgeTamVideoConfig, kv_in_dim: int): + super().__init__() + self.config = config + self.hidden_size = config.memory_attention_hidden_size + self.internal_dim = self.hidden_size // config.memory_attention_downsample_rate + self.num_attention_heads = config.memory_attention_num_attention_heads + self.head_dim = self.internal_dim // config.memory_attention_num_attention_heads + self.scaling = self.head_dim**-0.5 + self.is_causal = False + + self.kv_in_dim = kv_in_dim + + self.q_proj = nn.Linear(self.hidden_size, self.internal_dim) + self.k_proj = nn.Linear(self.kv_in_dim, self.internal_dim) + self.v_proj = nn.Linear(self.kv_in_dim, self.internal_dim) + self.o_proj = nn.Linear(self.internal_dim, self.hidden_size) + self.dropout_p = config.memory_attention_rope_dropout + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + position_embeddings_k: tuple[torch.Tensor, torch.Tensor], + num_k_exclude_rope: int = 0, + rope_k_repeat: int = 0, + **kwargs: Unpack[FlashAttentionKwargs], + ) -> Tensor: + # Input projections + batch_size, point_batch_size = query.shape[:2] + new_shape = (batch_size * point_batch_size, -1, self.num_attention_heads, self.head_dim) + + query = self.q_proj(query).view(*new_shape).transpose(1, 2) + key = self.k_proj(key).view(*new_shape).transpose(1, 2) + value = self.v_proj(value).view(*new_shape).transpose(1, 2) + + cos, sin = position_embeddings + cos_k, sin_k = position_embeddings_k + # Apply rotary position encoding for cross-attention + query, key = apply_rotary_pos_emb_2d_cross_attn( + query, + key, + cos=cos, + sin=sin, + cos_k=cos_k, + sin_k=sin_k, + repeat_freqs_k=rope_k_repeat, + num_k_exclude_rope=num_k_exclude_rope, + ) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = 
ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query, + key, + value, + attention_mask=None, + dropout=0.0 if not self.training else self.dropout_p, + scaling=self.scaling, + is_causal=self.is_causal, + **kwargs, + ) + attn_output = attn_output.reshape( + batch_size, point_batch_size, -1, self.num_attention_heads * self.head_dim + ).contiguous() + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights + + +class EdgeTamVideoTwoWayAttentionBlock(Sam2VideoTwoWayAttentionBlock): + pass + + +class EdgeTamVideoPositionEmbeddingSine(Sam2VideoPositionEmbeddingSine): + # maxsize=2 because we need to cache the forward method for both memory encoder and perceiver resampler + @compile_compatible_method_lru_cache(maxsize=2) + def forward(self, **super_kwargs): + return super().forward(**super_kwargs) + + +class EdgeTamVideoMemoryEncoder(Sam2VideoMemoryEncoder): + pass + + +class EdgeTamVideoFeedForward(Sam2VideoFeedForward): + pass + + +class EdgeTamVideoPreTrainedModel(Sam2VideoPreTrainedModel): + pass + + +class EdgeTamVideoInferenceSession(Sam2VideoInferenceSession): + pass + + +class EdgeTamVideoMemoryAttentionMLP(nn.Module): + def __init__(self, config: EdgeTamVideoConfig): + super().__init__() + self.config = config + self.hidden_size = config.memory_attention_hidden_size + self.intermediate_size = config.memory_attention_mlp_hidden_size + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size) + self.dropout = nn.Dropout(config.memory_attention_dropout) + self.act_fn = ACT2FN[config.memory_attention_mlp_hidden_act] + + def forward(self, x): + return self.down_proj(self.dropout(self.act_fn(self.up_proj(x)))) + + +class EdgeTamVideoMemoryAttentionLayer(nn.Module): + def __init__(self, config: EdgeTamVideoConfig): + super().__init__() + hidden_size = config.memory_attention_hidden_size + self.self_attn = EdgeTamVideoRoPESelfAttention(config) + self.cross_attn_image = EdgeTamVideoRoPECrossAttention(config, kv_in_dim=64) + + # MLP module + self.mlp = EdgeTamVideoMemoryAttentionMLP(config) + + self.layer_norm1 = nn.LayerNorm(hidden_size) + self.layer_norm2 = nn.LayerNorm(hidden_size) + self.layer_norm3 = nn.LayerNorm(hidden_size) + self.dropout1 = nn.Dropout(config.memory_attention_dropout) + self.dropout2 = nn.Dropout(config.memory_attention_dropout) + self.dropout3 = nn.Dropout(config.memory_attention_dropout) + + def forward( + self, + queries: Tensor, + keys: Tensor, + key_point_embedding: Tensor, + rope_position_embeddings: tuple[Tensor, Tensor], + rope_position_embeddings_k: Optional[tuple[Tensor, Tensor]] = None, + num_k_exclude_rope: int = 0, + rope_k_repeat: int = 0, + ) -> torch.Tensor: + # Self-Attention + query = self.layer_norm1(queries) + query, _ = self.self_attn(query=query, key=query, value=query, position_embeddings=rope_position_embeddings) + queries = queries + self.dropout1(query) + + # Cross-Attention + query = self.layer_norm2(queries) + query, _ = self.cross_attn_image( + query=query, + key=keys + key_point_embedding, + value=keys, + position_embeddings=rope_position_embeddings, + position_embeddings_k=rope_position_embeddings_k, + num_k_exclude_rope=num_k_exclude_rope, + rope_k_repeat=rope_k_repeat, + ) + queries = queries + self.dropout2(query) + # MLP + query = self.layer_norm3(queries) + query = self.mlp(query) + queries = queries + self.dropout3(query) + return queries + + +class 
EdgeTamVideoMemoryAttention(Sam2VideoMemoryAttention): + def __init__(self, config: EdgeTamVideoConfig): + super().__init__() + self.rotary_emb_k = EdgeTamVideoVisionRotaryEmbedding( + config, end_x=config.memory_attention_rope_k_sizes[0], end_y=config.memory_attention_rope_k_sizes[1] + ) + + def forward( + self, + current_vision_features: torch.Tensor, + memory: torch.Tensor, + current_vision_position_embeddings: Optional[Tensor] = None, + memory_posision_embeddings: Optional[Tensor] = None, + num_object_pointer_tokens: int = 0, + num_spatial_memory_tokens: int = -1, + ): + """ + Args: + current_vision_features (`torch.FloatTensor`): + The current vision features used for self-attention. + memory (`torch.FloatTensor`): + The memory features used for cross-attention. + current_vision_position_embeddings (`torch.FloatTensor`, *optional*): + The position embeddings for the current vision features. + memory_posision_embeddings (`torch.FloatTensor`, *optional*): + The position embeddings for the memory features. + num_object_pointer_tokens (`int`, *optional*, defaults to 0): + The number of object pointer tokens. + """ + output = current_vision_features + if current_vision_position_embeddings is not None: + output = output + 0.1 * current_vision_position_embeddings + + # Convert to batch first + output = output.transpose(0, 1) + memory = memory.transpose(0, 1).unsqueeze(1) + memory_posision_embeddings = memory_posision_embeddings.transpose(0, 1).unsqueeze(1) + rope_position_embeddings = self.rotary_emb() + rope_position_embeddings_k = self.rotary_emb_k() + for layer in self.layers: + output = layer( + queries=output.unsqueeze(1) if output.ndim == 3 else output, + keys=memory, + key_point_embedding=memory_posision_embeddings, + rope_position_embeddings=rope_position_embeddings, + rope_position_embeddings_k=rope_position_embeddings_k, + num_k_exclude_rope=num_object_pointer_tokens, + rope_k_repeat=num_spatial_memory_tokens, + ) + + normed_output = self.layer_norm(output) + + # Convert back to seq first + normed_output = normed_output.transpose(0, 1) + + return normed_output + + +class EdgeTamVideoPerceiverMLP(nn.Module): + def __init__(self, config: EdgeTamVideoConfig): + super().__init__() + self.hidden_size = config.perceiver_resampler_hidden_size + self.intermediate_size = config.perceiver_resampler_mlp_intermediate_size + + self.layer_norm = nn.LayerNorm(self.hidden_size) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.act_fn = nn.GELU() + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.layer_norm(hidden_states) + hidden_states = self.down_proj(self.act_fn(self.up_proj(hidden_states))) + return hidden_states + + +class EdgeTamVideoPerceiverAttention(nn.Module): + def __init__(self, config: EdgeTamVideoConfig): + super().__init__() + self.config = config + self.hidden_size = config.perceiver_resampler_hidden_size + self.num_attention_heads = config.perceiver_resampler_num_attention_heads + self.head_dim = config.perceiver_resampler_attention_head_dim + self.attention_dropout = config.perceiver_resampler_attention_dropout + + self.inner_dim = self.head_dim * self.num_attention_heads + self.scaling = self.head_dim**-0.5 + self.is_causal = False + + self.q_proj = nn.Linear(self.hidden_size, self.inner_dim, bias=False) + self.k_proj = nn.Linear(self.hidden_size, self.inner_dim, bias=False) + self.v_proj = nn.Linear(self.hidden_size, 
self.inner_dim, bias=False) + self.o_proj = nn.Linear(self.inner_dim, self.hidden_size, bias=False) + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + positional_encoding: Optional[torch.Tensor] = None, + **kwargs, + ) -> torch.Tensor: + # Project queries, keys, and values + query = self.q_proj(query) + key = self.k_proj(key) + value = self.v_proj(value) + + # Reshape for multi-head attention + batch_size, seq_len_q = query.shape[:2] + query = query.view(batch_size, seq_len_q, self.num_attention_heads, self.head_dim).transpose(1, 2) + seq_len_kv = key.shape[1] + key = key.view(batch_size, seq_len_kv, self.num_attention_heads, self.head_dim).transpose(1, 2) + value = value.view(batch_size, seq_len_kv, self.num_attention_heads, self.head_dim).transpose(1, 2) + + # Add positional encoding if provided + if positional_encoding is not None: + pos_encoding = positional_encoding.view( + batch_size, seq_len_kv, self.num_attention_heads, self.head_dim + ).transpose(1, 2) + key = key + pos_encoding + value = value + pos_encoding + + # Apply attention + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, _ = attention_interface( + self, + query, + key, + value, + attention_mask=None, + dropout=0.0 if not self.training else self.attention_dropout, + scaling=self.scaling, + is_causal=self.is_causal, + **kwargs, + ) + + # Reshape output + attn_output = attn_output.transpose(1, 2).contiguous().view(batch_size, seq_len_q, self.inner_dim) + return self.o_proj(attn_output) + + +class EdgeTamVideoPerceiverEncoderLayer(nn.Module): + def __init__(self, config: EdgeTamVideoConfig): + super().__init__() + + self.cross_attention = EdgeTamVideoPerceiverAttention(config) + self.mlp = EdgeTamVideoPerceiverMLP(config) + self.dropout = nn.Dropout(config.perceiver_resampler_hidden_dropout) + + self.self_attention = EdgeTamVideoPerceiverAttention(config) + self.self_mlp = EdgeTamVideoPerceiverMLP(config) + + # Layer norms moved from attention classes to here + self.layer_norm_input = nn.LayerNorm(config.perceiver_resampler_hidden_size) + self.layer_norm_latents = nn.LayerNorm(config.perceiver_resampler_hidden_size) + self.layer_norm_self = nn.LayerNorm(config.perceiver_resampler_hidden_size) + + def forward( + self, + latents: torch.Tensor, + input_features: torch.Tensor, + positional_encoding: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + # Cross attention with layer norms + normalized_latents = self.layer_norm_latents(latents) + normalized_input = self.layer_norm_input(input_features) + cross_attention_output = self.cross_attention( + query=normalized_latents, + key=normalized_input, + value=normalized_input, + positional_encoding=positional_encoding, + ) + latents = latents + self.dropout(cross_attention_output) + + mlp_output = self.mlp(latents) + latents = latents + mlp_output + + # Self attention with layer norm + normalized_latents_self = self.layer_norm_self(latents) + self_attention_output = self.self_attention( + query=normalized_latents_self, key=normalized_latents_self, value=normalized_latents_self + ) + latents = latents + self_attention_output + + self_mlp_output = self.self_mlp(latents) + latents = latents + self_mlp_output + + return latents + + +class EdgeTamVideoPerceiverResampler(nn.Module): + def __init__(self, config: EdgeTamVideoConfig): + super().__init__() + self.config = config + 
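+        # The perceiver resampler compresses the memory encoder's spatial feature map into a small,
+        # fixed set of learned latent tokens: a global 1D set (`perceiver_resampler_num_latents`) that
+        # cross-attends over the full flattened feature map, and a windowed 2D set
+        # (`perceiver_resampler_num_latents_2d`) that keeps a coarse spatial layout
+        # (see `_forward_1d` and `_forward_2d` below).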
self.hidden_size = config.perceiver_resampler_hidden_size + self.num_latents_1d = config.perceiver_resampler_num_latents + self.num_latents_2d = config.perceiver_resampler_num_latents_2d + self.num_layers = config.perceiver_resampler_num_layers + + if self.num_latents_1d > 0: + self.latents_1d = nn.Parameter(torch.randn(self.num_latents_1d, self.hidden_size)) + if self.num_latents_2d > 0: + self.latents_2d = nn.Parameter(torch.randn(self.num_latents_2d, self.hidden_size)) + + self.positional_encoding = EdgeTamVideoPositionEmbeddingSine( + num_pos_feats=self.hidden_size // 2, normalize=True + ) + + self.layers = nn.ModuleList([EdgeTamVideoPerceiverEncoderLayer(config) for _ in range(self.num_layers)]) + + self.layer_norm = nn.LayerNorm(self.hidden_size) + + def forward( + self, + hidden_states: torch.Tensor, + positional_encoding: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + output_latents = [] + output_positional_encodings = [] + + if self.num_latents_1d > 0: + latents_1d, pos_1d = self._forward_1d(hidden_states, positional_encoding) + output_latents.append(latents_1d) + output_positional_encodings.append(pos_1d) + + if self.num_latents_2d > 0: + latents_2d, pos_2d = self._forward_2d(hidden_states) + output_latents.append(latents_2d) + output_positional_encodings.append(pos_2d) + + combined_latents = torch.cat(output_latents, dim=1) + + combined_positional_encoding = None + if positional_encoding is not None and output_positional_encodings: + combined_positional_encoding = torch.cat(output_positional_encodings, dim=1) + + return combined_latents, combined_positional_encoding + + def _forward_1d( + self, + hidden_states: torch.Tensor, + positional_encoding: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + batch_size = hidden_states.shape[0] + + latents = self.latents_1d.unsqueeze(0).expand(batch_size, -1, -1) + flattened_features = hidden_states.permute(0, 2, 3, 1).flatten(1, 2) + + positional_features = None + if positional_encoding is not None: + positional_features = positional_encoding.permute(0, 2, 3, 1).flatten(1, 2) + + for layer in self.layers: + latents = layer(latents, flattened_features, positional_features) + + latents = self.layer_norm(latents) + + output_positional_encoding = None + if positional_encoding is not None: + output_positional_encoding = torch.zeros_like(latents) + + return latents, output_positional_encoding + + def _forward_2d(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + batch_size, channels, height, width = hidden_states.shape + + latents_2d = self.latents_2d.unsqueeze(0).expand(batch_size, -1, -1).view(-1, 1, channels) + + num_windows_per_dim = int(math.sqrt(self.num_latents_2d)) + window_size = height // num_windows_per_dim + + windowed_input = hidden_states.permute(0, 2, 3, 1) + windowed_features, _ = window_partition(windowed_input, window_size) + windowed_features = windowed_features.flatten(1, 2) + + for layer in self.layers: + latents_2d = layer(latents_2d, windowed_features, positional_encoding=None) + + latents_2d = latents_2d.view(batch_size, num_windows_per_dim, num_windows_per_dim, channels).permute( + 0, 3, 1, 2 + ) + + positional_encoding_2d = self.positional_encoding(latents_2d.shape, latents_2d.device, latents_2d.dtype).to( + dtype=hidden_states.dtype + ) + positional_encoding_2d = positional_encoding_2d.permute(0, 2, 3, 1).flatten(1, 2) + + latents_2d = latents_2d.permute(0, 2, 3, 1).flatten(1, 2) + latents_2d = self.layer_norm(latents_2d) + 
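+        # latents_2d is now (batch_size, num_latents_2d, hidden_size): one latent per spatial window,
+        # flattened in row-major order, with a sine positional encoding of the same shape.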
+ return latents_2d, positional_encoding_2d + + +@auto_docstring +class EdgeTamVideoModel(Sam2VideoModel): + _tied_weights_keys = ["prompt_encoder.shared_embedding.positional_embedding"] + # need to be ignored, as it's a buffer and will not be correctly detected as tied weight + _keys_to_ignore_on_load_missing = ["prompt_encoder.shared_embedding.positional_embedding"] + _keys_to_ignore_on_load_unexpected = [] + _can_record_outputs = {"mask_decoder_attentions": OutputRecorder(EdgeTamVideoTwoWayAttentionBlock, index=2)} + + def __init__(self, config: EdgeTamVideoConfig): + super().__init__(config) + self.spatial_perceiver = EdgeTamVideoPerceiverResampler(config) + + self.post_init() + + def _build_memory_attention_inputs( + self, + temporal_positions_and_previous_outputs: list[tuple[int, dict]], + device: torch.device, + ) -> tuple[list[torch.Tensor], list[torch.Tensor]]: + """ + Concatenate memory features and positional embeddings from previous frames. + + Returns: + Tuple of (memories_to_concatenate, memory_positional_embeddings_to_concatenate). + """ + memories_to_concatenate = [] + memory_positional_embeddings_to_concatenate = [] + + for relative_temporal_offset, prev_output_data in temporal_positions_and_previous_outputs: + if prev_output_data is None: + continue # Skip if no output data for this temporal position (e.g., padding frames) + + # Load memory features (potentially from CPU to GPU) + # Features are flattened: (Batch, Channels, H, W) -> (H*W, Batch, Channels) + memory_features = prev_output_data["maskmem_features"].to(device, non_blocking=True) + memories_to_concatenate.append(memory_features.permute(1, 0, 2)) + + # Spatial positional encoding (potentially from CPU to GPU) + spatial_memory_pos_embed = prev_output_data["maskmem_pos_enc"].to(device, non_blocking=True) + spatial_memory_pos_embed = spatial_memory_pos_embed.squeeze(1).permute(1, 0, 2) + + # Add temporal positional encoding + # self.memory_temporal_positional_encoding shape: (NumMaskMem, 1, 1, MemDim) + combined_memory_pos_embed = ( + spatial_memory_pos_embed + self.memory_temporal_positional_encoding[relative_temporal_offset - 1] + ) + memory_positional_embeddings_to_concatenate.append(combined_memory_pos_embed) + + return memories_to_concatenate, memory_positional_embeddings_to_concatenate + + def _prepare_memory_conditioned_features( + self, + inference_session: EdgeTamVideoInferenceSession, + frame_idx: int, + obj_idx: int, + is_initial_conditioning_frame: bool, + current_vision_features: list[torch.Tensor], + current_vision_positional_embeddings: list[torch.Tensor], + num_total_frames: int, + track_in_reverse_time: bool = False, + streaming: bool = False, + ) -> torch.Tensor: + """ + Fuse current frame's visual features with memory from previous frames for enhanced object tracking. + + This method conditions the current frame's visual features on temporal memory from previous frames, + enabling consistent object tracking across video sequences. For initial conditioning frames, it uses + no-memory embeddings. For subsequent frames, it retrieves and integrates memory features from both + conditioning frames (user interactions) and non-conditioning frames (tracked results) via cross-attention. + + Args: + inference_session (`EdgeTamVideoInferenceSession`): + The video inference session object. + frame_idx (`int`): + Index of the current frame being processed. + obj_idx (`int`): + Index of the object being processed. 
+ is_initial_conditioning_frame (`bool`): + Whether this is an initial conditioning frame with user inputs (True) or a subsequent + tracking frame (False). + current_vision_features (`torch.Tensor`): + Highest-level vision features of shape `(seq_len, batch_size, channels)`. + current_vision_positional_embeddings (`torch.Tensor`): + Positional embedding tensors corresponding to the highest-level vision features. + num_total_frames (`int`): + Total number of frames in the video sequence. + track_in_reverse_time (`bool`, *optional*, defaults to `False`): + Whether tracking is performed in reverse temporal order. + streaming (`bool`, *optional*, defaults to `False`): + Whether this is streaming inference mode. + + Returns: + `torch.Tensor`: Memory-conditioned feature tensor of shape `(batch_size, channels, height, width)` + suitable for input to the SAM decoder. + """ + # Get dimensions from the highest-level (lowest-resolution) feature map + batch_size = current_vision_features.size(1) + num_channels = self.hidden_dim + height, width = self.backbone_feature_sizes[-1] + device = current_vision_features.device + + # If memory is disabled (e.g., for single image SAM), return current features directly. + if self.num_maskmem == 0: + # Permute (SeqLen, Batch, Channels) -> (Batch, Channels, SeqLen) then view as (Batch, Channels, Height, Width) + # Assuming SeqLen = Height * Width for the last feature map + current_feature_map = current_vision_features.permute(1, 2, 0).view( + batch_size, num_channels, height, width + ) + return current_feature_map + + # Step 1: Handle initial conditioning frames + if is_initial_conditioning_frame: + # For initial conditioning frames, no prior memory is used directly in this block. + # If configured, directly add a learnable "no memory" embedding. 
+ # current_vision_features has shape (SeqLen, Batch, Channels) + conditioned_feature_map_flat = current_vision_features + self.no_memory_embedding + # Reshape to (Batch, Channels, Height, Width) + conditioned_feature_map = conditioned_feature_map_flat.permute(1, 2, 0).view( + batch_size, num_channels, height, width + ) + return conditioned_feature_map + + # Step 2: Get memory frames and concatenate their features + temporal_positions_and_previous_outputs = self._gather_memory_frame_outputs( + inference_session, obj_idx, frame_idx, track_in_reverse_time + ) + + memories_to_concatenate, memory_positional_embeddings_to_concatenate = self._build_memory_attention_inputs( + temporal_positions_and_previous_outputs, device + ) + num_spatial_memory_tokens = len(memories_to_concatenate) + + # Step 3: Get and process object pointers + temporal_offsets, pointer_tokens, max_object_pointers_to_use = self._get_object_pointers( + inference_session, obj_idx, frame_idx, num_total_frames, device, track_in_reverse_time, streaming + ) + + num_object_pointer_tokens = 0 + if pointer_tokens: + object_pointers, object_pointers_pos_embed = self._process_object_pointers( + temporal_offsets, pointer_tokens, max_object_pointers_to_use, batch_size, num_channels, device + ) + + if object_pointers is not None: + memories_to_concatenate.append(object_pointers) + memory_positional_embeddings_to_concatenate.append(object_pointers_pos_embed) + num_object_pointer_tokens = object_pointers.shape[0] + + # Step 4: Concatenate all retrieved memories and their positional embeddings + combined_memory = torch.cat(memories_to_concatenate, dim=0) + combined_memory_positional_embeddings = torch.cat(memory_positional_embeddings_to_concatenate, dim=0) + + # Step 5: Forward through the memory attention mechanism + conditioned_feature_map_flat = self.memory_attention( + current_vision_features=current_vision_features, + current_vision_position_embeddings=current_vision_positional_embeddings, + memory=combined_memory, + memory_posision_embeddings=combined_memory_positional_embeddings, # Corrected typo from API + num_object_pointer_tokens=num_object_pointer_tokens, + num_spatial_memory_tokens=num_spatial_memory_tokens, + ) + + # Reshape from (Batch, H*W, Channels) to (Batch, Channels, Height, Width) + conditioned_feature_map = ( + conditioned_feature_map_flat.squeeze(1).permute(0, 2, 1).view(batch_size, num_channels, height, width) + ) + return conditioned_feature_map + + def _encode_new_memory( + self, + current_vision_feats: torch.Tensor, + pred_masks_high_res: torch.Tensor, + object_score_logits: torch.Tensor, + is_mask_from_pts: bool, + ) -> tuple[torch.Tensor, list[torch.Tensor]]: + """Encode the current image and its prediction into a memory feature.""" + batch_size = current_vision_feats.size(1) # batch size on this frame + channels = self.hidden_dim + height, width = self.backbone_feature_sizes[-1] # top-level (lowest-resolution) feature size + # top-level feature, (HW)BC => BCHW + pix_feat = current_vision_feats.permute(1, 2, 0).view(batch_size, channels, height, width) + if is_mask_from_pts and not self.training: + # binarize the mask logits + mask_for_mem = (pred_masks_high_res > 0).to(pred_masks_high_res.dtype) + else: + # apply sigmoid on the raw mask logits to turn them into range (0, 1) + mask_for_mem = torch.sigmoid(pred_masks_high_res) + # apply scale and bias terms to the sigmoid probabilities + mask_for_mem = mask_for_mem * self.config.sigmoid_scale_for_mem_enc + mask_for_mem = mask_for_mem + 
self.config.sigmoid_bias_for_mem_enc + + maskmem_features, maskmem_pos_enc = self.memory_encoder( + pix_feat, + mask_for_mem, + ) + # add a no-object embedding to the spatial memory to indicate that the frame + # is predicted to be occluded (i.e. no object is appearing in the frame) + if self.occlusion_spatial_embedding_parameter is not None: + is_obj_appearing = (object_score_logits > 0).float() + maskmem_features += (1 - is_obj_appearing[..., None]) * self.occlusion_spatial_embedding_parameter[ + ..., None, None + ].expand(*maskmem_features.shape) + + maskmem_pos_enc = maskmem_pos_enc.to(pred_masks_high_res.dtype) + maskmem_features, maskmem_pos_enc = self.spatial_perceiver(maskmem_features, maskmem_pos_enc) + maskmem_features = maskmem_features.to(pred_masks_high_res.dtype) + maskmem_pos_enc = maskmem_pos_enc.to(pred_masks_high_res.dtype) + + return maskmem_features, maskmem_pos_enc + + +__all__ = [ + "EdgeTamVideoMaskDecoderConfig", + "EdgeTamVideoPromptEncoderConfig", + "EdgeTamVideoConfig", + "EdgeTamVideoModel", + "EdgeTamVideoInferenceSession", + "EdgeTamVideoPreTrainedModel", +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/efficientloftr/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/efficientloftr/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..53e51c91c17b4ff51c672f369e43deb35faa0fea --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/efficientloftr/__init__.py @@ -0,0 +1,29 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_efficientloftr import * + from .image_processing_efficientloftr import * + from .image_processing_efficientloftr_fast import * + from .modeling_efficientloftr import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/efficientloftr/configuration_efficientloftr.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/efficientloftr/configuration_efficientloftr.py new file mode 100644 index 0000000000000000000000000000000000000000..d2dff4de3745aa322357252ffa46b9dbfd13a0d8 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/efficientloftr/configuration_efficientloftr.py @@ -0,0 +1,199 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Optional + +from ...configuration_utils import PretrainedConfig +from ...modeling_rope_utils import rope_config_validation + + +class EfficientLoFTRConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`EfficientLoFTRFromKeypointMatching`]. + It is used to instantiate a EfficientLoFTR model according to the specified arguments, defining the model + architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the + EfficientLoFTR [zju-community/efficientloftr](https://huggingface.co/zju-community/efficientloftr) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + stage_num_blocks (`List`, *optional*, defaults to [1, 2, 4, 14]): + The number of blocks in each stages + out_features (`List`, *optional*, defaults to [64, 64, 128, 256]): + The number of channels in each stage + stage_stride (`List`, *optional*, defaults to [2, 1, 2, 2]): + The stride used in each stage + hidden_size (`int`, *optional*, defaults to 256): + The dimension of the descriptors. + activation_function (`str`, *optional*, defaults to `"relu"`): + The activation function used in the backbone + q_aggregation_kernel_size (`int`, *optional*, defaults to 4): + The kernel size of the aggregation of query states in the fusion network + kv_aggregation_kernel_size (`int`, *optional*, defaults to 4): + The kernel size of the aggregation of key and value states in the fusion network + q_aggregation_stride (`int`, *optional*, defaults to 4): + The stride of the aggregation of query states in the fusion network + kv_aggregation_stride (`int`, *optional*, defaults to 4): + The stride of the aggregation of key and value states in the fusion network + num_attention_layers (`int`, *optional*, defaults to 4): + Number of attention layers in the LocalFeatureTransformer + num_attention_heads (`int`, *optional*, defaults to 8): + The number of heads in the GNN layers. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + attention_bias (`bool`, *optional*, defaults to `False`): + Whether to use a bias in the query, key, value and output projection layers during attention. + mlp_activation_function (`str`, *optional*, defaults to `"leaky_relu"`): + Activation function used in the attention mlp layer. + coarse_matching_skip_softmax (`bool`, *optional*, defaults to `False`): + Whether to skip softmax or not at the coarse matching step. + coarse_matching_threshold (`float`, *optional*, defaults to 0.2): + The threshold for the minimum score required for a match. 
+ coarse_matching_temperature (`float`, *optional*, defaults to 0.1): + The temperature to apply to the coarse similarity matrix + coarse_matching_border_removal (`int`, *optional*, defaults to 2): + The size of the border to remove during coarse matching + fine_kernel_size (`int`, *optional*, defaults to 8): + Kernel size used for the fine feature matching + batch_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the batch normalization layers. + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + partial_rotary_factor (`float`, *optional*, defaults to 4.0): + Dim factor for the RoPE embeddings, in EfficientLoFTR, frequencies should be generated for + the whole hidden_size, so this factor is used to compensate. + rope_scaling (`Dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type + and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value + accordingly. + Expected contents: + `rope_type` (`str`): + The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', + 'llama3', '2d'], with 'default' being the original RoPE implementation. + `dim` (`int`): The dimension of the RoPE embeddings. + fine_matching_slice_dim (`int`, *optional*, defaults to 8): + The size of the slice used to divide the fine features for the first and second fine matching stages. + fine_matching_regress_temperature (`float`, *optional*, defaults to 10.0): + The temperature to apply to the fine similarity matrix + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 
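+
+    Several derived attributes (`stage_in_channels`, `stage_block_stride`, `stage_block_out_channels`,
+    `stage_block_in_channels`, `fine_fusion_dims` and `intermediate_size`) are computed in `__init__`
+    from the arguments above rather than passed directly; see the constructor below.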
+ + Examples: + ```python + >>> from transformers import EfficientLoFTRConfig, EfficientLoFTRForKeypointMatching + + >>> # Initializing a EfficientLoFTR configuration + >>> configuration = EfficientLoFTRConfig() + + >>> # Initializing a model from the EfficientLoFTR configuration + >>> model = EfficientLoFTRForKeypointMatching(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ``` + """ + + model_type = "efficientloftr" + + def __init__( + self, + stage_num_blocks: Optional[list[int]] = None, + out_features: Optional[list[int]] = None, + stage_stride: Optional[list[int]] = None, + hidden_size: int = 256, + activation_function: str = "relu", + q_aggregation_kernel_size: int = 4, + kv_aggregation_kernel_size: int = 4, + q_aggregation_stride: int = 4, + kv_aggregation_stride: int = 4, + num_attention_layers: int = 4, + num_attention_heads: int = 8, + attention_dropout: float = 0.0, + attention_bias: bool = False, + mlp_activation_function: str = "leaky_relu", + coarse_matching_skip_softmax: bool = False, + coarse_matching_threshold: float = 0.2, + coarse_matching_temperature: float = 0.1, + coarse_matching_border_removal: int = 2, + fine_kernel_size: int = 8, + batch_norm_eps: float = 1e-5, + rope_theta: float = 10000.0, + partial_rotary_factor: float = 4.0, + rope_scaling: Optional[dict] = None, + fine_matching_slice_dim: int = 8, + fine_matching_regress_temperature: float = 10.0, + initializer_range: float = 0.02, + **kwargs, + ): + # Stage level of RepVGG + self.stage_num_blocks = stage_num_blocks if stage_num_blocks is not None else [1, 2, 4, 14] + self.stage_stride = stage_stride if stage_stride is not None else [2, 1, 2, 2] + self.out_features = out_features if out_features is not None else [64, 64, 128, 256] + self.stage_in_channels = [1] + self.out_features[:-1] + + # Block level of RepVGG + self.stage_block_stride = [ + [stride] + [1] * (num_blocks - 1) for stride, num_blocks in zip(self.stage_stride, self.stage_num_blocks) + ] + self.stage_block_out_channels = [ + [self.out_features[stage_idx]] * num_blocks for stage_idx, num_blocks in enumerate(self.stage_num_blocks) + ] + self.stage_block_in_channels = [ + [self.stage_in_channels[stage_idx]] + self.stage_block_out_channels[stage_idx][:-1] + for stage_idx in range(len(self.stage_num_blocks)) + ] + + # Fine matching level of EfficientLoFTR + self.fine_fusion_dims = list(reversed(self.out_features))[:-1] + + self.hidden_size = hidden_size + if self.hidden_size != self.out_features[-1]: + raise ValueError( + f"hidden_size should be equal to the last value in out_features. 
hidden_size = {self.hidden_size}, out_features = {self.out_features[-1]}" + ) + + self.activation_function = activation_function + self.q_aggregation_kernel_size = q_aggregation_kernel_size + self.kv_aggregation_kernel_size = kv_aggregation_kernel_size + self.q_aggregation_stride = q_aggregation_stride + self.kv_aggregation_stride = kv_aggregation_stride + self.num_attention_layers = num_attention_layers + self.num_attention_heads = num_attention_heads + self.attention_dropout = attention_dropout + self.attention_bias = attention_bias + self.intermediate_size = self.hidden_size * 2 + self.mlp_activation_function = mlp_activation_function + self.coarse_matching_skip_softmax = coarse_matching_skip_softmax + self.coarse_matching_threshold = coarse_matching_threshold + self.coarse_matching_temperature = coarse_matching_temperature + self.coarse_matching_border_removal = coarse_matching_border_removal + self.fine_kernel_size = fine_kernel_size + self.batch_norm_eps = batch_norm_eps + self.fine_matching_slice_dim = fine_matching_slice_dim + self.fine_matching_regress_temperature = fine_matching_regress_temperature + + self.num_key_value_heads = num_attention_heads + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling if rope_scaling is not None else {"rope_type": "default"} + + # for compatibility with "default" rope type + self.partial_rotary_factor = partial_rotary_factor + rope_config_validation(self) + + self.initializer_range = initializer_range + + super().__init__(**kwargs) + + +__all__ = ["EfficientLoFTRConfig"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/efficientloftr/image_processing_efficientloftr.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/efficientloftr/image_processing_efficientloftr.py new file mode 100644 index 0000000000000000000000000000000000000000..58ce0e96f5b8664720505d89ddd1f15ace7e5e21 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/efficientloftr/image_processing_efficientloftr.py @@ -0,0 +1,461 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Image processor class for SuperPoint.""" + +from typing import Optional, Union + +import numpy as np + +from ... 
import is_torch_available, is_vision_available +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ...image_transforms import resize, to_channel_dimension_format +from ...image_utils import ( + ChannelDimension, + ImageInput, + ImageType, + PILImageResampling, + get_image_type, + infer_channel_dimension_format, + is_pil_image, + is_scaled_image, + is_valid_image, + to_numpy_array, + valid_images, + validate_preprocess_arguments, +) +from ...utils import TensorType, logging, requires_backends + + +if is_torch_available(): + import torch + +if is_vision_available(): + import PIL + from PIL import Image, ImageDraw + + from .modeling_efficientloftr import KeypointMatchingOutput + +logger = logging.get_logger(__name__) + + +# Copied from transformers.models.superpoint.image_processing_superpoint.is_grayscale +def is_grayscale( + image: np.ndarray, + input_data_format: Optional[Union[str, ChannelDimension]] = None, +): + if input_data_format == ChannelDimension.FIRST: + if image.shape[0] == 1: + return True + return np.all(image[0, ...] == image[1, ...]) and np.all(image[1, ...] == image[2, ...]) + elif input_data_format == ChannelDimension.LAST: + if image.shape[-1] == 1: + return True + return np.all(image[..., 0] == image[..., 1]) and np.all(image[..., 1] == image[..., 2]) + + +# Copied from transformers.models.superpoint.image_processing_superpoint.convert_to_grayscale +def convert_to_grayscale( + image: ImageInput, + input_data_format: Optional[Union[str, ChannelDimension]] = None, +) -> ImageInput: + """ + Converts an image to grayscale format using the NTSC formula. Only support numpy and PIL Image. TODO support torch + and tensorflow grayscale conversion + + This function is supposed to return a 1-channel image, but it returns a 3-channel image with the same value in each + channel, because of an issue that is discussed in : + https://github.com/huggingface/transformers/pull/25786#issuecomment-1730176446 + + Args: + image (Image): + The image to convert. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. + """ + requires_backends(convert_to_grayscale, ["vision"]) + + if isinstance(image, np.ndarray): + if is_grayscale(image, input_data_format=input_data_format): + return image + if input_data_format == ChannelDimension.FIRST: + gray_image = image[0, ...] * 0.2989 + image[1, ...] * 0.5870 + image[2, ...] 
* 0.1140 + gray_image = np.stack([gray_image] * 3, axis=0) + elif input_data_format == ChannelDimension.LAST: + gray_image = image[..., 0] * 0.2989 + image[..., 1] * 0.5870 + image[..., 2] * 0.1140 + gray_image = np.stack([gray_image] * 3, axis=-1) + return gray_image + + if not isinstance(image, PIL.Image.Image): + return image + + image = image.convert("L") + return image + + +# Copied from transformers.models.superglue.image_processing_superglue.validate_and_format_image_pairs +def validate_and_format_image_pairs(images: ImageInput): + error_message = ( + "Input images must be a one of the following :", + " - A pair of PIL images.", + " - A pair of 3D arrays.", + " - A list of pairs of PIL images.", + " - A list of pairs of 3D arrays.", + ) + + def _is_valid_image(image): + """images is a PIL Image or a 3D array.""" + return is_pil_image(image) or ( + is_valid_image(image) and get_image_type(image) != ImageType.PIL and len(image.shape) == 3 + ) + + if isinstance(images, list): + if len(images) == 2 and all((_is_valid_image(image)) for image in images): + return images + if all( + isinstance(image_pair, list) + and len(image_pair) == 2 + and all(_is_valid_image(image) for image in image_pair) + for image_pair in images + ): + return [image for image_pair in images for image in image_pair] + raise ValueError(error_message) + + +class EfficientLoFTRImageProcessor(BaseImageProcessor): + r""" + Constructs a EfficientLoFTR image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden + by `do_resize` in the `preprocess` method. + size (`Dict[str, int]` *optional*, defaults to `{"height": 480, "width": 640}`): + Resolution of the output image after `resize` is applied. Only has an effect if `do_resize` is set to + `True`. Can be overridden by `size` in the `preprocess` method. + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`): + Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in + the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess` + method. + do_grayscale (`bool`, *optional*, defaults to `True`): + Whether to convert the image to grayscale. Can be overridden by `do_grayscale` in the `preprocess` method. 
+ """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: Optional[dict[str, int]] = None, + resample: PILImageResampling = PILImageResampling.BILINEAR, + do_rescale: bool = True, + rescale_factor: float = 1 / 255, + do_grayscale: bool = True, + **kwargs, + ) -> None: + super().__init__(**kwargs) + size = size if size is not None else {"height": 480, "width": 640} + size = get_size_dict(size, default_to_square=False) + + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_grayscale = do_grayscale + + # Copied from transformers.models.superpoint.image_processing_superpoint.SuperPointImageProcessor.resize + def resize( + self, + image: np.ndarray, + size: dict[str, int], + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ): + """ + Resize an image. + + Args: + image (`np.ndarray`): + Image to resize. + size (`dict[str, int]`): + Dictionary of the form `{"height": int, "width": int}`, specifying the size of the output image. + data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the output image. If not provided, it will be inferred from the input + image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + """ + size = get_size_dict(size, default_to_square=False) + + return resize( + image, + size=(size["height"], size["width"]), + data_format=data_format, + input_data_format=input_data_format, + **kwargs, + ) + + # Copied from transformers.models.superglue.image_processing_superglue.SuperGlueImageProcessor.preprocess + def preprocess( + self, + images, + do_resize: Optional[bool] = None, + size: Optional[dict[str, int]] = None, + resample: Optional[PILImageResampling] = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_grayscale: Optional[bool] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: ChannelDimension = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> BatchFeature: + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image pairs to preprocess. Expects either a list of 2 images or a list of list of 2 images list with + pixel values ranging from 0 to 255. If passing in images with pixel values between 0 and 1, set + `do_rescale=False`. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`dict[str, int]`, *optional*, defaults to `self.size`): + Size of the output image after `resize` has been applied. 
If `size["shortest_edge"]` >= 384, the image + is resized to `(size["shortest_edge"], size["shortest_edge"])`. Otherwise, the smaller edge of the + image will be matched to `int(size["shortest_edge"]/ crop_pct)`, after which the image is cropped to + `(size["shortest_edge"], size["shortest_edge"])`. Only has an effect if `do_resize` is set to `True`. + resample (`PILImageResampling`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. This can be one of `PILImageResampling`, filters. Only + has an effect if `do_resize` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image values between [0 - 1]. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_grayscale (`bool`, *optional*, defaults to `self.do_grayscale`): + Whether to convert the image to grayscale. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + """ + + do_resize = do_resize if do_resize is not None else self.do_resize + resample = resample if resample is not None else self.resample + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_grayscale = do_grayscale if do_grayscale is not None else self.do_grayscale + + size = size if size is not None else self.size + size = get_size_dict(size, default_to_square=False) + + # Validate and convert the input images into a flattened list of images for all subsequent processing steps. + images = validate_and_format_image_pairs(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + + validate_preprocess_arguments( + do_resize=do_resize, + size=size, + resample=resample, + do_rescale=do_rescale, + rescale_factor=rescale_factor, + ) + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if is_scaled_image(images[0]) and do_rescale: + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. 
If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. + input_data_format = infer_channel_dimension_format(images[0]) + + all_images = [] + for image in images: + if do_resize: + image = self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format) + + if do_rescale: + image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) + + if do_grayscale: + image = convert_to_grayscale(image, input_data_format=input_data_format) + + image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) + all_images.append(image) + + # Convert back the flattened list of images into a list of pairs of images. + image_pairs = [all_images[i : i + 2] for i in range(0, len(all_images), 2)] + + data = {"pixel_values": image_pairs} + + return BatchFeature(data=data, tensor_type=return_tensors) + + def post_process_keypoint_matching( + self, + outputs: "KeypointMatchingOutput", + target_sizes: Union[TensorType, list[tuple]], + threshold: float = 0.0, + ) -> list[dict[str, torch.Tensor]]: + """ + Converts the raw output of [`KeypointMatchingOutput`] into lists of keypoints, scores and descriptors + with coordinates absolute to the original image sizes. + Args: + outputs ([`KeypointMatchingOutput`]): + Raw outputs of the model. + target_sizes (`torch.Tensor` or `List[Tuple[Tuple[int, int]]]`, *optional*): + Tensor of shape `(batch_size, 2, 2)` or list of tuples of tuples (`Tuple[int, int]`) containing the + target size `(height, width)` of each image in the batch. This must be the original image size (before + any processing). + threshold (`float`, *optional*, defaults to 0.0): + Threshold to filter out the matches with low scores. + Returns: + `List[Dict]`: A list of dictionaries, each dictionary containing the keypoints in the first and second image + of the pair, the matching scores and the matching indices. 
+ """ + if outputs.matches.shape[0] != len(target_sizes): + raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the mask") + if not all(len(target_size) == 2 for target_size in target_sizes): + raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch") + + if isinstance(target_sizes, list): + image_pair_sizes = torch.tensor(target_sizes, device=outputs.matches.device) + else: + if target_sizes.shape[1] != 2 or target_sizes.shape[2] != 2: + raise ValueError( + "Each element of target_sizes must contain the size (h, w) of each image of the batch" + ) + image_pair_sizes = target_sizes + + keypoints = outputs.keypoints.clone() + keypoints = keypoints * image_pair_sizes.flip(-1).reshape(-1, 2, 1, 2) + keypoints = keypoints.to(torch.int32) + + results = [] + for keypoints_pair, matches, scores in zip(keypoints, outputs.matches, outputs.matching_scores): + # Filter out matches with low scores + valid_matches = torch.logical_and(scores > threshold, matches > -1) + + matched_keypoints0 = keypoints_pair[0][valid_matches[0]] + matched_keypoints1 = keypoints_pair[1][valid_matches[1]] + matching_scores = scores[0][valid_matches[0]] + + results.append( + { + "keypoints0": matched_keypoints0, + "keypoints1": matched_keypoints1, + "matching_scores": matching_scores, + } + ) + + return results + + def visualize_keypoint_matching( + self, + images: ImageInput, + keypoint_matching_output: list[dict[str, torch.Tensor]], + ) -> list["Image.Image"]: + """ + Plots the image pairs side by side with the detected keypoints as well as the matching between them. + + Args: + images (`ImageInput`): + Image pairs to plot. Same as `EfficientLoFTRImageProcessor.preprocess`. Expects either a list of 2 + images or a list of list of 2 images list with pixel values ranging from 0 to 255. + keypoint_matching_output (List[Dict[str, torch.Tensor]]]): + A post processed keypoint matching output + + Returns: + `List[PIL.Image.Image]`: A list of PIL images, each containing the image pairs side by side with the detected + keypoints as well as the matching between them. 
+ """ + images = validate_and_format_image_pairs(images) + images = [to_numpy_array(image) for image in images] + image_pairs = [images[i : i + 2] for i in range(0, len(images), 2)] + + results = [] + for image_pair, pair_output in zip(image_pairs, keypoint_matching_output): + height0, width0 = image_pair[0].shape[:2] + height1, width1 = image_pair[1].shape[:2] + plot_image = np.zeros((max(height0, height1), width0 + width1, 3), dtype=np.uint8) + plot_image[:height0, :width0] = image_pair[0] + plot_image[:height1, width0:] = image_pair[1] + + plot_image_pil = Image.fromarray(plot_image) + draw = ImageDraw.Draw(plot_image_pil) + + keypoints0_x, keypoints0_y = pair_output["keypoints0"].unbind(1) + keypoints1_x, keypoints1_y = pair_output["keypoints1"].unbind(1) + for keypoint0_x, keypoint0_y, keypoint1_x, keypoint1_y, matching_score in zip( + keypoints0_x, keypoints0_y, keypoints1_x, keypoints1_y, pair_output["matching_scores"] + ): + color = self._get_color(matching_score) + draw.line( + (keypoint0_x, keypoint0_y, keypoint1_x + width0, keypoint1_y), + fill=color, + width=3, + ) + draw.ellipse((keypoint0_x - 2, keypoint0_y - 2, keypoint0_x + 2, keypoint0_y + 2), fill="black") + draw.ellipse( + (keypoint1_x + width0 - 2, keypoint1_y - 2, keypoint1_x + width0 + 2, keypoint1_y + 2), + fill="black", + ) + + results.append(plot_image_pil) + return results + + def _get_color(self, score): + """Maps a score to a color.""" + r = int(255 * (1 - score)) + g = int(255 * score) + b = 0 + return (r, g, b) + + +__all__ = ["EfficientLoFTRImageProcessor"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/efficientloftr/image_processing_efficientloftr_fast.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/efficientloftr/image_processing_efficientloftr_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..1463ef405f37db1fad1f7d5435494197f5bef9f9 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/efficientloftr/image_processing_efficientloftr_fast.py @@ -0,0 +1,314 @@ +# coding=utf-8 +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Fast Image processor class for EfficientLoFTR.""" + +from typing import TYPE_CHECKING, Optional, Union + +import torch +from PIL import Image, ImageDraw + +from ...image_processing_utils import BatchFeature +from ...image_processing_utils_fast import ( + BaseImageProcessorFast, + DefaultFastImageProcessorKwargs, + group_images_by_shape, + reorder_images, +) +from ...image_utils import ( + ImageInput, + ImageType, + PILImageResampling, + SizeDict, + get_image_type, + is_pil_image, + is_valid_image, +) +from ...processing_utils import Unpack +from ...utils import ( + TensorType, + auto_docstring, +) + + +if TYPE_CHECKING: + from .modeling_efficientloftr import KeypointMatchingOutput + +import torchvision.transforms.v2.functional as F + + +def _is_valid_image(image): + return is_pil_image(image) or ( + is_valid_image(image) and get_image_type(image) != ImageType.PIL and len(image.shape) == 3 + ) + + +def flatten_pair_images(images): + # Handle the pair validation and flattening similar to slow processor + if isinstance(images, list): + if len(images) == 2 and all((_is_valid_image(image) or isinstance(image, torch.Tensor)) for image in images): + # Single pair of images - keep as is, they'll be processed by the base class + return images + elif all( + isinstance(image_pair, list) + and len(image_pair) == 2 + and all(_is_valid_image(image) or isinstance(image, torch.Tensor) for image in image_pair) + for image_pair in images + ): + # Multiple pairs - flatten them + images = [image for image_pair in images for image in image_pair] + return images + raise ValueError( + "Input images must be a one of the following :", + " - A pair of PIL images.", + " - A pair of 3D arrays.", + " - A list of pairs of PIL images.", + " - A list of pairs of 3D arrays.", + ) + + +def is_grayscale( + image: "torch.Tensor", +): + """Checks if an image is grayscale (all RGB channels are identical).""" + if image.ndim < 3 or image.shape[0 if image.ndim == 3 else 1] == 1: + return True + return torch.all(image[..., 0, :, :] == image[..., 1, :, :]) and torch.all( + image[..., 1, :, :] == image[..., 2, :, :] + ) + + +def convert_to_grayscale( + image: "torch.Tensor", +) -> "torch.Tensor": + """ + Converts an image to grayscale format using the NTSC formula. Only support torch.Tensor. + + This function is supposed to return a 1-channel image, but it returns a 3-channel image with the same value in each + channel, because of an issue that is discussed in : + https://github.com/huggingface/transformers/pull/25786#issuecomment-1730176446 + + Args: + image (torch.Tensor): + The image to convert. + """ + if is_grayscale(image): + return image + return F.rgb_to_grayscale(image, num_output_channels=3) + + +class EfficientLoFTRFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): + r""" + do_grayscale (`bool`, *optional*, defaults to `True`): + Whether to convert the image to grayscale. Can be overridden by `do_grayscale` in the `preprocess` method. 
+ """ + + do_grayscale: Optional[bool] = True + + +@auto_docstring +class EfficientLoFTRImageProcessorFast(BaseImageProcessorFast): + resample = PILImageResampling.BILINEAR + size = {"height": 480, "width": 640} + default_to_square = False + do_resize = True + do_rescale = True + rescale_factor = 1 / 255 + do_normalize = None + valid_kwargs = EfficientLoFTRFastImageProcessorKwargs + + def __init__(self, **kwargs: Unpack[EfficientLoFTRFastImageProcessorKwargs]): + super().__init__(**kwargs) + + @auto_docstring + def preprocess(self, images: ImageInput, **kwargs: Unpack[EfficientLoFTRFastImageProcessorKwargs]) -> BatchFeature: + return super().preprocess(images, **kwargs) + + def _prepare_images_structure( + self, + images: ImageInput, + **kwargs, + ) -> ImageInput: + # we need to handle image pairs validation and flattening + return flatten_pair_images(images) + + def _preprocess( + self, + images: list["torch.Tensor"], + size: Union[dict[str, int], SizeDict], + rescale_factor: float, + do_rescale: bool, + do_resize: bool, + interpolation: Optional["F.InterpolationMode"], + do_grayscale: bool, + disable_grouping: bool, + return_tensors: Union[str, TensorType], + **kwargs, + ) -> BatchFeature: + grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping) + processed_images_grouped = {} + + for shape, stacked_images in grouped_images.items(): + if do_resize: + stacked_images = self.resize(stacked_images, size=size, interpolation=interpolation) + processed_images_grouped[shape] = stacked_images + resized_images = reorder_images(processed_images_grouped, grouped_images_index) + + grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping) + processed_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + if do_rescale: + stacked_images = self.rescale(stacked_images, rescale_factor) + if do_grayscale: + stacked_images = convert_to_grayscale(stacked_images) + processed_images_grouped[shape] = stacked_images + + processed_images = reorder_images(processed_images_grouped, grouped_images_index) + + # Convert back to pairs format + image_pairs = [processed_images[i : i + 2] for i in range(0, len(processed_images), 2)] + + # Stack each pair into a single tensor to match slow processor format + stacked_pairs = [torch.stack(pair, dim=0) for pair in image_pairs] + + # Return in same format as slow processor + image_pairs = torch.stack(stacked_pairs, dim=0) if return_tensors else stacked_pairs + + return BatchFeature(data={"pixel_values": image_pairs}) + + def post_process_keypoint_matching( + self, + outputs: "KeypointMatchingOutput", + target_sizes: Union[TensorType, list[tuple]], + threshold: float = 0.0, + ) -> list[dict[str, torch.Tensor]]: + """ + Converts the raw output of [`KeypointMatchingOutput`] into lists of keypoints, scores and descriptors + with coordinates absolute to the original image sizes. + Args: + outputs ([`KeypointMatchingOutput`]): + Raw outputs of the model. + target_sizes (`torch.Tensor` or `List[Tuple[Tuple[int, int]]]`, *optional*): + Tensor of shape `(batch_size, 2, 2)` or list of tuples of tuples (`Tuple[int, int]`) containing the + target size `(height, width)` of each image in the batch. This must be the original image size (before + any processing). + threshold (`float`, *optional*, defaults to 0.0): + Threshold to filter out the matches with low scores. 
+ Returns: + `List[Dict]`: A list of dictionaries, each dictionary containing the keypoints in the first and second image + of the pair, the matching scores and the matching indices. + """ + if outputs.matches.shape[0] != len(target_sizes): + raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the mask") + if not all(len(target_size) == 2 for target_size in target_sizes): + raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch") + + if isinstance(target_sizes, list): + image_pair_sizes = torch.tensor(target_sizes, device=outputs.matches.device) + else: + if target_sizes.shape[1] != 2 or target_sizes.shape[2] != 2: + raise ValueError( + "Each element of target_sizes must contain the size (h, w) of each image of the batch" + ) + image_pair_sizes = target_sizes + + keypoints = outputs.keypoints.clone() + keypoints = keypoints * image_pair_sizes.flip(-1).reshape(-1, 2, 1, 2) + keypoints = keypoints.to(torch.int32) + + results = [] + for keypoints_pair, matches, scores in zip(keypoints, outputs.matches, outputs.matching_scores): + # Filter out matches with low scores + valid_matches = torch.logical_and(scores > threshold, matches > -1) + + matched_keypoints0 = keypoints_pair[0][valid_matches[0]] + matched_keypoints1 = keypoints_pair[1][valid_matches[1]] + matching_scores = scores[0][valid_matches[0]] + + results.append( + { + "keypoints0": matched_keypoints0, + "keypoints1": matched_keypoints1, + "matching_scores": matching_scores, + } + ) + + return results + + def visualize_keypoint_matching( + self, + images, + keypoint_matching_output: list[dict[str, torch.Tensor]], + ) -> list["Image.Image"]: + """ + Plots the image pairs side by side with the detected keypoints as well as the matching between them. + + Args: + images: + Image pairs to plot. Same as `EfficientLoFTRImageProcessor.preprocess`. Expects either a list of 2 + images or a list of list of 2 images list with pixel values ranging from 0 to 255. + keypoint_matching_output (List[Dict[str, torch.Tensor]]]): + A post processed keypoint matching output + + Returns: + `List[PIL.Image.Image]`: A list of PIL images, each containing the image pairs side by side with the detected + keypoints as well as the matching between them. 
+ """ + from ...image_utils import to_numpy_array + from .image_processing_efficientloftr import validate_and_format_image_pairs + + images = validate_and_format_image_pairs(images) + images = [to_numpy_array(image) for image in images] + image_pairs = [images[i : i + 2] for i in range(0, len(images), 2)] + + results = [] + for image_pair, pair_output in zip(image_pairs, keypoint_matching_output): + height0, width0 = image_pair[0].shape[:2] + height1, width1 = image_pair[1].shape[:2] + plot_image = torch.zeros((max(height0, height1), width0 + width1, 3), dtype=torch.uint8) + plot_image[:height0, :width0] = torch.from_numpy(image_pair[0]) + plot_image[:height1, width0:] = torch.from_numpy(image_pair[1]) + + plot_image_pil = Image.fromarray(plot_image.numpy()) + draw = ImageDraw.Draw(plot_image_pil) + + keypoints0_x, keypoints0_y = pair_output["keypoints0"].unbind(1) + keypoints1_x, keypoints1_y = pair_output["keypoints1"].unbind(1) + for keypoint0_x, keypoint0_y, keypoint1_x, keypoint1_y, matching_score in zip( + keypoints0_x, keypoints0_y, keypoints1_x, keypoints1_y, pair_output["matching_scores"] + ): + color = self._get_color(matching_score) + draw.line( + (keypoint0_x, keypoint0_y, keypoint1_x + width0, keypoint1_y), + fill=color, + width=3, + ) + draw.ellipse((keypoint0_x - 2, keypoint0_y - 2, keypoint0_x + 2, keypoint0_y + 2), fill="black") + draw.ellipse( + (keypoint1_x + width0 - 2, keypoint1_y - 2, keypoint1_x + width0 + 2, keypoint1_y + 2), + fill="black", + ) + + results.append(plot_image_pil) + return results + + def _get_color(self, score): + """Maps a score to a color.""" + r = int(255 * (1 - score)) + g = int(255 * score) + b = 0 + return (r, g, b) + + +__all__ = ["EfficientLoFTRImageProcessorFast"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/efficientloftr/modeling_efficientloftr.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/efficientloftr/modeling_efficientloftr.py new file mode 100644 index 0000000000000000000000000000000000000000..f5a20a7cc87dea233db5f4ddc3313df44f72c9d5 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/efficientloftr/modeling_efficientloftr.py @@ -0,0 +1,1325 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from dataclasses import dataclass +from typing import Callable, Optional, Union + +import torch +from torch import nn + +from ...activations import ACT2CLS, ACT2FN +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import BackboneOutput +from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...processing_utils import Unpack +from ...pytorch_utils import compile_compatible_method_lru_cache +from ...utils import ( + ModelOutput, + TransformersKwargs, + auto_docstring, + can_return_tuple, + torch_int, +) +from ...utils.generic import check_model_inputs +from .configuration_efficientloftr import EfficientLoFTRConfig + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for outputs of keypoint matching models. Due to the nature of keypoint detection and matching, the number + of keypoints is not fixed and can vary from image to image, which makes batching non-trivial. In the batch of + images, the maximum number of matches is set as the dimension of the matches and matching scores. The mask tensor is + used to indicate which values in the keypoints, matches and matching_scores tensors are keypoint matching + information. + """ +) +class KeypointMatchingOutput(ModelOutput): + r""" + matches (`torch.FloatTensor` of shape `(batch_size, 2, num_matches)`): + Index of keypoint matched in the other image. + matching_scores (`torch.FloatTensor` of shape `(batch_size, 2, num_matches)`): + Scores of predicted matches. + keypoints (`torch.FloatTensor` of shape `(batch_size, num_keypoints, 2)`): + Absolute (x, y) coordinates of predicted keypoints in a given image. + hidden_states (`tuple[torch.FloatTensor, ...]`, *optional*): + Tuple of `torch.FloatTensor` (one for the output of each stage) of shape `(batch_size, 2, num_channels, + num_keypoints)`, returned when `output_hidden_states=True` is passed or when + `config.output_hidden_states=True`) + attentions (`tuple[torch.FloatTensor, ...]`, *optional*): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, 2, num_heads, num_keypoints, + num_keypoints)`, returned when `output_attentions=True` is passed or when `config.output_attentions=True`) + """ + + matches: Optional[torch.FloatTensor] = None + matching_scores: Optional[torch.FloatTensor] = None + keypoints: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + + +@compile_compatible_method_lru_cache(maxsize=32) +def compute_embeddings(inv_freq: torch.Tensor, embed_height: int, embed_width: int, hidden_size: int) -> torch.Tensor: + i_indices = torch.ones(embed_height, embed_width, dtype=inv_freq.dtype, device=inv_freq.device) + j_indices = torch.ones(embed_height, embed_width, dtype=inv_freq.dtype, device=inv_freq.device) + i_indices = i_indices.cumsum(0).unsqueeze(-1) + j_indices = j_indices.cumsum(1).unsqueeze(-1) + + emb = torch.zeros(1, embed_height, embed_width, hidden_size // 2, dtype=inv_freq.dtype, device=inv_freq.device) + emb[:, :, :, 0::2] = i_indices * inv_freq + emb[:, :, :, 1::2] = j_indices * inv_freq + + return emb + + +class EfficientLoFTRRotaryEmbedding(nn.Module): + inv_freq: torch.Tensor # fix linting for `register_buffer` + + def __init__(self, config: EfficientLoFTRConfig, device=None): + super().__init__() + self.config = config + self.rope_type = config.rope_scaling["rope_type"] + self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + + 
inv_freq, _ = self.rope_init_fn(self.config, device) + inv_freq_expanded = inv_freq[None, None, None, :].float().expand(1, 1, 1, -1) + + self.register_buffer("inv_freq", inv_freq_expanded, persistent=False) + + @torch.no_grad() + def forward( + self, x: torch.Tensor, position_ids: Optional[tuple[torch.LongTensor, torch.LongTensor]] = None + ) -> tuple[torch.Tensor, torch.Tensor]: + feats_height, feats_width = x.shape[-2:] + embed_height = (feats_height - self.config.q_aggregation_kernel_size) // self.config.q_aggregation_stride + 1 + embed_width = (feats_width - self.config.q_aggregation_kernel_size) // self.config.q_aggregation_stride + 1 + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + emb = compute_embeddings(self.inv_freq, embed_height, embed_width, self.config.hidden_size) + sin = emb.sin() + cos = emb.cos() + + sin = sin.repeat_interleave(2, dim=-1) + cos = cos.repeat_interleave(2, dim=-1) + + sin = sin.to(device=x.device, dtype=x.dtype) + cos = cos.to(device=x.device, dtype=x.dtype) + + return cos, sin + + +# Copied from transformers.models.rt_detr_v2.modeling_rt_detr_v2.RTDetrV2ConvNormLayer with RTDetrV2->EfficientLoFTR +class EfficientLoFTRConvNormLayer(nn.Module): + def __init__(self, config, in_channels, out_channels, kernel_size, stride, padding=None, activation=None): + super().__init__() + self.conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size, + stride, + padding=(kernel_size - 1) // 2 if padding is None else padding, + bias=False, + ) + self.norm = nn.BatchNorm2d(out_channels, config.batch_norm_eps) + self.activation = nn.Identity() if activation is None else ACT2CLS[activation]() + + def forward(self, hidden_state): + hidden_state = self.conv(hidden_state) + hidden_state = self.norm(hidden_state) + hidden_state = self.activation(hidden_state) + return hidden_state + + +class EfficientLoFTRRepVGGBlock(GradientCheckpointingLayer): + """ + RepVGG architecture block introduced by the work "RepVGG: Making VGG-style ConvNets Great Again". 
+ """ + + def __init__(self, config: EfficientLoFTRConfig, stage_idx: int, block_idx: int): + super().__init__() + in_channels = config.stage_block_in_channels[stage_idx][block_idx] + out_channels = config.stage_block_out_channels[stage_idx][block_idx] + stride = config.stage_block_stride[stage_idx][block_idx] + activation = config.activation_function + self.conv1 = EfficientLoFTRConvNormLayer( + config, in_channels, out_channels, kernel_size=3, stride=stride, padding=1 + ) + self.conv2 = EfficientLoFTRConvNormLayer( + config, in_channels, out_channels, kernel_size=1, stride=stride, padding=0 + ) + self.identity = nn.BatchNorm2d(in_channels) if in_channels == out_channels and stride == 1 else None + self.activation = nn.Identity() if activation is None else ACT2FN[activation] + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + if self.identity is not None: + identity_out = self.identity(hidden_states) + else: + identity_out = 0 + hidden_states = self.conv1(hidden_states) + self.conv2(hidden_states) + identity_out + hidden_states = self.activation(hidden_states) + return hidden_states + + +class EfficientLoFTRRepVGGStage(nn.Module): + def __init__(self, config: EfficientLoFTRConfig, stage_idx: int): + super().__init__() + self.blocks = nn.ModuleList([]) + for block_idx in range(config.stage_num_blocks[stage_idx]): + self.blocks.append( + EfficientLoFTRRepVGGBlock( + config, + stage_idx, + block_idx, + ) + ) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + for block in self.blocks: + hidden_states = block(hidden_states) + return hidden_states + + +class EfficientLoFTRepVGG(nn.Module): + def __init__(self, config: EfficientLoFTRConfig): + super().__init__() + + self.stages = nn.ModuleList([]) + + for stage_idx in range(len(config.stage_stride)): + stage = EfficientLoFTRRepVGGStage(config, stage_idx) + self.stages.append(stage) + + def forward(self, hidden_states: torch.Tensor) -> list[torch.Tensor]: + outputs = [] + for stage in self.stages: + hidden_states = stage(hidden_states) + outputs.append(hidden_states) + + # Exclude first stage in outputs + outputs = outputs[1:] + return outputs + + +class EfficientLoFTRAggregationLayer(nn.Module): + def __init__(self, config: EfficientLoFTRConfig): + super().__init__() + + hidden_size = config.hidden_size + + self.q_aggregation = nn.Conv2d( + hidden_size, + hidden_size, + kernel_size=config.q_aggregation_kernel_size, + padding=0, + stride=config.q_aggregation_stride, + bias=False, + groups=hidden_size, + ) + self.kv_aggregation = torch.nn.MaxPool2d( + kernel_size=config.kv_aggregation_kernel_size, stride=config.kv_aggregation_stride + ) + self.norm = nn.LayerNorm(hidden_size) + + def forward( + self, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + query_states = hidden_states + is_cross_attention = encoder_hidden_states is not None + kv_states = encoder_hidden_states if is_cross_attention else hidden_states + + query_states = self.q_aggregation(query_states) + kv_states = self.kv_aggregation(kv_states) + query_states = query_states.permute(0, 2, 3, 1) + kv_states = kv_states.permute(0, 2, 3, 1) + hidden_states = self.norm(query_states) + encoder_hidden_states = self.norm(kv_states) + return hidden_states, encoder_hidden_states + + +# Copied from transformers.models.cohere.modeling_cohere.rotate_half +def rotate_half(x): + # Split and rotate. Note that this function is different from e.g. Llama. 
+ x1 = x[..., ::2] + x2 = x[..., 1::2] + rot_x = torch.stack([-x2, x1], dim=-1).flatten(-2) + return rot_x + + +# Copied from transformers.models.cohere.modeling_cohere.apply_rotary_pos_emb +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + dtype = q.dtype + q = q.float() + k = k.float() + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed.to(dtype=dtype), k_embed.to(dtype=dtype) + + +# Copied from transformers.models.cohere.modeling_cohere.repeat_kv +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). 
The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +# Copied from transformers.models.llama.modeling_llama.eager_attention_forward +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + dropout: float = 0.0, + **kwargs: Unpack[TransformersKwargs], +): + key_states = repeat_kv(key, module.num_key_value_groups) + value_states = repeat_kv(value, module.num_key_value_groups) + + attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling + if attention_mask is not None: + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + attn_output = torch.matmul(attn_weights, value_states) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +class EfficientLoFTRAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: EfficientLoFTRConfig, layer_idx: int): + super().__init__() + self.config = config + self.layer_idx = layer_idx + self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) + self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads + self.scaling = self.head_dim**-0.5 + self.attention_dropout = config.attention_dropout + self.is_causal = False + + self.q_proj = nn.Linear( + config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias + ) + self.k_proj = nn.Linear( + config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias + ) + self.v_proj = nn.Linear( + config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias + ) + self.o_proj = nn.Linear( + config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias + ) + + def forward( + self, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + batch_size, seq_len, dim = hidden_states.shape + input_shape = hidden_states.shape[:-1] + + query_states = self.q_proj(hidden_states).view(batch_size, seq_len, -1, dim) + + is_cross_attention = encoder_hidden_states is not None + current_states = encoder_hidden_states if is_cross_attention else hidden_states + + key_states = self.k_proj(current_states).view(batch_size, seq_len, -1, dim) + value_states = self.v_proj(current_states).view(batch_size, seq_len, -1, self.head_dim).transpose(1, 2) + + if position_embeddings is not None: + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, unsqueeze_dim=2) + + query_states = query_states.view(batch_size, seq_len, -1, self.head_dim).transpose(1, 2) + key_states 
= key_states.view(batch_size, seq_len, -1, self.head_dim).transpose(1, 2) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask=None, + dropout=0.0 if not self.training else self.attention_dropout, + scaling=self.scaling, + **kwargs, + ) + + attn_output = attn_output.reshape(*input_shape, -1).contiguous() + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights + + +class EfficientLoFTRMLP(nn.Module): + def __init__(self, config: EfficientLoFTRConfig): + super().__init__() + hidden_size = config.hidden_size + intermediate_size = config.intermediate_size + self.fc1 = nn.Linear(hidden_size * 2, intermediate_size, bias=False) + self.activation = ACT2FN[config.mlp_activation_function] + self.fc2 = nn.Linear(intermediate_size, hidden_size, bias=False) + self.layer_norm = nn.LayerNorm(hidden_size) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation(hidden_states) + hidden_states = self.fc2(hidden_states) + hidden_states = self.layer_norm(hidden_states) + return hidden_states + + +class EfficientLoFTRAggregatedAttention(nn.Module): + def __init__(self, config: EfficientLoFTRConfig, layer_idx: int): + super().__init__() + + self.q_aggregation_kernel_size = config.q_aggregation_kernel_size + self.aggregation = EfficientLoFTRAggregationLayer(config) + self.attention = EfficientLoFTRAttention(config, layer_idx) + self.mlp = EfficientLoFTRMLP(config) + + def forward( + self, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> torch.Tensor: + batch_size, embed_dim, _, _ = hidden_states.shape + + # Aggregate features + aggregated_hidden_states, aggregated_encoder_hidden_states = self.aggregation( + hidden_states, encoder_hidden_states + ) + _, aggregated_h, aggregated_w, _ = aggregated_hidden_states.shape + + # Multi-head attention + aggregated_hidden_states = aggregated_hidden_states.reshape(batch_size, -1, embed_dim) + aggregated_encoder_hidden_states = aggregated_encoder_hidden_states.reshape(batch_size, -1, embed_dim) + attn_output, _ = self.attention( + aggregated_hidden_states, + aggregated_encoder_hidden_states, + position_embeddings=position_embeddings, + **kwargs, + ) + + # Upsample features + # (batch_size, seq_len, embed_dim) -> (batch_size, embed_dim, h, w) with seq_len = h * w + attn_output = attn_output.permute(0, 2, 1) + attn_output = attn_output.reshape(batch_size, embed_dim, aggregated_h, aggregated_w) + attn_output = torch.nn.functional.interpolate( + attn_output, scale_factor=self.q_aggregation_kernel_size, mode="bilinear", align_corners=False + ) + intermediate_states = torch.cat([hidden_states, attn_output], dim=1) + intermediate_states = intermediate_states.permute(0, 2, 3, 1) + output_states = self.mlp(intermediate_states) + output_states = output_states.permute(0, 3, 1, 2) + + hidden_states = hidden_states + output_states + + return hidden_states + + +class EfficientLoFTRLocalFeatureTransformerLayer(GradientCheckpointingLayer): + def __init__(self, config: EfficientLoFTRConfig, layer_idx: int): + super().__init__() + + self.self_attention = 
EfficientLoFTRAggregatedAttention(config, layer_idx) + self.cross_attention = EfficientLoFTRAggregatedAttention(config, layer_idx) + + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + **kwargs: Unpack[TransformersKwargs], + ) -> torch.Tensor: + batch_size, _, embed_dim, height, width = hidden_states.shape + + hidden_states = hidden_states.reshape(-1, embed_dim, height, width) + hidden_states = self.self_attention(hidden_states, position_embeddings=position_embeddings, **kwargs) + + ### + # Implementation of a bug in the original implementation regarding the cross-attention + # See : https://github.com/zju3dv/MatchAnything/issues/26 + hidden_states = hidden_states.reshape(-1, 2, embed_dim, height, width) + features_0 = hidden_states[:, 0] + features_1 = hidden_states[:, 1] + features_0 = self.cross_attention(features_0, features_1, **kwargs) + features_1 = self.cross_attention(features_1, features_0, **kwargs) + hidden_states = torch.stack((features_0, features_1), dim=1) + ### + + return hidden_states + + +class EfficientLoFTRLocalFeatureTransformer(nn.Module): + def __init__(self, config: EfficientLoFTRConfig): + super().__init__() + self.layers = nn.ModuleList( + [ + EfficientLoFTRLocalFeatureTransformerLayer(config, layer_idx=i) + for i in range(config.num_attention_layers) + ] + ) + + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + **kwargs: Unpack[TransformersKwargs], + ) -> torch.Tensor: + for layer in self.layers: + hidden_states = layer(hidden_states, position_embeddings=position_embeddings, **kwargs) + return hidden_states + + +class EfficientLoFTROutConvBlock(nn.Module): + def __init__(self, config: EfficientLoFTRConfig, hidden_size: int, intermediate_size: int): + super().__init__() + + self.out_conv1 = nn.Conv2d(hidden_size, intermediate_size, kernel_size=1, stride=1, padding=0, bias=False) + self.out_conv2 = nn.Conv2d( + intermediate_size, intermediate_size, kernel_size=3, stride=1, padding=1, bias=False + ) + self.batch_norm = nn.BatchNorm2d(intermediate_size) + self.activation = ACT2CLS[config.mlp_activation_function]() + self.out_conv3 = nn.Conv2d(intermediate_size, hidden_size, kernel_size=3, stride=1, padding=1, bias=False) + + def forward(self, hidden_states: torch.Tensor, residual_states: torch.Tensor) -> torch.Tensor: + residual_states = self.out_conv1(residual_states) + residual_states = residual_states + hidden_states + residual_states = self.out_conv2(residual_states) + residual_states = self.batch_norm(residual_states) + residual_states = self.activation(residual_states) + residual_states = self.out_conv3(residual_states) + residual_states = nn.functional.interpolate( + residual_states, scale_factor=2.0, mode="bilinear", align_corners=False + ) + return residual_states + + +class EfficientLoFTRFineFusionLayer(nn.Module): + def __init__(self, config: EfficientLoFTRConfig): + super().__init__() + + self.fine_kernel_size = config.fine_kernel_size + + fine_fusion_dims = config.fine_fusion_dims + self.out_conv = nn.Conv2d( + fine_fusion_dims[0], fine_fusion_dims[0], kernel_size=1, stride=1, padding=0, bias=False + ) + self.out_conv_layers = nn.ModuleList() + for i in range(1, len(fine_fusion_dims)): + out_conv = EfficientLoFTROutConvBlock(config, fine_fusion_dims[i], fine_fusion_dims[i - 1]) + self.out_conv_layers.append(out_conv) + + def forward_pyramid( + self, + hidden_states: torch.Tensor, + residual_states: list[torch.Tensor], + ) -> 
torch.Tensor: + hidden_states = self.out_conv(hidden_states) + hidden_states = nn.functional.interpolate( + hidden_states, scale_factor=2.0, mode="bilinear", align_corners=False + ) + for i, layer in enumerate(self.out_conv_layers): + hidden_states = layer(hidden_states, residual_states[i]) + + return hidden_states + + def forward( + self, + coarse_features: torch.Tensor, + residual_features: list[torch.Tensor], + ) -> tuple[torch.Tensor, torch.Tensor]: + """ + For each image pair, compute the fine features of pixels. + In both images, compute a patch of fine features center cropped around each coarse pixel. + In the first image, the feature patch is kernel_size large and long. + In the second image, it is (kernel_size + 2) large and long. + """ + batch_size, _, embed_dim, coarse_height, coarse_width = coarse_features.shape + + coarse_features = coarse_features.reshape(-1, embed_dim, coarse_height, coarse_width) + residual_features = list(reversed(residual_features)) + + # 1. Fine feature extraction + fine_features = self.forward_pyramid(coarse_features, residual_features) + _, fine_embed_dim, fine_height, fine_width = fine_features.shape + + fine_features = fine_features.reshape(batch_size, 2, fine_embed_dim, fine_height, fine_width) + fine_features_0 = fine_features[:, 0] + fine_features_1 = fine_features[:, 1] + + # 2. Unfold all local windows in crops + stride = int(fine_height // coarse_height) + fine_features_0 = nn.functional.unfold( + fine_features_0, kernel_size=self.fine_kernel_size, stride=stride, padding=0 + ) + _, _, seq_len = fine_features_0.shape + fine_features_0 = fine_features_0.reshape(batch_size, -1, self.fine_kernel_size**2, seq_len) + fine_features_0 = fine_features_0.permute(0, 3, 2, 1) + + fine_features_1 = nn.functional.unfold( + fine_features_1, kernel_size=self.fine_kernel_size + 2, stride=stride, padding=1 + ) + fine_features_1 = fine_features_1.reshape(batch_size, -1, (self.fine_kernel_size + 2) ** 2, seq_len) + fine_features_1 = fine_features_1.permute(0, 3, 2, 1) + + return fine_features_0, fine_features_1 + + +@auto_docstring +class EfficientLoFTRPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = EfficientLoFTRConfig + base_model_prefix = "efficientloftr" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + _supports_flash_attn = True + _supports_sdpa = True + _can_record_outputs = { + "hidden_states": EfficientLoFTRRepVGGBlock, + "attentions": EfficientLoFTRAttention, + } + + def _init_weights(self, module: nn.Module) -> None: + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d, nn.Conv1d, nn.BatchNorm2d)): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + # Copied from transformers.models.superpoint.modeling_superpoint.SuperPointPreTrainedModel.extract_one_channel_pixel_values with SuperPoint->EfficientLoFTR + def extract_one_channel_pixel_values(self, pixel_values: torch.FloatTensor) -> torch.FloatTensor: + """ + Assuming pixel_values has shape (batch_size, 3, height, width), and that all channels values are the same, + extract the first channel value to get a tensor of shape (batch_size, 1, height, width) for EfficientLoFTR. 
This is + a workaround for the issue discussed in : + https://github.com/huggingface/transformers/pull/25786#issuecomment-1730176446 + + Args: + pixel_values: torch.FloatTensor of shape (batch_size, 3, height, width) + + Returns: + pixel_values: torch.FloatTensor of shape (batch_size, 1, height, width) + + """ + return pixel_values[:, 0, :, :][:, None, :, :] + + +@auto_docstring( + custom_intro=""" + EfficientLoFTR model taking images as inputs and outputting the features of the images. + """ +) +class EfficientLoFTRModel(EfficientLoFTRPreTrainedModel): + def __init__(self, config: EfficientLoFTRConfig): + super().__init__(config) + + self.config = config + self.backbone = EfficientLoFTRepVGG(config) + self.local_feature_transformer = EfficientLoFTRLocalFeatureTransformer(config) + self.rotary_emb = EfficientLoFTRRotaryEmbedding(config=config) + + self.post_init() + + @check_model_inputs + @auto_docstring + def forward( + self, + pixel_values: torch.FloatTensor, + labels: Optional[torch.LongTensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> BackboneOutput: + r""" + Examples: + + ```python + >>> from transformers import AutoImageProcessor, AutoModel + >>> import torch + >>> from PIL import Image + >>> import requests + + >>> url = "https://github.com/magicleap/SuperGluePretrainedNetwork/blob/master/assets/phototourism_sample_images/london_bridge_78916675_4568141288.jpg?raw=true" + >>> image1 = Image.open(requests.get(url, stream=True).raw) + >>> url = "https://github.com/magicleap/SuperGluePretrainedNetwork/blob/master/assets/phototourism_sample_images/london_bridge_19481797_2295892421.jpg?raw=true" + >>> image2 = Image.open(requests.get(url, stream=True).raw) + >>> images = [image1, image2] + + >>> processor = AutoImageProcessor.from_pretrained("zju-community/efficient_loftr") + >>> model = AutoModel.from_pretrained("zju-community/efficient_loftr") + + >>> with torch.no_grad(): + >>> inputs = processor(images, return_tensors="pt") + >>> outputs = model(**inputs) + ```""" + if labels is not None: + raise ValueError("EfficientLoFTR is not trainable, no labels should be provided.") + + if pixel_values.ndim != 5 or pixel_values.size(1) != 2: + raise ValueError("Input must be a 5D tensor of shape (batch_size, 2, num_channels, height, width)") + + batch_size, _, channels, height, width = pixel_values.shape + pixel_values = pixel_values.reshape(batch_size * 2, channels, height, width) + pixel_values = self.extract_one_channel_pixel_values(pixel_values) + + # 1. Local Feature CNN + features = self.backbone(pixel_values) + # Last stage outputs are coarse outputs + coarse_features = features[-1] + # Rest is residual features used in EfficientLoFTRFineFusionLayer + residual_features = features[:-1] + coarse_embed_dim, coarse_height, coarse_width = coarse_features.shape[-3:] + + # 2. 
Coarse-level LoFTR module + cos, sin = self.rotary_emb(coarse_features) + cos = cos.expand(batch_size * 2, -1, -1, -1).reshape(batch_size * 2, -1, coarse_embed_dim) + sin = sin.expand(batch_size * 2, -1, -1, -1).reshape(batch_size * 2, -1, coarse_embed_dim) + position_embeddings = (cos, sin) + + coarse_features = coarse_features.reshape(batch_size, 2, coarse_embed_dim, coarse_height, coarse_width) + coarse_features = self.local_feature_transformer( + coarse_features, position_embeddings=position_embeddings, **kwargs + ) + + features = (coarse_features,) + tuple(residual_features) + + return BackboneOutput(feature_maps=features) + + +def mask_border(tensor: torch.Tensor, border_margin: int, value: Union[bool, float, int]) -> torch.Tensor: + """ + Mask a tensor border with a given value + + Args: + tensor (`torch.Tensor` of shape `(batch_size, height_0, width_0, height_1, width_1)`): + The tensor to mask + border_margin (`int`) : + The size of the border + value (`Union[bool, int, float]`): + The value to place in the tensor's borders + + Returns: + tensor (`torch.Tensor` of shape `(batch_size, height_0, width_0, height_1, width_1)`): + The masked tensor + """ + if border_margin <= 0: + return tensor + + tensor[:, :border_margin] = value + tensor[:, :, :border_margin] = value + tensor[:, :, :, :border_margin] = value + tensor[:, :, :, :, :border_margin] = value + tensor[:, -border_margin:] = value + tensor[:, :, -border_margin:] = value + tensor[:, :, :, -border_margin:] = value + tensor[:, :, :, :, -border_margin:] = value + + return tensor + + +def create_meshgrid( + height: Union[int, torch.Tensor], + width: Union[int, torch.Tensor], + normalized_coordinates: bool = False, + device: Optional[torch.device] = None, + dtype: Optional[torch.dtype] = None, +) -> torch.Tensor: + """ + Copied from kornia library : kornia/kornia/utils/grid.py:26 + + Generate a coordinate grid for an image. + + When the flag ``normalized_coordinates`` is set to True, the grid is + normalized to be in the range :math:`[-1,1]` to be consistent with the pytorch + function :py:func:`torch.nn.functional.grid_sample`. + + Args: + height (`int`): + The image height (rows). + width (`int`): + The image width (cols). + normalized_coordinates (`bool`): + Whether to normalize coordinates in the range :math:`[-1,1]` in order to be consistent with the + PyTorch function :py:func:`torch.nn.functional.grid_sample`. + device (`torch.device`): + The device on which the grid will be generated. + dtype (`torch.dtype`): + The data type of the generated grid. + + Return: + grid (`torch.Tensor` of shape `(1, height, width, 2)`): + The grid tensor. + + Example: + >>> create_meshgrid(2, 2) + tensor([[[[-1., -1.], + [ 1., -1.]], + + [[-1., 1.], + [ 1., 1.]]]]) + + >>> create_meshgrid(2, 2, normalized_coordinates=False) + tensor([[[[0., 0.], + [1., 0.]], + + [[0., 1.], + [1., 1.]]]]) + + """ + xs = torch.linspace(0, width - 1, width, device=device, dtype=dtype) + ys = torch.linspace(0, height - 1, height, device=device, dtype=dtype) + if normalized_coordinates: + xs = (xs / (width - 1) - 0.5) * 2 + ys = (ys / (height - 1) - 0.5) * 2 + grid = torch.stack(torch.meshgrid(ys, xs, indexing="ij"), dim=-1) + grid = grid.permute(1, 0, 2).unsqueeze(0) + return grid + + +def spatial_expectation2d(input: torch.Tensor, normalized_coordinates: bool = True) -> torch.Tensor: + r""" + Copied from kornia library : kornia/geometry/subpix/dsnt.py:76 + Compute the expectation of coordinate values using spatial probabilities. 
+ + The input heatmap is assumed to represent a valid spatial probability distribution, + which can be achieved using :func:`~kornia.geometry.subpixel.spatial_softmax2d`. + + Args: + input (`torch.Tensor` of shape `(batch_size, embed_dim, height, width)`): + The input tensor representing dense spatial probabilities. + normalized_coordinates (`bool`): + Whether to return the coordinates normalized in the range of :math:`[-1, 1]`. Otherwise, it will return + the coordinates in the range of the input shape. + + Returns: + output (`torch.Tensor` of shape `(batch_size, embed_dim, 2)`) + Expected value of the 2D coordinates. Output order of the coordinates is (x, y). + + Examples: + >>> heatmaps = torch.tensor([[[ + ... [0., 0., 0.], + ... [0., 0., 0.], + ... [0., 1., 0.]]]]) + >>> spatial_expectation2d(heatmaps, False) + tensor([[[1., 2.]]]) + + """ + batch_size, embed_dim, height, width = input.shape + + # Create coordinates grid. + grid = create_meshgrid(height, width, normalized_coordinates, input.device) + grid = grid.to(input.dtype) + + pos_x = grid[..., 0].reshape(-1) + pos_y = grid[..., 1].reshape(-1) + + input_flat = input.view(batch_size, embed_dim, -1) + + # Compute the expectation of the coordinates. + expected_y = torch.sum(pos_y * input_flat, -1, keepdim=True) + expected_x = torch.sum(pos_x * input_flat, -1, keepdim=True) + + output = torch.cat([expected_x, expected_y], -1) + + return output.view(batch_size, embed_dim, 2) + + +@auto_docstring( + custom_intro=""" + EfficientLoFTR model taking images as inputs and outputting the matches between them. + """ +) +class EfficientLoFTRForKeypointMatching(EfficientLoFTRPreTrainedModel): + """EfficientLoFTR dense image matcher + + Given two images, we determine the correspondences by: + 1. Extracting coarse and fine features through a backbone + 2. Transforming coarse features through self and cross attention + 3. Matching coarse features to obtain coarse coordinates of matches + 4. Obtaining full resolution fine features by fusing transformed and backbone coarse features + 5. Refining the coarse matches using fine feature patches centered at each coarse match in a two-stage refinement + + Yifan Wang, Xingyi He, Sida Peng, Dongli Tan and Xiaowei Zhou. + Efficient LoFTR: Semi-Dense Local Feature Matching with Sparse-Like Speed + In CVPR, 2024. https://huggingface.co/papers/2403.04765 + """ + + def __init__(self, config: EfficientLoFTRConfig): + super().__init__(config) + + self.config = config + self.efficientloftr = EfficientLoFTRModel(config) + self.refinement_layer = EfficientLoFTRFineFusionLayer(config) + + self.post_init() + + def _get_matches_from_scores(self, scores: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + """ + Based on a keypoint score matrix, compute the best keypoint matches between the first and second image. + Since each image pair can have a different number of matches, the matches are concatenated together for all pairs + in the batch and a batch_indices tensor is returned to specify which match belongs to which element in the batch. + + Note: + This step can be done as a postprocessing step, because it does not involve any model weights/params. + However, we keep it in the modeling code for consistency with other keypoint matching models AND for + easier torch.compile/torch.export (all ops are in torch). 
+ + Args: + scores (`torch.Tensor` of shape `(batch_size, height_0, width_0, height_1, width_1)`): + Scores of keypoints + + Returns: + matched_indices (`torch.Tensor` of shape `(2, num_matches)`): + Indices representing which pixel in the first image matches which pixel in the second image + matching_scores (`torch.Tensor` of shape `(num_matches,)`): + Scores of each match + """ + batch_size, height0, width0, height1, width1 = scores.shape + + scores = scores.view(batch_size, height0 * width0, height1 * width1) + + # For each keypoint, get the best match + max_0 = scores.max(2, keepdim=True).values + max_1 = scores.max(1, keepdim=True).values + + # 1. Thresholding + mask = scores > self.config.coarse_matching_threshold + + # 2. Border removal + mask = mask.reshape(batch_size, height0, width0, height1, width1) + mask = mask_border(mask, self.config.coarse_matching_border_removal, False) + mask = mask.reshape(batch_size, height0 * width0, height1 * width1) + + # 3. Mutual nearest neighbors + mask = mask * (scores == max_0) * (scores == max_1) + + # 4. Find coarse matches + masked_scores = scores * mask + matching_scores_0, max_indices_0 = masked_scores.max(1) + matching_scores_1, max_indices_1 = masked_scores.max(2) + + matching_indices = torch.cat([max_indices_0, max_indices_1]).reshape(batch_size, 2, -1) + matching_scores = torch.stack([matching_scores_0, matching_scores_1], dim=1) + + # For the keypoints not meeting the threshold score, set the indices to -1 which corresponds to no matches found + matching_indices = torch.where(matching_scores > 0, matching_indices, -1) + + return matching_indices, matching_scores + + def _coarse_matching( + self, coarse_features: torch.Tensor, coarse_scale: float + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + For each image pair, compute the matching confidence between each coarse element (by default (image_height / 8) + * (image_width / 8) elements) from the first image to the second image. + + Note: + This step can be done as a postprocessing step, because it does not involve any model weights/params. + However, we keep it in the modeling code for consistency with other keypoint matching models AND for + easier torch.compile/torch.export (all ops are in torch). + + Args: + coarse_features (`torch.Tensor` of shape `(batch_size, 2, hidden_size, coarse_height, coarse_width)`): + Coarse features + coarse_scale (`float`): Scale between the image size and the coarse size + + Returns: + keypoints (`torch.Tensor` of shape `(batch_size, 2, num_matches, 2)`): + Keypoint coordinates. + matching_scores (`torch.Tensor` of shape `(batch_size, 2, num_matches)`): + The confidence matching score of each keypoint. + matched_indices (`torch.Tensor` of shape `(batch_size, 2, num_matches)`): + Indices which indicate which keypoint in an image matched with which keypoint in the other image. For + both images in the pair. 
+ """ + batch_size, _, embed_dim, height, width = coarse_features.shape + + # (batch_size, 2, embed_dim, height, width) -> (batch_size, 2, height * width, embed_dim) + coarse_features = coarse_features.permute(0, 1, 3, 4, 2) + coarse_features = coarse_features.reshape(batch_size, 2, -1, embed_dim) + + coarse_features = coarse_features / coarse_features.shape[-1] ** 0.5 + coarse_features_0 = coarse_features[:, 0] + coarse_features_1 = coarse_features[:, 1] + + similarity = coarse_features_0 @ coarse_features_1.transpose(-1, -2) + similarity = similarity / self.config.coarse_matching_temperature + + if self.config.coarse_matching_skip_softmax: + confidence = similarity + else: + confidence = nn.functional.softmax(similarity, 1) * nn.functional.softmax(similarity, 2) + + confidence = confidence.view(batch_size, height, width, height, width) + matched_indices, matching_scores = self._get_matches_from_scores(confidence) + + keypoints = torch.stack([matched_indices % width, matched_indices // width], dim=-1) * coarse_scale + + return keypoints, matching_scores, matched_indices + + def _get_first_stage_fine_matching( + self, + fine_confidence: torch.Tensor, + coarse_matched_keypoints: torch.Tensor, + fine_window_size: int, + fine_scale: float, + ) -> tuple[torch.Tensor, torch.Tensor]: + """ + For each coarse pixel, retrieve the highest fine confidence score and index. + The index represents the matching between a pixel position in the fine window in the first image and a pixel + position in the fine window of the second image. + For example, for a fine_window_size of 64 (8 * 8), the index 2474 represents the matching between the index 38 + (2474 // 64) in the fine window of the first image, and the index 42 in the second image. This means that 38 + which corresponds to the position (4, 6) (4 // 8 and 4 % 8) is matched with the position (5, 2). In this example + the coarse matched coordinate will be shifted to the matched fine coordinates in the first and second image. + + Note: + This step can be done as a postprocessing step, because does not involve any model weights/params. + However, we keep it in the modeling code for consistency with other keypoint matching models AND for + easier torch.compile/torch.export (all ops are in torch). + + Args: + fine_confidence (`torch.Tensor` of shape `(num_matches, fine_window_size, fine_window_size)`): + First stage confidence of matching fine features between the first and the second image + coarse_matched_keypoints (`torch.Tensor` of shape `(2, num_matches, 2)`): + Coarse matched keypoint between the first and the second image. 
+ fine_window_size (`int`): + Size of the window used to refine matches + fine_scale (`float`): + Scale between the size of fine features and coarse features + + Returns: + indices (`torch.Tensor` of shape `(2, num_matches, 1)`): + Indices of the fine coordinate matched in the fine window + fine_matches (`torch.Tensor` of shape `(2, num_matches, 2)`): + Coordinates of matched keypoints after the first fine stage + """ + batch_size, num_keypoints, _, _ = fine_confidence.shape + fine_kernel_size = torch_int(fine_window_size**0.5) + + fine_confidence = fine_confidence.reshape(batch_size, num_keypoints, -1) + values, indices = torch.max(fine_confidence, dim=-1) + indices = indices[..., None] + indices_0 = indices // fine_window_size + indices_1 = indices % fine_window_size + + grid = create_meshgrid( + fine_kernel_size, + fine_kernel_size, + normalized_coordinates=False, + device=fine_confidence.device, + dtype=fine_confidence.dtype, + ) + grid = grid - (fine_kernel_size // 2) + 0.5 + grid = grid.reshape(1, 1, -1, 2).expand(batch_size, num_keypoints, -1, -1) + delta_0 = torch.gather(grid, 1, indices_0.unsqueeze(-1).expand(-1, -1, -1, 2)).squeeze(2) + delta_1 = torch.gather(grid, 1, indices_1.unsqueeze(-1).expand(-1, -1, -1, 2)).squeeze(2) + + fine_matches_0 = coarse_matched_keypoints[:, 0] + delta_0 * fine_scale + fine_matches_1 = coarse_matched_keypoints[:, 1] + delta_1 * fine_scale + + indices = torch.stack([indices_0, indices_1], dim=1) + fine_matches = torch.stack([fine_matches_0, fine_matches_1], dim=1) + + return indices, fine_matches + + def _get_second_stage_fine_matching( + self, + indices: torch.Tensor, + fine_matches: torch.Tensor, + fine_confidence: torch.Tensor, + fine_window_size: int, + fine_scale: float, + ) -> torch.Tensor: + """ + For the given position in their respective fine windows, retrieve the 3x3 fine confidences around this position. + After applying softmax to these confidences, compute the 2D spatial expected coordinates. + Shift the first stage fine matching with these expected coordinates. + + Note: + This step can be done as a postprocessing step, because does not involve any model weights/params. + However, we keep it in the modeling code for consistency with other keypoint matching models AND for + easier torch.compile/torch.export (all ops are in torch). 
+ + Args: + indices (`torch.Tensor` of shape `(batch_size, 2, num_keypoints)`): + Indices representing the position of each keypoint in the fine window + fine_matches (`torch.Tensor` of shape `(2, num_matches, 2)`): + Coordinates of matched keypoints after the first fine stage + fine_confidence (`torch.Tensor` of shape `(num_matches, fine_window_size, fine_window_size)`): + Second stage confidence of matching fine features between the first and the second image + fine_window_size (`int`): + Size of the window used to refine matches + fine_scale (`float`): + Scale between the size of fine features and coarse features + + Returns: + fine_matches (`torch.Tensor` of shape `(2, num_matches, 2)`): + Coordinates of matched keypoints after the second fine stage + """ + batch_size, num_keypoints, _, _ = fine_confidence.shape + fine_kernel_size = torch_int(fine_window_size**0.5) + + indices_0 = indices[:, 0] + indices_1 = indices[:, 1] + indices_1_i = indices_1 // fine_kernel_size + indices_1_j = indices_1 % fine_kernel_size + + # matches_indices, indices_0, indices_1_i, indices_1_j of shape (num_matches, 3, 3) + batch_indices = torch.arange(batch_size, device=indices_0.device).reshape(batch_size, 1, 1, 1) + matches_indices = torch.arange(num_keypoints, device=indices_0.device).reshape(1, num_keypoints, 1, 1) + indices_0 = indices_0[..., None] + indices_1_i = indices_1_i[..., None] + indices_1_j = indices_1_j[..., None] + + delta = create_meshgrid(3, 3, normalized_coordinates=True, device=indices_0.device).to(torch.long) + delta = delta[None, ...] + + indices_1_i = indices_1_i + delta[..., 1] + indices_1_j = indices_1_j + delta[..., 0] + + fine_confidence = fine_confidence.reshape( + batch_size, num_keypoints, fine_window_size, fine_kernel_size + 2, fine_kernel_size + 2 + ) + # (batch_size, seq_len, fine_window_size, fine_kernel_size + 2, fine_kernel_size + 2) -> (batch_size, seq_len, 3, 3) + fine_confidence = fine_confidence[batch_indices, matches_indices, indices_0, indices_1_i, indices_1_j] + fine_confidence = fine_confidence.reshape(batch_size, num_keypoints, 9) + fine_confidence = nn.functional.softmax( + fine_confidence / self.config.fine_matching_regress_temperature, dim=-1 + ) + + heatmap = fine_confidence.reshape(batch_size, num_keypoints, 3, 3) + fine_coordinates_normalized = spatial_expectation2d(heatmap, True)[0] + + fine_matches_0 = fine_matches[:, 0] + fine_matches_1 = fine_matches[:, 1] + (fine_coordinates_normalized * (3 // 2) * fine_scale) + + fine_matches = torch.stack([fine_matches_0, fine_matches_1], dim=1) + + return fine_matches + + def _fine_matching( + self, + fine_features_0: torch.Tensor, + fine_features_1: torch.Tensor, + coarse_matched_keypoints: torch.Tensor, + fine_scale: float, + ) -> torch.Tensor: + """ + For each coarse pixel with a corresponding window of fine features, compute the matching confidence between fine + features in the first image and the second image. + + Fine features are sliced in two part : + - The first part used for the first stage are the first fine_hidden_size - config.fine_matching_slicedim (64 - 8 + = 56 by default) features. + - The second part used for the second stage are the last config.fine_matching_slicedim (8 by default) features. + + Each part is used to compute a fine confidence tensor of the following shape : + (batch_size, (coarse_height * coarse_width), fine_window_size, fine_window_size) + They correspond to the score between each fine pixel in the first image and each fine pixel in the second image. 
+ + Args: + fine_features_0 (`torch.Tensor` of shape `(num_matches, fine_kernel_size ** 2, fine_kernel_size ** 2)`): + Fine features from the first image + fine_features_1 (`torch.Tensor` of shape `(num_matches, (fine_kernel_size + 2) ** 2, (fine_kernel_size + 2) + ** 2)`): + Fine features from the second image + coarse_matched_keypoints (`torch.Tensor` of shape `(2, num_matches, 2)`): + Keypoint coordinates found in coarse matching for the first and second image + fine_scale (`int`): + Scale between the size of fine features and coarse features + + Returns: + fine_coordinates (`torch.Tensor` of shape `(2, num_matches, 2)`): + Matched keypoint between the first and the second image. All matched keypoints are concatenated in the + second dimension. + + """ + batch_size, num_keypoints, fine_window_size, fine_embed_dim = fine_features_0.shape + fine_matching_slice_dim = self.config.fine_matching_slice_dim + + fine_kernel_size = torch_int(fine_window_size**0.5) + + # Split fine features into first and second stage features + split_fine_features_0 = torch.split(fine_features_0, fine_embed_dim - fine_matching_slice_dim, -1) + split_fine_features_1 = torch.split(fine_features_1, fine_embed_dim - fine_matching_slice_dim, -1) + + # Retrieve first stage fine features + fine_features_0 = split_fine_features_0[0] + fine_features_1 = split_fine_features_1[0] + + # Normalize first stage fine features + fine_features_0 = fine_features_0 / fine_features_0.shape[-1] ** 0.5 + fine_features_1 = fine_features_1 / fine_features_1.shape[-1] ** 0.5 + + # Compute first stage confidence + fine_confidence = fine_features_0 @ fine_features_1.transpose(-1, -2) + fine_confidence = nn.functional.softmax(fine_confidence, 1) * nn.functional.softmax(fine_confidence, 2) + fine_confidence = fine_confidence.reshape( + batch_size, num_keypoints, fine_window_size, fine_kernel_size + 2, fine_kernel_size + 2 + ) + fine_confidence = fine_confidence[..., 1:-1, 1:-1] + first_stage_fine_confidence = fine_confidence.reshape( + batch_size, num_keypoints, fine_window_size, fine_window_size + ) + + fine_indices, fine_matches = self._get_first_stage_fine_matching( + first_stage_fine_confidence, + coarse_matched_keypoints, + fine_window_size, + fine_scale, + ) + + # Retrieve second stage fine features + fine_features_0 = split_fine_features_0[1] + fine_features_1 = split_fine_features_1[1] + + # Normalize second stage fine features + fine_features_1 = fine_features_1 / fine_matching_slice_dim**0.5 + + # Compute second stage fine confidence + second_stage_fine_confidence = fine_features_0 @ fine_features_1.transpose(-1, -2) + + fine_coordinates = self._get_second_stage_fine_matching( + fine_indices, + fine_matches, + second_stage_fine_confidence, + fine_window_size, + fine_scale, + ) + + return fine_coordinates + + @auto_docstring + @can_return_tuple + def forward( + self, + pixel_values: torch.FloatTensor, + labels: Optional[torch.LongTensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> KeypointMatchingOutput: + r""" + Examples: + + ```python + >>> from transformers import AutoImageProcessor, AutoModel + >>> import torch + >>> from PIL import Image + >>> import requests + + >>> url = "https://github.com/magicleap/SuperGluePretrainedNetwork/blob/master/assets/phototourism_sample_images/london_bridge_78916675_4568141288.jpg?raw=true" + >>> image1 = Image.open(requests.get(url, stream=True).raw) + >>> url = 
"https://github.com/magicleap/SuperGluePretrainedNetwork/blob/master/assets/phototourism_sample_images/london_bridge_19481797_2295892421.jpg?raw=true" + >>> image2 = Image.open(requests.get(url, stream=True).raw) + >>> images = [image1, image2] + + >>> processor = AutoImageProcessor.from_pretrained("zju-community/efficient_loftr") + >>> model = AutoModel.from_pretrained("zju-community/efficient_loftr") + + >>> with torch.no_grad(): + >>> inputs = processor(images, return_tensors="pt") + >>> outputs = model(**inputs) + ```""" + if labels is not None: + raise ValueError("SuperGlue is not trainable, no labels should be provided.") + + # 1. Extract coarse and residual features + model_outputs: BackboneOutput = self.efficientloftr(pixel_values, **kwargs) + features = model_outputs.feature_maps + + # 2. Compute coarse-level matching + coarse_features = features[0] + coarse_embed_dim, coarse_height, coarse_width = coarse_features.shape[-3:] + batch_size, _, channels, height, width = pixel_values.shape + coarse_scale = height / coarse_height + coarse_keypoints, coarse_matching_scores, coarse_matched_indices = self._coarse_matching( + coarse_features, coarse_scale + ) + + # 3. Fine-level refinement + residual_features = features[1:] + coarse_features = coarse_features / self.config.hidden_size**0.5 + fine_features_0, fine_features_1 = self.refinement_layer(coarse_features, residual_features) + + # Filter fine features with coarse matches indices + _, _, num_keypoints = coarse_matching_scores.shape + batch_indices = torch.arange(batch_size)[..., None] + fine_features_0 = fine_features_0[batch_indices, coarse_matched_indices[:, 0]] + fine_features_1 = fine_features_1[batch_indices, coarse_matched_indices[:, 1]] + + # 4. Computer fine-level matching + fine_height = torch_int(coarse_height * coarse_scale) + fine_scale = height / fine_height + matching_keypoints = self._fine_matching(fine_features_0, fine_features_1, coarse_keypoints, fine_scale) + + matching_keypoints[:, :, :, 0] = matching_keypoints[:, :, :, 0] / width + matching_keypoints[:, :, :, 1] = matching_keypoints[:, :, :, 1] / height + + return KeypointMatchingOutput( + matches=coarse_matched_indices, + matching_scores=coarse_matching_scores, + keypoints=matching_keypoints, + hidden_states=model_outputs.hidden_states, + attentions=model_outputs.attentions, + ) + + +__all__ = ["EfficientLoFTRPreTrainedModel", "EfficientLoFTRModel", "EfficientLoFTRForKeypointMatching"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/electra/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/electra/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a78ed5c42aea51038335efabde5b03e333592ed6 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/electra/__init__.py @@ -0,0 +1,31 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_electra import * + from .modeling_electra import * + from .modeling_flax_electra import * + from .modeling_tf_electra import * + from .tokenization_electra import * + from .tokenization_electra_fast import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/electra/configuration_electra.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/electra/configuration_electra.py new file mode 100644 index 0000000000000000000000000000000000000000..f12756d976b35ee3a4f333483b1b4e6e1a07fb7e --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/electra/configuration_electra.py @@ -0,0 +1,187 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""ELECTRA model configuration""" + +from collections import OrderedDict +from collections.abc import Mapping + +from ...configuration_utils import PretrainedConfig +from ...onnx import OnnxConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class ElectraConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`ElectraModel`] or a [`TFElectraModel`]. It is + used to instantiate a ELECTRA model according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar configuration to that of the ELECTRA + [google/electra-small-discriminator](https://huggingface.co/google/electra-small-discriminator) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the ELECTRA model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`ElectraModel`] or [`TFElectraModel`]. + embedding_size (`int`, *optional*, defaults to 128): + Dimensionality of the encoder layers and the pooler layer. + hidden_size (`int`, *optional*, defaults to 256): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 4): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 1024): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. 
+ hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling [`ElectraModel`] or [`TFElectraModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + summary_type (`str`, *optional*, defaults to `"first"`): + Argument used when doing sequence summary. Used in the sequence classification and multiple choice models. + + Has to be one of the following options: + + - `"last"`: Take the last token hidden state (like XLNet). + - `"first"`: Take the first token hidden state (like BERT). + - `"mean"`: Take the mean of all tokens hidden states. + - `"cls_index"`: Supply a Tensor of classification token position (like GPT/GPT-2). + - `"attn"`: Not implemented now, use multi-head attention. + summary_use_proj (`bool`, *optional*, defaults to `True`): + Argument used when doing sequence summary. Used in the sequence classification and multiple choice models. + + Whether or not to add a projection after the vector extraction. + summary_activation (`str`, *optional*): + Argument used when doing sequence summary. Used in the sequence classification and multiple choice models. + + Pass `"gelu"` for a gelu activation to the output, any other value will result in no activation. + summary_last_dropout (`float`, *optional*, defaults to 0.0): + Argument used when doing sequence summary. Used in the sequence classification and multiple choice models. + + The dropout ratio to be used after the projection and activation. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For + positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to + [Self-Attention with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155). + For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models + with Better Relative Position Embeddings (Huang et al.)](https://huggingface.co/papers/2009.13658). + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + classifier_dropout (`float`, *optional*): + The dropout ratio for the classification head. 
+ + Examples: + + ```python + >>> from transformers import ElectraConfig, ElectraModel + + >>> # Initializing a ELECTRA electra-base-uncased style configuration + >>> configuration = ElectraConfig() + + >>> # Initializing a model (with random weights) from the electra-base-uncased style configuration + >>> model = ElectraModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "electra" + + def __init__( + self, + vocab_size=30522, + embedding_size=128, + hidden_size=256, + num_hidden_layers=12, + num_attention_heads=4, + intermediate_size=1024, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + summary_type="first", + summary_use_proj=True, + summary_activation="gelu", + summary_last_dropout=0.1, + pad_token_id=0, + position_embedding_type="absolute", + use_cache=True, + classifier_dropout=None, + **kwargs, + ): + super().__init__(pad_token_id=pad_token_id, **kwargs) + + self.vocab_size = vocab_size + self.embedding_size = embedding_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + + self.summary_type = summary_type + self.summary_use_proj = summary_use_proj + self.summary_activation = summary_activation + self.summary_last_dropout = summary_last_dropout + self.position_embedding_type = position_embedding_type + self.use_cache = use_cache + self.classifier_dropout = classifier_dropout + + +class ElectraOnnxConfig(OnnxConfig): + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + if self.task == "multiple-choice": + dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"} + else: + dynamic_axis = {0: "batch", 1: "sequence"} + return OrderedDict( + [ + ("input_ids", dynamic_axis), + ("attention_mask", dynamic_axis), + ("token_type_ids", dynamic_axis), + ] + ) + + +__all__ = ["ElectraConfig", "ElectraOnnxConfig"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/electra/modeling_electra.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/electra/modeling_electra.py new file mode 100644 index 0000000000000000000000000000000000000000..a10b0b6583374d8b620fb58b46d6e8f0ad469c1e --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/electra/modeling_electra.py @@ -0,0 +1,1586 @@ +# coding=utf-8 +# Copyright 2019 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""PyTorch ELECTRA model.""" + +import math +import os +from dataclasses import dataclass +from typing import Callable, Optional, Union + +import torch +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ...activations import ACT2FN, get_activation +from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache +from ...generation import GenerationMixin +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import ( + BaseModelOutputWithCrossAttentions, + BaseModelOutputWithPastAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + MaskedLMOutput, + MultipleChoiceModelOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import ModelOutput, auto_docstring, logging +from ...utils.deprecation import deprecate_kwarg +from .configuration_electra import ElectraConfig + + +logger = logging.get_logger(__name__) + + +def load_tf_weights_in_electra(model, config, tf_checkpoint_path, discriminator_or_generator="discriminator"): + """Load tf checkpoints in a pytorch model.""" + try: + import re + + import numpy as np + import tensorflow as tf + except ImportError: + logger.error( + "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions." + ) + raise + tf_path = os.path.abspath(tf_checkpoint_path) + logger.info(f"Converting TensorFlow checkpoint from {tf_path}") + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + names = [] + arrays = [] + for name, shape in init_vars: + logger.info(f"Loading TF weight {name} with shape {shape}") + array = tf.train.load_variable(tf_path, name) + names.append(name) + arrays.append(array) + for name, array in zip(names, arrays): + original_name: str = name + + try: + if isinstance(model, ElectraForMaskedLM): + name = name.replace("electra/embeddings/", "generator/embeddings/") + + if discriminator_or_generator == "generator": + name = name.replace("electra/", "discriminator/") + name = name.replace("generator/", "electra/") + + name = name.replace("dense_1", "dense_prediction") + name = name.replace("generator_predictions/output_bias", "generator_lm_head/bias") + + name = name.split("/") + # print(original_name, name) + # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v + # which are not required for using pretrained model + if any(n in ["global_step", "temperature"] for n in name): + logger.info(f"Skipping {original_name}") + continue + pointer = model + for m_name in name: + if re.fullmatch(r"[A-Za-z]+_\d+", m_name): + scope_names = re.split(r"_(\d+)", m_name) + else: + scope_names = [m_name] + if scope_names[0] == "kernel" or scope_names[0] == "gamma": + pointer = getattr(pointer, "weight") + elif scope_names[0] == "output_bias" or scope_names[0] == "beta": + pointer = getattr(pointer, "bias") + elif scope_names[0] == "output_weights": + pointer = getattr(pointer, "weight") + elif scope_names[0] == "squad": + pointer = getattr(pointer, "classifier") + else: + pointer = getattr(pointer, scope_names[0]) + if len(scope_names) >= 2: + num = int(scope_names[1]) + pointer = pointer[num] + if m_name.endswith("_embeddings"): + pointer = getattr(pointer, "weight") + elif m_name == "kernel": + array = 
np.transpose(array) + try: + if pointer.shape != array.shape: + raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched") + except ValueError as e: + e.args += (pointer.shape, array.shape) + raise + print(f"Initialize PyTorch weight {name}", original_name) + pointer.data = torch.from_numpy(array) + except AttributeError as e: + print(f"Skipping {original_name}", name, e) + continue + return model + + +class ElectraEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.embedding_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.embedding_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.embedding_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.embedding_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + self.register_buffer( + "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False + ) + + # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.forward + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + past_key_values_length: int = 0, + ) -> torch.Tensor: + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] + + # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs + # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves + # issue #5664 + if token_type_ids is None: + if hasattr(self, "token_type_ids"): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +# Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->Electra +class ElectraSelfAttention(nn.Module): + def __init__(self, config, position_embedding_type=None, layer_idx=None): + 
super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = position_embedding_type or getattr( + config, "position_embedding_type", "absolute" + ) + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + + self.is_decoder = config.is_decoder + self.layer_idx = layer_idx + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + cache_position: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor]: + batch_size, seq_length, _ = hidden_states.shape + query_layer = self.query(hidden_states) + query_layer = query_layer.view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose( + 1, 2 + ) + + is_updated = False + is_cross_attention = encoder_hidden_states is not None + if past_key_values is not None: + if isinstance(past_key_values, EncoderDecoderCache): + is_updated = past_key_values.is_updated.get(self.layer_idx) + if is_cross_attention: + # after the first generated id, we can subsequently re-use all key/value_layer from cache + curr_past_key_value = past_key_values.cross_attention_cache + else: + curr_past_key_value = past_key_values.self_attention_cache + else: + curr_past_key_value = past_key_values + + current_states = encoder_hidden_states if is_cross_attention else hidden_states + if is_cross_attention and past_key_values is not None and is_updated: + # reuse k,v, cross_attentions + key_layer = curr_past_key_value.layers[self.layer_idx].keys + value_layer = curr_past_key_value.layers[self.layer_idx].values + else: + key_layer = self.key(current_states) + key_layer = key_layer.view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose( + 1, 2 + ) + value_layer = self.value(current_states) + value_layer = value_layer.view( + batch_size, -1, self.num_attention_heads, self.attention_head_size + ).transpose(1, 2) + + if past_key_values is not None: + # save all key/value_layer to cache to be re-used for fast auto-regressive generation + cache_position = cache_position if not is_cross_attention else None + key_layer, value_layer = curr_past_key_value.update( + key_layer, value_layer, self.layer_idx, {"cache_position": cache_position} + ) + # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls + if is_cross_attention and 
isinstance(past_key_values, EncoderDecoderCache): + past_key_values.is_updated[self.layer_idx] = True + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + query_length, key_length = query_layer.shape[2], key_layer.shape[2] + if past_key_values is not None: + position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view( + -1, 1 + ) + else: + position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in ElectraModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + return context_layer, attention_probs + + +# Copied from transformers.models.bert.modeling_bert.BertSelfOutput +class ElectraSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +ELECTRA_SELF_ATTENTION_CLASSES = { + "eager": ElectraSelfAttention, +} + + +# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->Electra,BERT->ELECTRA +class ElectraAttention(nn.Module): + def __init__(self, config, position_embedding_type=None, layer_idx=None): + super().__init__() + self.self = ELECTRA_SELF_ATTENTION_CLASSES[config._attn_implementation]( + config, + position_embedding_type=position_embedding_type, + layer_idx=layer_idx, + ) + self.output = ElectraSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + cache_position: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor]: + self_outputs = self.self( + hidden_states, + attention_mask=attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + past_key_values=past_key_values, + output_attentions=output_attentions, + cache_position=cache_position, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.bert.modeling_bert.BertIntermediate +class ElectraIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if 
isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertOutput +class ElectraOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->Electra +class ElectraLayer(GradientCheckpointingLayer): + def __init__(self, config, layer_idx=None): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = ElectraAttention(config, layer_idx=layer_idx) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise ValueError(f"{self} should be used as a decoder model if cross attention is added") + self.crossattention = ElectraAttention(config, position_embedding_type="absolute", layer_idx=layer_idx) + self.intermediate = ElectraIntermediate(config) + self.output = ElectraOutput(config) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + cache_position: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor]: + self_attention_outputs = self.attention( + hidden_states, + attention_mask=attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + past_key_values=past_key_values, + cache_position=cache_position, + ) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers" + " by setting `config.add_cross_attention=True`" + ) + + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask=encoder_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + past_key_values=past_key_values, + output_attentions=output_attentions, + cache_position=cache_position, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:] # add cross attentions if we output attention weights + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + outputs = (layer_output,) + 
outputs + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +# Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->Electra +class ElectraEncoder(nn.Module): + def __init__(self, config, layer_idx=None): + super().__init__() + self.config = config + self.layer = nn.ModuleList([ElectraLayer(config, layer_idx=i) for i in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Cache] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + return_dict: Optional[bool] = True, + cache_position: Optional[torch.Tensor] = None, + ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + if use_cache and self.config.is_decoder and past_key_values is None: + past_key_values = EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config)) + + if use_cache and self.config.is_decoder and isinstance(past_key_values, tuple): + logger.warning_once( + "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. " + "You should pass an instance of `EncoderDecoderCache` instead, e.g. " + "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`." 
+ ) + past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values) + + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, # as a positional argument for gradient checkpointing + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + output_attentions=output_attentions, + cache_position=cache_position, + ) + + hidden_states = layer_outputs[0] + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + past_key_values, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=past_key_values, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +class ElectraDiscriminatorPredictions(nn.Module): + """Prediction module for the discriminator, made up of two dense layers.""" + + def __init__(self, config): + super().__init__() + + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = get_activation(config.hidden_act) + self.dense_prediction = nn.Linear(config.hidden_size, 1) + self.config = config + + def forward(self, discriminator_hidden_states): + hidden_states = self.dense(discriminator_hidden_states) + hidden_states = self.activation(hidden_states) + logits = self.dense_prediction(hidden_states).squeeze(-1) + + return logits + + +class ElectraGeneratorPredictions(nn.Module): + """Prediction module for the generator, made up of two dense layers.""" + + def __init__(self, config): + super().__init__() + + self.activation = get_activation("gelu") + self.LayerNorm = nn.LayerNorm(config.embedding_size, eps=config.layer_norm_eps) + self.dense = nn.Linear(config.hidden_size, config.embedding_size) + + def forward(self, generator_hidden_states): + hidden_states = self.dense(generator_hidden_states) + hidden_states = self.activation(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + + return hidden_states + + +@auto_docstring +class ElectraPreTrainedModel(PreTrainedModel): + config: ElectraConfig + load_tf_weights = load_tf_weights_in_electra + base_model_prefix = "electra" + supports_gradient_checkpointing = True + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +@dataclass +@auto_docstring( + custom_intro=""" + Output type of 
[`ElectraForPreTraining`]. + """ +) +class ElectraForPreTrainingOutput(ModelOutput): + r""" + loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): + Total loss of the ELECTRA objective. + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): + Prediction scores of the head (scores for each token before SoftMax). + """ + + loss: Optional[torch.FloatTensor] = None + logits: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + + +@auto_docstring +class ElectraModel(ElectraPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.embeddings = ElectraEmbeddings(config) + + if config.embedding_size != config.hidden_size: + self.embeddings_project = nn.Linear(config.embedding_size, config.hidden_size) + + self.encoder = ElectraEncoder(config) + self.config = config + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], BaseModelOutputWithCrossAttentions]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + past_key_values_length = 0 + if past_key_values is not None: + past_key_values_length = ( + past_key_values[0][0].shape[-2] + if not isinstance(past_key_values, Cache) + else past_key_values.get_seq_length() + ) + + if attention_mask is None: + attention_mask = torch.ones(input_shape, device=device) + if token_type_ids is None: + if hasattr(self.embeddings, "token_type_ids"): + buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] + 
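+ # Clarifying note (not part of the upstream file): the registered all-zero "token_type_ids" buffer is sliced to + # the current sequence length and broadcast across the batch, so callers that omit token_type_ids still get + # zero segment ids without allocating a new tensor on every forward pass.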
buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + hidden_states = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + + if hasattr(self, "embeddings_project"): + hidden_states = self.embeddings_project(hidden_states) + + hidden_states = self.encoder( + hidden_states, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + return hidden_states + + +class ElectraClassificationHead(nn.Module): + """Head for sentence-level classification tasks.""" + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.activation = get_activation("gelu") + self.dropout = nn.Dropout(classifier_dropout) + self.out_proj = nn.Linear(config.hidden_size, config.num_labels) + + def forward(self, features, **kwargs): + x = features[:, 0, :] # take <s> token (equiv. to [CLS]) + x = self.dropout(x) + x = self.dense(x) + x = self.activation(x) # although BERT uses tanh here, it seems Electra authors used gelu here + x = self.dropout(x) + x = self.out_proj(x) + return x + + +# Copied from transformers.models.xlm.modeling_xlm.XLMSequenceSummary with XLM->Electra +class ElectraSequenceSummary(nn.Module): + r""" + Compute a single vector summary of a sequence hidden states. + + Args: + config ([`ElectraConfig`]): + The config used by the model. Relevant arguments in the config class of the model are (refer to the actual + config class of your model for the default values it uses): + + - **summary_type** (`str`) -- The method to use to make this summary. Accepted values are: + + - `"last"` -- Take the last token hidden state (like XLNet) + - `"first"` -- Take the first token hidden state (like Bert) + - `"mean"` -- Take the mean of all tokens hidden states + - `"cls_index"` -- Supply a Tensor of classification token position (GPT/GPT-2) + - `"attn"` -- Not implemented now, use multi-head attention + + - **summary_use_proj** (`bool`) -- Add a projection after the vector extraction.
+ - **summary_proj_to_labels** (`bool`) -- If `True`, the projection outputs to `config.num_labels` classes + (otherwise to `config.hidden_size`). + - **summary_activation** (`Optional[str]`) -- Set to `"tanh"` to add a tanh activation to the output, + another string or `None` will add no activation. + - **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and activation. + - **summary_last_dropout** (`float`)-- Optional dropout probability after the projection and activation. + """ + + def __init__(self, config: ElectraConfig): + super().__init__() + + self.summary_type = getattr(config, "summary_type", "last") + if self.summary_type == "attn": + # We should use a standard multi-head attention module with absolute positional embedding for that. + # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276 + # We can probably just use the multi-head attention module of PyTorch >=1.1.0 + raise NotImplementedError + + self.summary = nn.Identity() + if hasattr(config, "summary_use_proj") and config.summary_use_proj: + if hasattr(config, "summary_proj_to_labels") and config.summary_proj_to_labels and config.num_labels > 0: + num_classes = config.num_labels + else: + num_classes = config.hidden_size + self.summary = nn.Linear(config.hidden_size, num_classes) + + activation_string = getattr(config, "summary_activation", None) + self.activation: Callable = get_activation(activation_string) if activation_string else nn.Identity() + + self.first_dropout = nn.Identity() + if hasattr(config, "summary_first_dropout") and config.summary_first_dropout > 0: + self.first_dropout = nn.Dropout(config.summary_first_dropout) + + self.last_dropout = nn.Identity() + if hasattr(config, "summary_last_dropout") and config.summary_last_dropout > 0: + self.last_dropout = nn.Dropout(config.summary_last_dropout) + + def forward( + self, hidden_states: torch.FloatTensor, cls_index: Optional[torch.LongTensor] = None + ) -> torch.FloatTensor: + """ + Compute a single vector summary of a sequence hidden states. + + Args: + hidden_states (`torch.FloatTensor` of shape `[batch_size, seq_len, hidden_size]`): + The hidden states of the last layer. + cls_index (`torch.LongTensor` of shape `[batch_size]` or `[batch_size, ...]` where ... are optional leading dimensions of `hidden_states`, *optional*): + Used if `summary_type == "cls_index"` and takes the last token of the sequence as classification token. + + Returns: + `torch.FloatTensor`: The summary of the sequence hidden states. 
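+ + Example (an illustrative sketch, not part of the upstream file; it assumes the default `ElectraConfig` + summary settings and imports this internal module directly from its defining file): + + ```python + >>> import torch + >>> from transformers import ElectraConfig + >>> from transformers.models.electra.modeling_electra import ElectraSequenceSummary + + >>> config = ElectraConfig() # summary_type="first", summary_use_proj=True by default + >>> summary = ElectraSequenceSummary(config) + >>> hidden_states = torch.randn(2, 16, config.hidden_size) + >>> summary_vector = summary(hidden_states) # one summary vector per sequence: (2, config.hidden_size) + ```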
+ """ + if self.summary_type == "last": + output = hidden_states[:, -1] + elif self.summary_type == "first": + output = hidden_states[:, 0] + elif self.summary_type == "mean": + output = hidden_states.mean(dim=1) + elif self.summary_type == "cls_index": + if cls_index is None: + cls_index = torch.full_like( + hidden_states[..., :1, :], + hidden_states.shape[-2] - 1, + dtype=torch.long, + ) + else: + cls_index = cls_index.unsqueeze(-1).unsqueeze(-1) + cls_index = cls_index.expand((-1,) * (cls_index.dim() - 1) + (hidden_states.size(-1),)) + # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states + output = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, XX, hidden_size) + elif self.summary_type == "attn": + raise NotImplementedError + + output = self.first_dropout(output) + output = self.summary(output) + output = self.activation(output) + output = self.last_dropout(output) + + return output + + +@auto_docstring( + custom_intro=""" + ELECTRA Model transformer with a sequence classification/regression head on top (a linear layer on top of the + pooled output) e.g. for GLUE tasks. + """ +) +class ElectraForSequenceClassification(ElectraPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + self.electra = ElectraModel(config) + self.classifier = ElectraClassificationHead(config) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], SequenceClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
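+ + Example (an illustrative usage sketch, not part of the upstream file; it uses the public + `google/electra-small-discriminator` checkpoint with an arbitrarily chosen `num_labels`): + + ```python + >>> import torch + >>> from transformers import AutoTokenizer, ElectraForSequenceClassification + + >>> tokenizer = AutoTokenizer.from_pretrained("google/electra-small-discriminator") + >>> model = ElectraForSequenceClassification.from_pretrained("google/electra-small-discriminator", num_labels=2) + + >>> inputs = tokenizer("ELECTRA replaces masked language modeling with replaced token detection.", return_tensors="pt") + >>> with torch.no_grad(): + ... logits = model(**inputs).logits # shape: (batch_size, num_labels) + ```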
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + discriminator_hidden_states = self.electra( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = discriminator_hidden_states[0] + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + discriminator_hidden_states[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=discriminator_hidden_states.hidden_states, + attentions=discriminator_hidden_states.attentions, + ) + + +@auto_docstring( + custom_intro=""" + Electra model with a binary classification head on top as used during pretraining for identifying generated tokens. + + It is recommended to load the discriminator checkpoint into that model. + """ +) +class ElectraForPreTraining(ElectraPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.electra = ElectraModel(config) + self.discriminator_predictions = ElectraDiscriminatorPredictions(config) + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], ElectraForPreTrainingOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the ELECTRA loss. Input should be a sequence of tokens (see `input_ids` docstring) + Indices should be in `[0, 1]`: + + - 0 indicates the token is an original token, + - 1 indicates the token was replaced. 
+ + Examples: + + ```python + >>> from transformers import ElectraForPreTraining, AutoTokenizer + >>> import torch + + >>> discriminator = ElectraForPreTraining.from_pretrained("google/electra-base-discriminator") + >>> tokenizer = AutoTokenizer.from_pretrained("google/electra-base-discriminator") + + >>> sentence = "The quick brown fox jumps over the lazy dog" + >>> fake_sentence = "The quick brown fox fake over the lazy dog" + + >>> fake_tokens = tokenizer.tokenize(fake_sentence, add_special_tokens=True) + >>> fake_inputs = tokenizer.encode(fake_sentence, return_tensors="pt") + >>> discriminator_outputs = discriminator(fake_inputs) + >>> predictions = torch.round((torch.sign(discriminator_outputs[0]) + 1) / 2) + + >>> fake_tokens + ['[CLS]', 'the', 'quick', 'brown', 'fox', 'fake', 'over', 'the', 'lazy', 'dog', '[SEP]'] + + >>> predictions.squeeze().tolist() + [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0] + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + discriminator_hidden_states = self.electra( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + discriminator_sequence_output = discriminator_hidden_states[0] + + logits = self.discriminator_predictions(discriminator_sequence_output) + + loss = None + if labels is not None: + loss_fct = nn.BCEWithLogitsLoss() + if attention_mask is not None: + active_loss = attention_mask.view(-1, discriminator_sequence_output.shape[1]) == 1 + active_logits = logits.view(-1, discriminator_sequence_output.shape[1])[active_loss] + active_labels = labels[active_loss] + loss = loss_fct(active_logits, active_labels.float()) + else: + loss = loss_fct(logits.view(-1, discriminator_sequence_output.shape[1]), labels.float()) + + if not return_dict: + output = (logits,) + discriminator_hidden_states[1:] + return ((loss,) + output) if loss is not None else output + + return ElectraForPreTrainingOutput( + loss=loss, + logits=logits, + hidden_states=discriminator_hidden_states.hidden_states, + attentions=discriminator_hidden_states.attentions, + ) + + +@auto_docstring( + custom_intro=""" + Electra model with a language modeling head on top. + + Even though both the discriminator and generator may be loaded into this model, the generator is the only model of + the two to have been trained for the masked language modeling task. 
+ """ +) +class ElectraForMaskedLM(ElectraPreTrainedModel): + _tied_weights_keys = ["generator_lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + + self.electra = ElectraModel(config) + self.generator_predictions = ElectraGeneratorPredictions(config) + + self.generator_lm_head = nn.Linear(config.embedding_size, config.vocab_size) + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.generator_lm_head + + def set_output_embeddings(self, word_embeddings): + self.generator_lm_head = word_embeddings + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], MaskedLMOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + generator_hidden_states = self.electra( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + generator_sequence_output = generator_hidden_states[0] + + prediction_scores = self.generator_predictions(generator_sequence_output) + prediction_scores = self.generator_lm_head(prediction_scores) + + loss = None + # Masked language modeling softmax layer + if labels is not None: + loss_fct = nn.CrossEntropyLoss() # -100 index = padding token + loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (prediction_scores,) + generator_hidden_states[1:] + return ((loss,) + output) if loss is not None else output + + return MaskedLMOutput( + loss=loss, + logits=prediction_scores, + hidden_states=generator_hidden_states.hidden_states, + attentions=generator_hidden_states.attentions, + ) + + +@auto_docstring( + custom_intro=""" + Electra model with a token classification head on top. + + Both the discriminator and generator may be loaded into this model. 
+ """ +) +class ElectraForTokenClassification(ElectraPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.electra = ElectraModel(config) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], TokenClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + discriminator_hidden_states = self.electra( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + discriminator_sequence_output = discriminator_hidden_states[0] + + discriminator_sequence_output = self.dropout(discriminator_sequence_output) + logits = self.classifier(discriminator_sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + discriminator_hidden_states[1:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=discriminator_hidden_states.hidden_states, + attentions=discriminator_hidden_states.attentions, + ) + + +@auto_docstring +class ElectraForQuestionAnswering(ElectraPreTrainedModel): + config: ElectraConfig + base_model_prefix = "electra" + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.electra = ElectraModel(config) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + start_positions: Optional[torch.Tensor] = None, + end_positions: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], QuestionAnsweringModelOutput]: + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + discriminator_hidden_states = self.electra( + input_ids, + 
attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + + sequence_output = discriminator_hidden_states[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = ( + start_logits, + end_logits, + ) + discriminator_hidden_states[1:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=discriminator_hidden_states.hidden_states, + attentions=discriminator_hidden_states.attentions, + ) + + +@auto_docstring +class ElectraForMultipleChoice(ElectraPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.electra = ElectraModel(config) + self.sequence_summary = ElectraSequenceSummary(config) + self.classifier = nn.Linear(config.hidden_size, 1) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], MultipleChoiceModelOutput]: + r""" + input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. 
+ + [What are position IDs?](../glossary#position-ids) + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., + num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See + `input_ids` above) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] + + input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None + attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None + inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) + if inputs_embeds is not None + else None + ) + + discriminator_hidden_states = self.electra( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = discriminator_hidden_states[0] + + pooled_output = self.sequence_summary(sequence_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.view(-1, num_choices) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + + if not return_dict: + output = (reshaped_logits,) + discriminator_hidden_states[1:] + return ((loss,) + output) if loss is not None else output + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=discriminator_hidden_states.hidden_states, + attentions=discriminator_hidden_states.attentions, + ) + + +@auto_docstring( + custom_intro=""" + ELECTRA Model with a `language modeling` head on top for CLM fine-tuning. 
+ """ +) +class ElectraForCausalLM(ElectraPreTrainedModel, GenerationMixin): + _tied_weights_keys = ["generator_lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + + if not config.is_decoder: + logger.warning("If you want to use `ElectraForCausalLM` as a standalone, add `is_decoder=True.`") + + self.electra = ElectraModel(config) + self.generator_predictions = ElectraGeneratorPredictions(config) + self.generator_lm_head = nn.Linear(config.embedding_size, config.vocab_size) + + self.init_weights() + + def get_output_embeddings(self): + return self.generator_lm_head + + def set_output_embeddings(self, new_embeddings): + self.generator_lm_head = new_embeddings + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ) -> Union[tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in + `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are + ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + + Example: + + ```python + >>> from transformers import AutoTokenizer, ElectraForCausalLM, ElectraConfig + >>> import torch + + >>> tokenizer = AutoTokenizer.from_pretrained("google/electra-base-generator") + >>> config = ElectraConfig.from_pretrained("google/electra-base-generator") + >>> config.is_decoder = True + >>> model = ElectraForCausalLM.from_pretrained("google/electra-base-generator", config=config) + + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> prediction_logits = outputs.logits + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + outputs = self.electra( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.generator_lm_head(self.generator_predictions(sequence_output)) + + lm_loss = None + if labels is not None: + lm_loss = self.loss_function( + prediction_scores, + labels, + vocab_size=self.config.vocab_size, + **kwargs, + ) + + if not return_dict: + output = (prediction_scores,) + outputs[1:] + return ((lm_loss,) + output) if lm_loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=lm_loss, + logits=prediction_scores, + 
past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + +__all__ = [ + "ElectraForCausalLM", + "ElectraForMaskedLM", + "ElectraForMultipleChoice", + "ElectraForPreTraining", + "ElectraForQuestionAnswering", + "ElectraForSequenceClassification", + "ElectraForTokenClassification", + "ElectraModel", + "ElectraPreTrainedModel", + "load_tf_weights_in_electra", +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/electra/modeling_flax_electra.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/electra/modeling_flax_electra.py new file mode 100644 index 0000000000000000000000000000000000000000..14d845476d62f9defb2de4392742037762fb959f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/electra/modeling_flax_electra.py @@ -0,0 +1,1614 @@ +# coding=utf-8 +# Copyright 2021 The Google Flax Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Callable, Optional + +import flax +import flax.linen as nn +import jax +import jax.numpy as jnp +import numpy as np +from flax.core.frozen_dict import FrozenDict, freeze, unfreeze +from flax.linen import combine_masks, make_causal_mask +from flax.linen import partitioning as nn_partitioning +from flax.linen.attention import dot_product_attention_weights +from flax.traverse_util import flatten_dict, unflatten_dict +from jax import lax + +from ...modeling_flax_outputs import ( + FlaxBaseModelOutput, + FlaxBaseModelOutputWithPastAndCrossAttentions, + FlaxCausalLMOutputWithCrossAttentions, + FlaxMaskedLMOutput, + FlaxMultipleChoiceModelOutput, + FlaxQuestionAnsweringModelOutput, + FlaxSequenceClassifierOutput, + FlaxTokenClassifierOutput, +) +from ...modeling_flax_utils import ( + ACT2FN, + FlaxPreTrainedModel, + append_call_sample_docstring, + append_replace_return_docstrings, + overwrite_call_docstring, +) +from ...utils import ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward, logging +from .configuration_electra import ElectraConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "google/electra-small-discriminator" +_CONFIG_FOR_DOC = "ElectraConfig" + +remat = nn_partitioning.remat + + +@flax.struct.dataclass +class FlaxElectraForPreTrainingOutput(ModelOutput): + """ + Output type of [`ElectraForPreTraining`]. + + Args: + logits (`jnp.ndarray` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. 
+ + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + logits: jnp.ndarray = None + hidden_states: Optional[tuple[jnp.ndarray]] = None + attentions: Optional[tuple[jnp.ndarray]] = None + + +ELECTRA_START_DOCSTRING = r""" + + This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading, saving and converting weights from PyTorch models) + + This model is also a Flax Linen + [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a + regular Flax Module and refer to the Flax documentation for all matter related to general usage and behavior. + + Finally, this model supports inherent JAX features such as: + + - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit) + - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation) + - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap) + - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap) + + Parameters: + config ([`ElectraConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +ELECTRA_INPUTS_DOCSTRING = r""" + Args: + input_ids (`numpy.ndarray` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`numpy.ndarray` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + token_type_ids (`numpy.ndarray` of shape `({0})`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`numpy.ndarray` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + head_mask (`numpy.ndarray` of shape `({0})`, `optional): + Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
+ +""" + + +class FlaxElectraEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + config: ElectraConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.word_embeddings = nn.Embed( + self.config.vocab_size, + self.config.embedding_size, + embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range), + ) + self.position_embeddings = nn.Embed( + self.config.max_position_embeddings, + self.config.embedding_size, + embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range), + ) + self.token_type_embeddings = nn.Embed( + self.config.type_vocab_size, + self.config.embedding_size, + embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range), + ) + self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype) + self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob) + + # Copied from transformers.models.bert.modeling_flax_bert.FlaxBertEmbeddings.__call__ + def __call__(self, input_ids, token_type_ids, position_ids, attention_mask, deterministic: bool = True): + # Embed + inputs_embeds = self.word_embeddings(input_ids.astype("i4")) + position_embeds = self.position_embeddings(position_ids.astype("i4")) + token_type_embeddings = self.token_type_embeddings(token_type_ids.astype("i4")) + + # Sum all embeddings + hidden_states = inputs_embeds + token_type_embeddings + position_embeds + + # Layer Norm + hidden_states = self.LayerNorm(hidden_states) + hidden_states = self.dropout(hidden_states, deterministic=deterministic) + return hidden_states + + +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertSelfAttention with Bert->Electra +class FlaxElectraSelfAttention(nn.Module): + config: ElectraConfig + causal: bool = False + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.head_dim = self.config.hidden_size // self.config.num_attention_heads + if self.config.hidden_size % self.config.num_attention_heads != 0: + raise ValueError( + f"`config.hidden_size`: {self.config.hidden_size} has to be a multiple of `config.num_attention_heads` " + f" : {self.config.num_attention_heads}" + ) + + self.query = nn.Dense( + self.config.hidden_size, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + ) + self.key = nn.Dense( + self.config.hidden_size, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + ) + self.value = nn.Dense( + self.config.hidden_size, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + ) + + if self.causal: + self.causal_mask = make_causal_mask( + jnp.ones((1, self.config.max_position_embeddings), dtype="bool"), dtype="bool" + ) + + def _split_heads(self, hidden_states): + return hidden_states.reshape(hidden_states.shape[:2] + (self.config.num_attention_heads, self.head_dim)) + + def _merge_heads(self, hidden_states): + return hidden_states.reshape(hidden_states.shape[:2] + (self.config.hidden_size,)) + + @nn.compact + # Copied from transformers.models.bart.modeling_flax_bart.FlaxBartAttention._concatenate_to_cache + def _concatenate_to_cache(self, key, value, query, attention_mask): + """ + This function takes projected key, value states from a single input token and concatenates the states to cached + states from previous steps.
This function is slightly adapted from the official Flax repository: + https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252 + """ + # detect if we're initializing by absence of existing cache data. + is_initialized = self.has_variable("cache", "cached_key") + cached_key = self.variable("cache", "cached_key", jnp.zeros, key.shape, key.dtype) + cached_value = self.variable("cache", "cached_value", jnp.zeros, value.shape, value.dtype) + cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32)) + + if is_initialized: + *batch_dims, max_length, num_heads, depth_per_head = cached_key.value.shape + # update key, value caches with our new 1d spatial slices + cur_index = cache_index.value + indices = (0,) * len(batch_dims) + (cur_index, 0, 0) + key = lax.dynamic_update_slice(cached_key.value, key, indices) + value = lax.dynamic_update_slice(cached_value.value, value, indices) + cached_key.value = key + cached_value.value = value + num_updated_cache_vectors = query.shape[1] + cache_index.value = cache_index.value + num_updated_cache_vectors + # causal mask for cached decoder self-attention: our single query position should only attend to those key positions that have already been generated and cached, not the remaining zero elements. + pad_mask = jnp.broadcast_to( + jnp.arange(max_length) < cur_index + num_updated_cache_vectors, + tuple(batch_dims) + (1, num_updated_cache_vectors, max_length), + ) + attention_mask = combine_masks(pad_mask, attention_mask) + return key, value, attention_mask + + def __call__( + self, + hidden_states, + attention_mask, + layer_head_mask, + key_value_states: Optional[jnp.ndarray] = None, + init_cache: bool = False, + deterministic=True, + output_attentions: bool = False, + ): + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + batch_size = hidden_states.shape[0] + + # get query proj + query_states = self.query(hidden_states) + # get key, value proj + if is_cross_attention: + # cross_attentions + key_states = self.key(key_value_states) + value_states = self.value(key_value_states) + else: + # self_attention + key_states = self.key(hidden_states) + value_states = self.value(hidden_states) + + query_states = self._split_heads(query_states) + key_states = self._split_heads(key_states) + value_states = self._split_heads(value_states) + + # handle cache prepare causal attention mask + if self.causal: + query_length, key_length = query_states.shape[1], key_states.shape[1] + if self.has_variable("cache", "cached_key"): + mask_shift = self.variables["cache"]["cache_index"] + max_decoder_length = self.variables["cache"]["cached_key"].shape[1] + causal_mask = lax.dynamic_slice( + self.causal_mask, (0, 0, mask_shift, 0), (1, 1, query_length, max_decoder_length) + ) + else: + causal_mask = self.causal_mask[:, :, :query_length, :key_length] + causal_mask = jnp.broadcast_to(causal_mask, (batch_size,) + causal_mask.shape[1:]) + + # combine masks if needed + if attention_mask is not None and self.causal: + attention_mask = jnp.broadcast_to(jnp.expand_dims(attention_mask, axis=(-3, -2)), causal_mask.shape) + attention_mask = combine_masks(attention_mask, causal_mask) + elif self.causal: + attention_mask = causal_mask + elif attention_mask is not None: + attention_mask = jnp.expand_dims(attention_mask, axis=(-3, -2)) + + # During fast autoregressive decoding, we feed one position at a time, + # and 
cache the keys and values step by step. + if self.causal and (self.has_variable("cache", "cached_key") or init_cache): + key_states, value_states, attention_mask = self._concatenate_to_cache( + key_states, value_states, query_states, attention_mask + ) + + # Convert the boolean attention mask to an attention bias. + if attention_mask is not None: + # attention mask in the form of attention bias + attention_bias = lax.select( + attention_mask > 0, + jnp.full(attention_mask.shape, 0.0).astype(self.dtype), + jnp.full(attention_mask.shape, jnp.finfo(self.dtype).min).astype(self.dtype), + ) + else: + attention_bias = None + + dropout_rng = None + if not deterministic and self.config.attention_probs_dropout_prob > 0.0: + dropout_rng = self.make_rng("dropout") + + attn_weights = dot_product_attention_weights( + query_states, + key_states, + bias=attention_bias, + dropout_rng=dropout_rng, + dropout_rate=self.config.attention_probs_dropout_prob, + broadcast_dropout=True, + deterministic=deterministic, + dtype=self.dtype, + precision=None, + ) + + # Mask heads if we want to + if layer_head_mask is not None: + attn_weights = jnp.einsum("...hqk,h->...hqk", attn_weights, layer_head_mask) + + attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value_states) + attn_output = attn_output.reshape(attn_output.shape[:2] + (-1,)) + + outputs = (attn_output, attn_weights) if output_attentions else (attn_output,) + return outputs + + +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertSelfOutput with Bert->Electra +class FlaxElectraSelfOutput(nn.Module): + config: ElectraConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.dense = nn.Dense( + self.config.hidden_size, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + dtype=self.dtype, + ) + self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype) + self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob) + + def __call__(self, hidden_states, input_tensor, deterministic: bool = True): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states, deterministic=deterministic) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertAttention with Bert->Electra +class FlaxElectraAttention(nn.Module): + config: ElectraConfig + causal: bool = False + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.self = FlaxElectraSelfAttention(self.config, causal=self.causal, dtype=self.dtype) + self.output = FlaxElectraSelfOutput(self.config, dtype=self.dtype) + + def __call__( + self, + hidden_states, + attention_mask, + layer_head_mask, + key_value_states=None, + init_cache=False, + deterministic=True, + output_attentions: bool = False, + ): + # Attention mask comes in as attention_mask.shape == (*batch_sizes, kv_length) + # FLAX expects: attention_mask.shape == (*batch_sizes, 1, 1, kv_length) such that it is broadcastable + # with attn_weights.shape == (*batch_sizes, num_heads, q_length, kv_length) + attn_outputs = self.self( + hidden_states, + attention_mask, + layer_head_mask=layer_head_mask, + key_value_states=key_value_states, + init_cache=init_cache, + deterministic=deterministic, + output_attentions=output_attentions, + ) + attn_output = attn_outputs[0] + hidden_states = self.output(attn_output, hidden_states, deterministic=deterministic) + + outputs = (hidden_states,) + + if output_attentions: + outputs += 
(attn_outputs[1],) + + return outputs + + +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertIntermediate with Bert->Electra +class FlaxElectraIntermediate(nn.Module): + config: ElectraConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.dense = nn.Dense( + self.config.intermediate_size, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + dtype=self.dtype, + ) + self.activation = ACT2FN[self.config.hidden_act] + + def __call__(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.activation(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertOutput with Bert->Electra +class FlaxElectraOutput(nn.Module): + config: ElectraConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.dense = nn.Dense( + self.config.hidden_size, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + dtype=self.dtype, + ) + self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob) + self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype) + + def __call__(self, hidden_states, attention_output, deterministic: bool = True): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states, deterministic=deterministic) + hidden_states = self.LayerNorm(hidden_states + attention_output) + return hidden_states + + +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertLayer with Bert->Electra +class FlaxElectraLayer(nn.Module): + config: ElectraConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.attention = FlaxElectraAttention(self.config, causal=self.config.is_decoder, dtype=self.dtype) + self.intermediate = FlaxElectraIntermediate(self.config, dtype=self.dtype) + self.output = FlaxElectraOutput(self.config, dtype=self.dtype) + if self.config.add_cross_attention: + self.crossattention = FlaxElectraAttention(self.config, causal=False, dtype=self.dtype) + + def __call__( + self, + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states: Optional[jnp.ndarray] = None, + encoder_attention_mask: Optional[jnp.ndarray] = None, + init_cache: bool = False, + deterministic: bool = True, + output_attentions: bool = False, + ): + # Self Attention + attention_outputs = self.attention( + hidden_states, + attention_mask, + layer_head_mask=layer_head_mask, + init_cache=init_cache, + deterministic=deterministic, + output_attentions=output_attentions, + ) + attention_output = attention_outputs[0] + + # Cross-Attention Block + if encoder_hidden_states is not None: + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask=encoder_attention_mask, + layer_head_mask=layer_head_mask, + key_value_states=encoder_hidden_states, + deterministic=deterministic, + output_attentions=output_attentions, + ) + attention_output = cross_attention_outputs[0] + + hidden_states = self.intermediate(attention_output) + hidden_states = self.output(hidden_states, attention_output, deterministic=deterministic) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attention_outputs[1],) + if encoder_hidden_states is not None: + outputs += (cross_attention_outputs[1],) + return outputs + + +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertLayerCollection with Bert->Electra +class FlaxElectraLayerCollection(nn.Module): + config: 
ElectraConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + gradient_checkpointing: bool = False + + def setup(self): + if self.gradient_checkpointing: + FlaxElectraCheckpointLayer = remat(FlaxElectraLayer, static_argnums=(5, 6, 7)) + self.layers = [ + FlaxElectraCheckpointLayer(self.config, name=str(i), dtype=self.dtype) + for i in range(self.config.num_hidden_layers) + ] + else: + self.layers = [ + FlaxElectraLayer(self.config, name=str(i), dtype=self.dtype) + for i in range(self.config.num_hidden_layers) + ] + + def __call__( + self, + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states: Optional[jnp.ndarray] = None, + encoder_attention_mask: Optional[jnp.ndarray] = None, + init_cache: bool = False, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + all_attentions = () if output_attentions else None + all_hidden_states = () if output_hidden_states else None + all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + + # Check if head_mask has a correct number of layers specified if desired + if head_mask is not None: + if head_mask.shape[0] != (len(self.layers)): + raise ValueError( + f"The head_mask should be specified for {len(self.layers)} layers, but it is for " + f" {head_mask.shape[0]}." + ) + + for i, layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + layer_outputs = layer( + hidden_states, + attention_mask, + head_mask[i] if head_mask is not None else None, + encoder_hidden_states, + encoder_attention_mask, + init_cache, + deterministic, + output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions += (layer_outputs[1],) + + if encoder_hidden_states is not None: + all_cross_attentions += (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states += (hidden_states,) + + outputs = (hidden_states, all_hidden_states, all_attentions, all_cross_attentions) + + if not return_dict: + return tuple(v for v in outputs if v is not None) + + return FlaxBaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_attentions, + cross_attentions=all_cross_attentions, + ) + + +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertEncoder with Bert->Electra +class FlaxElectraEncoder(nn.Module): + config: ElectraConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + gradient_checkpointing: bool = False + + def setup(self): + self.layer = FlaxElectraLayerCollection( + self.config, + dtype=self.dtype, + gradient_checkpointing=self.gradient_checkpointing, + ) + + def __call__( + self, + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states: Optional[jnp.ndarray] = None, + encoder_attention_mask: Optional[jnp.ndarray] = None, + init_cache: bool = False, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + return self.layer( + hidden_states, + attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + init_cache=init_cache, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + +class FlaxElectraGeneratorPredictions(nn.Module): + config: ElectraConfig + dtype: jnp.dtype 
= jnp.float32 + + def setup(self): + self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype) + self.dense = nn.Dense(self.config.embedding_size, dtype=self.dtype) + + def __call__(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = ACT2FN[self.config.hidden_act](hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class FlaxElectraDiscriminatorPredictions(nn.Module): + """Prediction module for the discriminator, made up of two dense layers.""" + + config: ElectraConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.dense = nn.Dense(self.config.hidden_size, dtype=self.dtype) + self.dense_prediction = nn.Dense(1, dtype=self.dtype) + + def __call__(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = ACT2FN[self.config.hidden_act](hidden_states) + hidden_states = self.dense_prediction(hidden_states).squeeze(-1) + return hidden_states + + +class FlaxElectraPreTrainedModel(FlaxPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = ElectraConfig + base_model_prefix = "electra" + module_class: nn.Module = None + + def __init__( + self, + config: ElectraConfig, + input_shape: tuple = (1, 1), + seed: int = 0, + dtype: jnp.dtype = jnp.float32, + _do_init: bool = True, + gradient_checkpointing: bool = False, + **kwargs, + ): + module = self.module_class(config=config, dtype=dtype, gradient_checkpointing=gradient_checkpointing, **kwargs) + super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) + + # Copied from transformers.models.bert.modeling_flax_bert.FlaxBertPreTrainedModel.enable_gradient_checkpointing + def enable_gradient_checkpointing(self): + self._module = self.module_class( + config=self.config, + dtype=self.dtype, + gradient_checkpointing=True, + ) + + # Copied from transformers.models.bert.modeling_flax_bert.FlaxBertPreTrainedModel.init_weights + def init_weights(self, rng: jax.random.PRNGKey, input_shape: tuple, params: FrozenDict = None) -> FrozenDict: + # init input tensors + input_ids = jnp.zeros(input_shape, dtype="i4") + token_type_ids = jnp.zeros_like(input_ids) + position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_shape) + attention_mask = jnp.ones_like(input_ids) + head_mask = jnp.ones((self.config.num_hidden_layers, self.config.num_attention_heads)) + + params_rng, dropout_rng = jax.random.split(rng) + rngs = {"params": params_rng, "dropout": dropout_rng} + + if self.config.add_cross_attention: + encoder_hidden_states = jnp.zeros(input_shape + (self.config.hidden_size,)) + encoder_attention_mask = attention_mask + module_init_outputs = self.module.init( + rngs, + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + return_dict=False, + ) + else: + module_init_outputs = self.module.init( + rngs, input_ids, attention_mask, token_type_ids, position_ids, head_mask, return_dict=False + ) + + random_params = module_init_outputs["params"] + + if params is not None: + random_params = flatten_dict(unfreeze(random_params)) + params = flatten_dict(unfreeze(params)) + for missing_key in self._missing_keys: + params[missing_key] = random_params[missing_key] + self._missing_keys = set() + return freeze(unflatten_dict(params)) + else: + return random_params + + # Copied from 
transformers.models.bart.modeling_flax_bart.FlaxBartDecoderPreTrainedModel.init_cache + def init_cache(self, batch_size, max_length): + r""" + Args: + batch_size (`int`): + batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache. + max_length (`int`): + maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized + cache. + """ + # init input variables to retrieve cache + input_ids = jnp.ones((batch_size, max_length), dtype="i4") + attention_mask = jnp.ones_like(input_ids, dtype="i4") + position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape) + + init_variables = self.module.init( + jax.random.PRNGKey(0), input_ids, attention_mask, position_ids, return_dict=False, init_cache=True + ) + return unfreeze(init_variables["cache"]) + + @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + def __call__( + self, + input_ids, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + params: Optional[dict] = None, + dropout_rng: jax.random.PRNGKey = None, + train: bool = False, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + past_key_values: Optional[dict] = None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + # init input tensors if not passed + if token_type_ids is None: + token_type_ids = jnp.ones_like(input_ids) + + if position_ids is None: + position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape) + + if attention_mask is None: + attention_mask = jnp.ones_like(input_ids) + + if head_mask is None: + head_mask = jnp.ones((self.config.num_hidden_layers, self.config.num_attention_heads)) + + # Handle any PRNG if needed + rngs = {} + if dropout_rng is not None: + rngs["dropout"] = dropout_rng + + inputs = {"params": params or self.params} + + if self.config.add_cross_attention: + # if past_key_values are passed then cache is already initialized a private flag init_cache has to be passed + # down to ensure cache is used. 
It has to be made sure that cache is marked as mutable so that it can be + # changed by FlaxElectraAttention module + if past_key_values: + inputs["cache"] = past_key_values + mutable = ["cache"] + else: + mutable = False + + outputs = self.module.apply( + inputs, + jnp.array(input_ids, dtype="i4"), + jnp.array(attention_mask, dtype="i4"), + token_type_ids=jnp.array(token_type_ids, dtype="i4"), + position_ids=jnp.array(position_ids, dtype="i4"), + head_mask=jnp.array(head_mask, dtype="i4"), + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + deterministic=not train, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + rngs=rngs, + mutable=mutable, + ) + + # add updated cache to model output + if past_key_values is not None and return_dict: + outputs, past_key_values = outputs + outputs["past_key_values"] = unfreeze(past_key_values["cache"]) + return outputs + elif past_key_values is not None and not return_dict: + outputs, past_key_values = outputs + outputs = outputs[:1] + (unfreeze(past_key_values["cache"]),) + outputs[1:] + + else: + outputs = self.module.apply( + inputs, + jnp.array(input_ids, dtype="i4"), + jnp.array(attention_mask, dtype="i4"), + token_type_ids=jnp.array(token_type_ids, dtype="i4"), + position_ids=jnp.array(position_ids, dtype="i4"), + head_mask=jnp.array(head_mask, dtype="i4"), + deterministic=not train, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + rngs=rngs, + ) + + return outputs + + +class FlaxElectraModule(nn.Module): + config: ElectraConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + gradient_checkpointing: bool = False + + def setup(self): + self.embeddings = FlaxElectraEmbeddings(self.config, dtype=self.dtype) + if self.config.embedding_size != self.config.hidden_size: + self.embeddings_project = nn.Dense(self.config.hidden_size, dtype=self.dtype) + self.encoder = FlaxElectraEncoder( + self.config, dtype=self.dtype, gradient_checkpointing=self.gradient_checkpointing + ) + + def __call__( + self, + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask: Optional[np.ndarray] = None, + encoder_hidden_states: Optional[jnp.ndarray] = None, + encoder_attention_mask: Optional[jnp.ndarray] = None, + init_cache: bool = False, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + embeddings = self.embeddings( + input_ids, token_type_ids, position_ids, attention_mask, deterministic=deterministic + ) + if hasattr(self, "embeddings_project"): + embeddings = self.embeddings_project(embeddings) + + return self.encoder( + embeddings, + attention_mask, + head_mask=head_mask, + deterministic=deterministic, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + init_cache=init_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + +@add_start_docstrings( + "The bare Electra Model transformer outputting raw hidden-states without any specific head on top.", + ELECTRA_START_DOCSTRING, +) +class FlaxElectraModel(FlaxElectraPreTrainedModel): + module_class = FlaxElectraModule + + +append_call_sample_docstring(FlaxElectraModel, _CHECKPOINT_FOR_DOC, FlaxBaseModelOutput, _CONFIG_FOR_DOC) + + +class FlaxElectraTiedDense(nn.Module): + embedding_size: int + dtype: jnp.dtype = 
jnp.float32 + precision = None + bias_init: Callable[..., np.ndarray] = jax.nn.initializers.zeros + + def setup(self): + self.bias = self.param("bias", self.bias_init, (self.embedding_size,)) + + def __call__(self, x, kernel): + x = jnp.asarray(x, self.dtype) + kernel = jnp.asarray(kernel, self.dtype) + y = lax.dot_general( + x, + kernel, + (((x.ndim - 1,), (0,)), ((), ())), + precision=self.precision, + ) + bias = jnp.asarray(self.bias, self.dtype) + return y + bias + + +class FlaxElectraForMaskedLMModule(nn.Module): + config: ElectraConfig + dtype: jnp.dtype = jnp.float32 + gradient_checkpointing: bool = False + + def setup(self): + self.electra = FlaxElectraModule( + config=self.config, dtype=self.dtype, gradient_checkpointing=self.gradient_checkpointing + ) + self.generator_predictions = FlaxElectraGeneratorPredictions(config=self.config, dtype=self.dtype) + if self.config.tie_word_embeddings: + self.generator_lm_head = FlaxElectraTiedDense(self.config.vocab_size, dtype=self.dtype) + else: + self.generator_lm_head = nn.Dense(self.config.vocab_size, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + outputs = self.electra( + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = outputs[0] + prediction_scores = self.generator_predictions(hidden_states) + + if self.config.tie_word_embeddings: + shared_embedding = self.electra.variables["params"]["embeddings"]["word_embeddings"]["embedding"] + prediction_scores = self.generator_lm_head(prediction_scores, shared_embedding.T) + else: + prediction_scores = self.generator_lm_head(prediction_scores) + + if not return_dict: + return (prediction_scores,) + outputs[1:] + + return FlaxMaskedLMOutput( + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings("""Electra Model with a `language modeling` head on top.""", ELECTRA_START_DOCSTRING) +class FlaxElectraForMaskedLM(FlaxElectraPreTrainedModel): + module_class = FlaxElectraForMaskedLMModule + + +append_call_sample_docstring(FlaxElectraForMaskedLM, _CHECKPOINT_FOR_DOC, FlaxMaskedLMOutput, _CONFIG_FOR_DOC) + + +class FlaxElectraForPreTrainingModule(nn.Module): + config: ElectraConfig + dtype: jnp.dtype = jnp.float32 + gradient_checkpointing: bool = False + + def setup(self): + self.electra = FlaxElectraModule( + config=self.config, dtype=self.dtype, gradient_checkpointing=self.gradient_checkpointing + ) + self.discriminator_predictions = FlaxElectraDiscriminatorPredictions(config=self.config, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + # Model + outputs = self.electra( + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = outputs[0] + + logits = self.discriminator_predictions(hidden_states) + + if not 
return_dict: + return (logits,) + outputs[1:] + + return FlaxElectraForPreTrainingOutput( + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Electra model with a binary classification head on top as used during pretraining for identifying generated tokens. + + It is recommended to load the discriminator checkpoint into that model. + """, + ELECTRA_START_DOCSTRING, +) +class FlaxElectraForPreTraining(FlaxElectraPreTrainedModel): + module_class = FlaxElectraForPreTrainingModule + + +FLAX_ELECTRA_FOR_PRETRAINING_DOCSTRING = """ + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, FlaxElectraForPreTraining + + >>> tokenizer = AutoTokenizer.from_pretrained("google/electra-small-discriminator") + >>> model = FlaxElectraForPreTraining.from_pretrained("google/electra-small-discriminator") + + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="np") + >>> outputs = model(**inputs) + + >>> prediction_logits = outputs.logits + ``` +""" + +overwrite_call_docstring( + FlaxElectraForPreTraining, + ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length") + FLAX_ELECTRA_FOR_PRETRAINING_DOCSTRING, +) +append_replace_return_docstrings( + FlaxElectraForPreTraining, output_type=FlaxElectraForPreTrainingOutput, config_class=_CONFIG_FOR_DOC +) + + +class FlaxElectraForTokenClassificationModule(nn.Module): + config: ElectraConfig + dtype: jnp.dtype = jnp.float32 + gradient_checkpointing: bool = False + + def setup(self): + self.electra = FlaxElectraModule( + config=self.config, dtype=self.dtype, gradient_checkpointing=self.gradient_checkpointing + ) + classifier_dropout = ( + self.config.classifier_dropout + if self.config.classifier_dropout is not None + else self.config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Dense(self.config.num_labels, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + # Model + outputs = self.electra( + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = outputs[0] + + hidden_states = self.dropout(hidden_states, deterministic=deterministic) + logits = self.classifier(hidden_states) + + if not return_dict: + return (logits,) + outputs[1:] + + return FlaxTokenClassifierOutput( + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Electra model with a token classification head on top. + + Both the discriminator and generator may be loaded into this model. + """, + ELECTRA_START_DOCSTRING, +) +class FlaxElectraForTokenClassification(FlaxElectraPreTrainedModel): + module_class = FlaxElectraForTokenClassificationModule + + +append_call_sample_docstring( + FlaxElectraForTokenClassification, + _CHECKPOINT_FOR_DOC, + FlaxTokenClassifierOutput, + _CONFIG_FOR_DOC, +) + + +def identity(x, **kwargs): + return x + + +class FlaxElectraSequenceSummary(nn.Module): + r""" + Compute a single vector summary of a sequence hidden states. + + Args: + config ([`PretrainedConfig`]): + The config used by the model. 
Relevant arguments in the config class of the model are (refer to the actual + config class of your model for the default values it uses): + + - **summary_use_proj** (`bool`) -- Add a projection after the vector extraction. + - **summary_proj_to_labels** (`bool`) -- If `True`, the projection outputs to `config.num_labels` classes + (otherwise to `config.hidden_size`). + - **summary_activation** (`Optional[str]`) -- Set to `"tanh"` to add a tanh activation to the output, + another string or `None` will add no activation. + - **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and activation. + - **summary_last_dropout** (`float`)-- Optional dropout probability after the projection and activation. + """ + + config: ElectraConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.summary = identity + if hasattr(self.config, "summary_use_proj") and self.config.summary_use_proj: + if ( + hasattr(self.config, "summary_proj_to_labels") + and self.config.summary_proj_to_labels + and self.config.num_labels > 0 + ): + num_classes = self.config.num_labels + else: + num_classes = self.config.hidden_size + self.summary = nn.Dense(num_classes, dtype=self.dtype) + + activation_string = getattr(self.config, "summary_activation", None) + self.activation = ACT2FN[activation_string] if activation_string else lambda x: x # noqa F407 + + self.first_dropout = identity + if hasattr(self.config, "summary_first_dropout") and self.config.summary_first_dropout > 0: + self.first_dropout = nn.Dropout(self.config.summary_first_dropout) + + self.last_dropout = identity + if hasattr(self.config, "summary_last_dropout") and self.config.summary_last_dropout > 0: + self.last_dropout = nn.Dropout(self.config.summary_last_dropout) + + def __call__(self, hidden_states, cls_index=None, deterministic: bool = True): + """ + Compute a single vector summary of a sequence hidden states. + + Args: + hidden_states (`jnp.ndarray` of shape `[batch_size, seq_len, hidden_size]`): + The hidden states of the last layer. + cls_index (`jnp.ndarray` of shape `[batch_size]` or `[batch_size, ...]` where ... are optional leading dimensions of `hidden_states`, *optional*): + Used if `summary_type == "cls_index"` and takes the last token of the sequence as classification token. + + Returns: + `jnp.ndarray`: The summary of the sequence hidden states. 
+ """ + # NOTE: this does "first" type summary always + output = hidden_states[:, 0] + output = self.first_dropout(output, deterministic=deterministic) + output = self.summary(output) + output = self.activation(output) + output = self.last_dropout(output, deterministic=deterministic) + return output + + +class FlaxElectraForMultipleChoiceModule(nn.Module): + config: ElectraConfig + dtype: jnp.dtype = jnp.float32 + gradient_checkpointing: bool = False + + def setup(self): + self.electra = FlaxElectraModule( + config=self.config, dtype=self.dtype, gradient_checkpointing=self.gradient_checkpointing + ) + self.sequence_summary = FlaxElectraSequenceSummary(config=self.config, dtype=self.dtype) + self.classifier = nn.Dense(1, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + num_choices = input_ids.shape[1] + input_ids = input_ids.reshape(-1, input_ids.shape[-1]) if input_ids is not None else None + attention_mask = attention_mask.reshape(-1, attention_mask.shape[-1]) if attention_mask is not None else None + token_type_ids = token_type_ids.reshape(-1, token_type_ids.shape[-1]) if token_type_ids is not None else None + position_ids = position_ids.reshape(-1, position_ids.shape[-1]) if position_ids is not None else None + + # Model + outputs = self.electra( + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = outputs[0] + pooled_output = self.sequence_summary(hidden_states, deterministic=deterministic) + logits = self.classifier(pooled_output) + + reshaped_logits = logits.reshape(-1, num_choices) + + if not return_dict: + return (reshaped_logits,) + outputs[1:] + + return FlaxMultipleChoiceModelOutput( + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + ELECTRA Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a + softmax) e.g. for RocStories/SWAG tasks. 
+ """, + ELECTRA_START_DOCSTRING, +) +class FlaxElectraForMultipleChoice(FlaxElectraPreTrainedModel): + module_class = FlaxElectraForMultipleChoiceModule + + +# adapt docstring slightly for FlaxElectraForMultipleChoice +overwrite_call_docstring( + FlaxElectraForMultipleChoice, ELECTRA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") +) +append_call_sample_docstring( + FlaxElectraForMultipleChoice, + _CHECKPOINT_FOR_DOC, + FlaxMultipleChoiceModelOutput, + _CONFIG_FOR_DOC, +) + + +class FlaxElectraForQuestionAnsweringModule(nn.Module): + config: ElectraConfig + dtype: jnp.dtype = jnp.float32 + gradient_checkpointing: bool = False + + def setup(self): + self.electra = FlaxElectraModule( + config=self.config, dtype=self.dtype, gradient_checkpointing=self.gradient_checkpointing + ) + self.qa_outputs = nn.Dense(self.config.num_labels, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + # Model + outputs = self.electra( + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = outputs[0] + logits = self.qa_outputs(hidden_states) + start_logits, end_logits = jnp.split(logits, self.config.num_labels, axis=-1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) + + if not return_dict: + return (start_logits, end_logits) + outputs[1:] + + return FlaxQuestionAnsweringModelOutput( + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + ELECTRA Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layers on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + ELECTRA_START_DOCSTRING, +) +class FlaxElectraForQuestionAnswering(FlaxElectraPreTrainedModel): + module_class = FlaxElectraForQuestionAnsweringModule + + +append_call_sample_docstring( + FlaxElectraForQuestionAnswering, + _CHECKPOINT_FOR_DOC, + FlaxQuestionAnsweringModelOutput, + _CONFIG_FOR_DOC, +) + + +class FlaxElectraClassificationHead(nn.Module): + """Head for sentence-level classification tasks.""" + + config: ElectraConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.dense = nn.Dense(self.config.hidden_size, dtype=self.dtype) + classifier_dropout = ( + self.config.classifier_dropout + if self.config.classifier_dropout is not None + else self.config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.out_proj = nn.Dense(self.config.num_labels, dtype=self.dtype) + + def __call__(self, hidden_states, deterministic: bool = True): + x = hidden_states[:, 0, :] # take token (equiv. 
to [CLS]) + x = self.dropout(x, deterministic=deterministic) + x = self.dense(x) + x = ACT2FN["gelu"](x) # although BERT uses tanh here, it seems Electra authors used gelu + x = self.dropout(x, deterministic=deterministic) + x = self.out_proj(x) + return x + + +class FlaxElectraForSequenceClassificationModule(nn.Module): + config: ElectraConfig + dtype: jnp.dtype = jnp.float32 + gradient_checkpointing: bool = False + + def setup(self): + self.electra = FlaxElectraModule( + config=self.config, dtype=self.dtype, gradient_checkpointing=self.gradient_checkpointing + ) + self.classifier = FlaxElectraClassificationHead(config=self.config, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + # Model + outputs = self.electra( + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = outputs[0] + logits = self.classifier(hidden_states, deterministic=deterministic) + + if not return_dict: + return (logits,) + outputs[1:] + + return FlaxSequenceClassifierOutput( + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Electra Model transformer with a sequence classification/regression head on top (a linear layer on top of the + pooled output) e.g. for GLUE tasks. + """, + ELECTRA_START_DOCSTRING, +) +class FlaxElectraForSequenceClassification(FlaxElectraPreTrainedModel): + module_class = FlaxElectraForSequenceClassificationModule + + +append_call_sample_docstring( + FlaxElectraForSequenceClassification, + _CHECKPOINT_FOR_DOC, + FlaxSequenceClassifierOutput, + _CONFIG_FOR_DOC, +) + + +class FlaxElectraForCausalLMModule(nn.Module): + config: ElectraConfig + dtype: jnp.dtype = jnp.float32 + gradient_checkpointing: bool = False + + def setup(self): + self.electra = FlaxElectraModule( + config=self.config, dtype=self.dtype, gradient_checkpointing=self.gradient_checkpointing + ) + self.generator_predictions = FlaxElectraGeneratorPredictions(config=self.config, dtype=self.dtype) + if self.config.tie_word_embeddings: + self.generator_lm_head = FlaxElectraTiedDense(self.config.vocab_size, dtype=self.dtype) + else: + self.generator_lm_head = nn.Dense(self.config.vocab_size, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask: Optional[jnp.ndarray] = None, + token_type_ids: Optional[jnp.ndarray] = None, + position_ids: Optional[jnp.ndarray] = None, + head_mask: Optional[jnp.ndarray] = None, + encoder_hidden_states: Optional[jnp.ndarray] = None, + encoder_attention_mask: Optional[jnp.ndarray] = None, + init_cache: bool = False, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + outputs = self.electra( + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + init_cache=init_cache, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = outputs[0] + prediction_scores = self.generator_predictions(hidden_states) 
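+ # When config.tie_word_embeddings is set, the LM head below reuses the transposed
+ # word-embedding matrix as its kernel via FlaxElectraTiedDense (which only owns a
+ # bias parameter); otherwise a standalone Dense layer of size vocab_size is used.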
+ + if self.config.tie_word_embeddings: + shared_embedding = self.electra.variables["params"]["embeddings"]["word_embeddings"]["embedding"] + prediction_scores = self.generator_lm_head(prediction_scores, shared_embedding.T) + else: + prediction_scores = self.generator_lm_head(prediction_scores) + + if not return_dict: + return (prediction_scores,) + outputs[1:] + + return FlaxCausalLMOutputWithCrossAttentions( + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + +@add_start_docstrings( + """ + Electra Model with a language modeling head on top (a linear layer on top of the hidden-states output) e.g for + autoregressive tasks. + """, + ELECTRA_START_DOCSTRING, +) +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertForCausalLM with Bert->Electra +class FlaxElectraForCausalLM(FlaxElectraPreTrainedModel): + module_class = FlaxElectraForCausalLMModule + + def prepare_inputs_for_generation(self, input_ids, max_length, attention_mask: Optional[jax.Array] = None): + # initializing the cache + batch_size, seq_length = input_ids.shape + + past_key_values = self.init_cache(batch_size, max_length) + # Note that usually one would have to put 0's in the attention_mask for x > input_ids.shape[-1] and x < cache_length. + # But since the decoder uses a causal mask, those positions are masked anyway. + # Thus, we can create a single static attention_mask here, which is more efficient for compilation + extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4") + if attention_mask is not None: + position_ids = attention_mask.cumsum(axis=-1) - 1 + extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, attention_mask, (0, 0)) + else: + position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length)) + + return { + "past_key_values": past_key_values, + "attention_mask": extended_attention_mask, + "position_ids": position_ids, + } + + def update_inputs_for_generation(self, model_outputs, model_kwargs): + model_kwargs["past_key_values"] = model_outputs.past_key_values + model_kwargs["position_ids"] = model_kwargs["position_ids"][:, -1:] + 1 + return model_kwargs + + +append_call_sample_docstring( + FlaxElectraForCausalLM, + _CHECKPOINT_FOR_DOC, + FlaxCausalLMOutputWithCrossAttentions, + _CONFIG_FOR_DOC, +) + + +__all__ = [ + "FlaxElectraForCausalLM", + "FlaxElectraForMaskedLM", + "FlaxElectraForMultipleChoice", + "FlaxElectraForPreTraining", + "FlaxElectraForQuestionAnswering", + "FlaxElectraForSequenceClassification", + "FlaxElectraForTokenClassification", + "FlaxElectraModel", + "FlaxElectraPreTrainedModel", +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/electra/modeling_tf_electra.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/electra/modeling_tf_electra.py new file mode 100644 index 0000000000000000000000000000000000000000..3a5c33e503d7386df5c2be0fc10a079ee4fe014a --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/electra/modeling_tf_electra.py @@ -0,0 +1,1775 @@ +# coding=utf-8 +# Copyright 2019 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""TF Electra model.""" + +from __future__ import annotations + +import math +import warnings +from dataclasses import dataclass + +import numpy as np +import tensorflow as tf + +from ...activations_tf import get_tf_activation +from ...modeling_tf_outputs import ( + TFBaseModelOutputWithPastAndCrossAttentions, + TFMaskedLMOutput, + TFMultipleChoiceModelOutput, + TFQuestionAnsweringModelOutput, + TFSequenceClassifierOutput, + TFTokenClassifierOutput, +) +from ...modeling_tf_utils import ( + TFMaskedLanguageModelingLoss, + TFModelInputType, + TFMultipleChoiceLoss, + TFPreTrainedModel, + TFQuestionAnsweringLoss, + TFSequenceClassificationLoss, + TFSequenceSummary, + TFTokenClassificationLoss, + get_initializer, + keras, + keras_serializable, + unpack_inputs, +) +from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax +from ...utils import ( + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from .configuration_electra import ElectraConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "google/electra-small-discriminator" +_CONFIG_FOR_DOC = "ElectraConfig" + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->Electra +class TFElectraSelfAttention(keras.layers.Layer): + def __init__(self, config: ElectraConfig, **kwargs): + super().__init__(**kwargs) + + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number " + f"of attention heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.sqrt_att_head_size = math.sqrt(self.attention_head_size) + + self.query = keras.layers.Dense( + units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query" + ) + self.key = keras.layers.Dense( + units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key" + ) + self.value = keras.layers.Dense( + units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" + ) + self.dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob) + + self.is_decoder = config.is_decoder + self.config = config + + def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor: + # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] + tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size)) + + # Transpose the tensor from [batch_size, seq_length, num_attention_heads, attention_head_size] to [batch_size, num_attention_heads, seq_length, attention_head_size] + return tf.transpose(tensor, perm=[0, 2, 1, 3]) + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: 
tf.Tensor, + head_mask: tf.Tensor, + encoder_hidden_states: tf.Tensor, + encoder_attention_mask: tf.Tensor, + past_key_value: tuple[tf.Tensor], + output_attentions: bool, + training: bool = False, + ) -> tuple[tf.Tensor]: + batch_size = shape_list(hidden_states)[0] + mixed_query_layer = self.query(inputs=hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores(self.key(inputs=encoder_hidden_states), batch_size) + value_layer = self.transpose_for_scores(self.value(inputs=encoder_hidden_states), batch_size) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(inputs=hidden_states), batch_size) + value_layer = self.transpose_for_scores(self.value(inputs=hidden_states), batch_size) + key_layer = tf.concat([past_key_value[0], key_layer], axis=2) + value_layer = tf.concat([past_key_value[1], value_layer], axis=2) + else: + key_layer = self.transpose_for_scores(self.key(inputs=hidden_states), batch_size) + value_layer = self.transpose_for_scores(self.value(inputs=hidden_states), batch_size) + + query_layer = self.transpose_for_scores(mixed_query_layer, batch_size) + + if self.is_decoder: + # if cross_attention save Tuple(tf.Tensor, tf.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(tf.Tensor, tf.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + # (batch size, num_heads, seq_len_q, seq_len_k) + attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) + dk = tf.cast(self.sqrt_att_head_size, dtype=attention_scores.dtype) + attention_scores = tf.divide(attention_scores, dk) + + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in TFElectraModel call() function) + attention_scores = tf.add(attention_scores, attention_mask) + + # Normalize the attention scores to probabilities. + attention_probs = stable_softmax(logits=attention_scores, axis=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
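+ # After dropout and the optional head mask, the context is attention_probs @ value_layer,
+ # transposed back to (batch_size, seq_len_q, num_heads, head_size) and flattened to
+ # (batch_size, seq_len_q, all_head_size) before the output projection.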
+ attention_probs = self.dropout(inputs=attention_probs, training=training) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = tf.multiply(attention_probs, head_mask) + + attention_output = tf.matmul(attention_probs, value_layer) + attention_output = tf.transpose(attention_output, perm=[0, 2, 1, 3]) + + # (batch_size, seq_len_q, all_head_size) + attention_output = tf.reshape(tensor=attention_output, shape=(batch_size, -1, self.all_head_size)) + outputs = (attention_output, attention_probs) if output_attentions else (attention_output,) + + if self.is_decoder: + outputs = outputs + (past_key_value,) + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "query", None) is not None: + with tf.name_scope(self.query.name): + self.query.build([None, None, self.config.hidden_size]) + if getattr(self, "key", None) is not None: + with tf.name_scope(self.key.name): + self.key.build([None, None, self.config.hidden_size]) + if getattr(self, "value", None) is not None: + with tf.name_scope(self.value.name): + self.value.build([None, None, self.config.hidden_size]) + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->Electra +class TFElectraSelfOutput(keras.layers.Layer): + def __init__(self, config: ElectraConfig, **kwargs): + super().__init__(**kwargs) + + self.dense = keras.layers.Dense( + units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config + + def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.dropout(inputs=hidden_states, training=training) + hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) + + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->Electra +class TFElectraAttention(keras.layers.Layer): + def __init__(self, config: ElectraConfig, **kwargs): + super().__init__(**kwargs) + + self.self_attention = TFElectraSelfAttention(config, name="self") + self.dense_output = TFElectraSelfOutput(config, name="output") + + def prune_heads(self, heads): + raise NotImplementedError + + def call( + self, + input_tensor: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + encoder_hidden_states: tf.Tensor, + encoder_attention_mask: tf.Tensor, + past_key_value: tuple[tf.Tensor], + output_attentions: bool, + training: bool = False, + ) -> tuple[tf.Tensor]: + self_outputs = self.self_attention( + hidden_states=input_tensor, + attention_mask=attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_value=past_key_value, + output_attentions=output_attentions, + training=training, + ) + attention_output = self.dense_output( + hidden_states=self_outputs[0], 
input_tensor=input_tensor, training=training + ) + # add attentions (possibly with past_key_value) if we output them + outputs = (attention_output,) + self_outputs[1:] + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attention", None) is not None: + with tf.name_scope(self.self_attention.name): + self.self_attention.build(None) + if getattr(self, "dense_output", None) is not None: + with tf.name_scope(self.dense_output.name): + self.dense_output.build(None) + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->Electra +class TFElectraIntermediate(keras.layers.Layer): + def __init__(self, config: ElectraConfig, **kwargs): + super().__init__(**kwargs) + + self.dense = keras.layers.Dense( + units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = get_tf_activation(config.hidden_act) + else: + self.intermediate_act_fn = config.hidden_act + self.config = config + + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->Electra +class TFElectraOutput(keras.layers.Layer): + def __init__(self, config: ElectraConfig, **kwargs): + super().__init__(**kwargs) + + self.dense = keras.layers.Dense( + units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config + + def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.dropout(inputs=hidden_states, training=training) + hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) + + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.intermediate_size]) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->Electra +class TFElectraLayer(keras.layers.Layer): + def __init__(self, config: ElectraConfig, **kwargs): + super().__init__(**kwargs) + + self.attention = TFElectraAttention(config, name="attention") + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise ValueError(f"{self} should be used as a decoder model if cross attention is added") + self.crossattention = TFElectraAttention(config, name="crossattention") + self.intermediate = TFElectraIntermediate(config, name="intermediate") + self.bert_output = TFElectraOutput(config, name="output") + + def 
call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + encoder_hidden_states: tf.Tensor | None, + encoder_attention_mask: tf.Tensor | None, + past_key_value: tuple[tf.Tensor] | None, + output_attentions: bool, + training: bool = False, + ) -> tuple[tf.Tensor]: + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + input_tensor=hidden_states, + attention_mask=attention_mask, + head_mask=head_mask, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=self_attn_past_key_value, + output_attentions=output_attentions, + training=training, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers" + " by setting `config.add_cross_attention=True`" + ) + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + cross_attention_outputs = self.crossattention( + input_tensor=attention_output, + attention_mask=attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_value=cross_attn_past_key_value, + output_attentions=output_attentions, + training=training, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + intermediate_output = self.intermediate(hidden_states=attention_output) + layer_output = self.bert_output( + hidden_states=intermediate_output, input_tensor=attention_output, training=training + ) + outputs = (layer_output,) + outputs # add attentions if we output them + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value,) + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "attention", None) is not None: + with tf.name_scope(self.attention.name): + self.attention.build(None) + if getattr(self, "intermediate", None) is not None: + with tf.name_scope(self.intermediate.name): + self.intermediate.build(None) + if getattr(self, "bert_output", None) is not None: + with tf.name_scope(self.bert_output.name): + self.bert_output.build(None) + if getattr(self, "crossattention", None) is not None: + with tf.name_scope(self.crossattention.name): + self.crossattention.build(None) + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->Electra +class TFElectraEncoder(keras.layers.Layer): + def __init__(self, config: ElectraConfig, **kwargs): + super().__init__(**kwargs) + self.config 
= config + self.layer = [TFElectraLayer(config, name=f"layer_._{i}") for i in range(config.num_hidden_layers)] + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + encoder_hidden_states: tf.Tensor | None, + encoder_attention_mask: tf.Tensor | None, + past_key_values: tuple[tuple[tf.Tensor]] | None, + use_cache: bool | None, + output_attentions: bool, + output_hidden_states: bool, + return_dict: bool, + training: bool = False, + ) -> TFBaseModelOutputWithPastAndCrossAttentions | tuple[tf.Tensor]: + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + past_key_value = past_key_values[i] if past_key_values is not None else None + + layer_outputs = layer_module( + hidden_states=hidden_states, + attention_mask=attention_mask, + head_mask=head_mask[i], + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_value=past_key_value, + output_attentions=output_attentions, + training=training, + ) + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + if self.config.add_cross_attention and encoder_hidden_states is not None: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + # Add last layer + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v for v in [hidden_states, all_hidden_states, all_attentions, all_cross_attentions] if v is not None + ) + + return TFBaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_attentions, + cross_attentions=all_cross_attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layer", None) is not None: + for layer in self.layer: + with tf.name_scope(layer.name): + layer.build(None) + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->Electra +class TFElectraPooler(keras.layers.Layer): + def __init__(self, config: ElectraConfig, **kwargs): + super().__init__(**kwargs) + + self.dense = keras.layers.Dense( + units=config.hidden_size, + kernel_initializer=get_initializer(config.initializer_range), + activation="tanh", + name="dense", + ) + self.config = config + + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. 
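+ # i.e. the [CLS] position; the tanh-activated dense layer defined above then
+ # produces the pooled output.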
+ first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(inputs=first_token_tensor) + + return pooled_output + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + + +# Copied from transformers.models.albert.modeling_tf_albert.TFAlbertEmbeddings with Albert->Electra +class TFElectraEmbeddings(keras.layers.Layer): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config: ElectraConfig, **kwargs): + super().__init__(**kwargs) + + self.config = config + self.embedding_size = config.embedding_size + self.max_position_embeddings = config.max_position_embeddings + self.initializer_range = config.initializer_range + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) + + def build(self, input_shape=None): + with tf.name_scope("word_embeddings"): + self.weight = self.add_weight( + name="weight", + shape=[self.config.vocab_size, self.embedding_size], + initializer=get_initializer(self.initializer_range), + ) + + with tf.name_scope("token_type_embeddings"): + self.token_type_embeddings = self.add_weight( + name="embeddings", + shape=[self.config.type_vocab_size, self.embedding_size], + initializer=get_initializer(self.initializer_range), + ) + + with tf.name_scope("position_embeddings"): + self.position_embeddings = self.add_weight( + name="embeddings", + shape=[self.max_position_embeddings, self.embedding_size], + initializer=get_initializer(self.initializer_range), + ) + + if self.built: + return + self.built = True + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.embedding_size]) + + # Copied from transformers.models.bert.modeling_tf_bert.TFBertEmbeddings.call + def call( + self, + input_ids: tf.Tensor | None = None, + position_ids: tf.Tensor | None = None, + token_type_ids: tf.Tensor | None = None, + inputs_embeds: tf.Tensor | None = None, + past_key_values_length=0, + training: bool = False, + ) -> tf.Tensor: + """ + Applies embedding based on inputs tensor. + + Returns: + final_embeddings (`tf.Tensor`): output embedding tensor. 
+ """ + if input_ids is None and inputs_embeds is None: + raise ValueError("Need to provide either `input_ids` or `input_embeds`.") + + if input_ids is not None: + check_embeddings_within_bounds(input_ids, self.config.vocab_size) + inputs_embeds = tf.gather(params=self.weight, indices=input_ids) + + input_shape = shape_list(inputs_embeds)[:-1] + + if token_type_ids is None: + token_type_ids = tf.fill(dims=input_shape, value=0) + + if position_ids is None: + position_ids = tf.expand_dims( + tf.range(start=past_key_values_length, limit=input_shape[1] + past_key_values_length), axis=0 + ) + + position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids) + token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids) + final_embeddings = inputs_embeds + position_embeds + token_type_embeds + final_embeddings = self.LayerNorm(inputs=final_embeddings) + final_embeddings = self.dropout(inputs=final_embeddings, training=training) + + return final_embeddings + + +class TFElectraDiscriminatorPredictions(keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + + self.dense = keras.layers.Dense(config.hidden_size, name="dense") + self.dense_prediction = keras.layers.Dense(1, name="dense_prediction") + self.config = config + + def call(self, discriminator_hidden_states, training=False): + hidden_states = self.dense(discriminator_hidden_states) + hidden_states = get_tf_activation(self.config.hidden_act)(hidden_states) + logits = tf.squeeze(self.dense_prediction(hidden_states), -1) + + return logits + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + if getattr(self, "dense_prediction", None) is not None: + with tf.name_scope(self.dense_prediction.name): + self.dense_prediction.build([None, None, self.config.hidden_size]) + + +class TFElectraGeneratorPredictions(keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dense = keras.layers.Dense(config.embedding_size, name="dense") + self.config = config + + def call(self, generator_hidden_states, training=False): + hidden_states = self.dense(generator_hidden_states) + hidden_states = get_tf_activation("gelu")(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.embedding_size]) + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + + +class TFElectraPreTrainedModel(TFPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = ElectraConfig + base_model_prefix = "electra" + # When the model is loaded from a PT model + _keys_to_ignore_on_load_unexpected = [r"generator_lm_head.weight"] + _keys_to_ignore_on_load_missing = [r"dropout"] + + +@keras_serializable +class TFElectraMainLayer(keras.layers.Layer): + config_class = ElectraConfig + + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + + self.config = config + self.is_decoder = config.is_decoder + + self.embeddings = TFElectraEmbeddings(config, name="embeddings") + + if config.embedding_size != config.hidden_size: + self.embeddings_project = keras.layers.Dense(config.hidden_size, name="embeddings_project") + + self.encoder = TFElectraEncoder(config, name="encoder") + + def get_input_embeddings(self): + return self.embeddings + + def set_input_embeddings(self, value): + self.embeddings.weight = value + self.embeddings.vocab_size = shape_list(value)[0] + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + raise NotImplementedError + + def get_extended_attention_mask(self, attention_mask, input_shape, dtype, past_key_values_length=0): + batch_size, seq_length = input_shape + + if attention_mask is None: + attention_mask = tf.fill(dims=(batch_size, seq_length + past_key_values_length), value=1) + + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. + attention_mask_shape = shape_list(attention_mask) + + mask_seq_length = seq_length + past_key_values_length + # Copied from `modeling_tf_t5.py` + # Provided a padding mask of dimensions [batch_size, mask_seq_length] + # - if the model is a decoder, apply a causal mask in addition to the padding mask + # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, mask_seq_length, mask_seq_length] + if self.is_decoder: + seq_ids = tf.range(mask_seq_length) + causal_mask = tf.less_equal( + tf.tile(seq_ids[None, None, :], (batch_size, mask_seq_length, 1)), + seq_ids[None, :, None], + ) + causal_mask = tf.cast(causal_mask, dtype=attention_mask.dtype) + extended_attention_mask = causal_mask * attention_mask[:, None, :] + attention_mask_shape = shape_list(extended_attention_mask) + extended_attention_mask = tf.reshape( + extended_attention_mask, (attention_mask_shape[0], 1, attention_mask_shape[1], attention_mask_shape[2]) + ) + if past_key_values_length > 0: + extended_attention_mask = extended_attention_mask[:, :, -seq_length:, :] + else: + extended_attention_mask = tf.reshape( + attention_mask, (attention_mask_shape[0], 1, 1, attention_mask_shape[1]) + ) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. 
+ extended_attention_mask = tf.cast(extended_attention_mask, dtype=dtype) + one_cst = tf.constant(1.0, dtype=dtype) + ten_thousand_cst = tf.constant(-10000.0, dtype=dtype) + extended_attention_mask = tf.multiply(tf.subtract(one_cst, extended_attention_mask), ten_thousand_cst) + + return extended_attention_mask + + def get_head_mask(self, head_mask): + if head_mask is not None: + raise NotImplementedError + else: + head_mask = [None] * self.config.num_hidden_layers + + return head_mask + + @unpack_inputs + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + encoder_hidden_states: np.ndarray | tf.Tensor | None = None, + encoder_attention_mask: np.ndarray | tf.Tensor | None = None, + past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None, + use_cache: bool | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool | None = False, + ) -> TFBaseModelOutputWithPastAndCrossAttentions | tuple[tf.Tensor]: + if not self.config.is_decoder: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = shape_list(input_ids) + elif inputs_embeds is not None: + input_shape = shape_list(inputs_embeds)[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + batch_size, seq_length = input_shape + + if past_key_values is None: + past_key_values_length = 0 + past_key_values = [None] * len(self.encoder.layer) + else: + past_key_values_length = shape_list(past_key_values[0][0])[-2] + + if attention_mask is None: + attention_mask = tf.fill(dims=(batch_size, seq_length + past_key_values_length), value=1) + + if token_type_ids is None: + token_type_ids = tf.fill(dims=input_shape, value=0) + + hidden_states = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + training=training, + ) + extended_attention_mask = self.get_extended_attention_mask( + attention_mask, input_shape, hidden_states.dtype, past_key_values_length + ) + + # Copied from `modeling_tf_t5.py` with -1e9 -> -10000 + if self.is_decoder and encoder_attention_mask is not None: + # If a 2D ou 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, mask_seq_length, mask_seq_length] + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + encoder_attention_mask = tf.cast(encoder_attention_mask, dtype=extended_attention_mask.dtype) + num_dims_encoder_attention_mask = len(shape_list(encoder_attention_mask)) + if num_dims_encoder_attention_mask == 3: + encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :] + if num_dims_encoder_attention_mask == 2: + encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] + + # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition + # Cf. 
https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow/transformer/transformer_layers.py#L270 + # encoder_extended_attention_mask = tf.math.equal(encoder_extended_attention_mask, + # tf.transpose(encoder_extended_attention_mask, perm=(-1, -2))) + + encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -10000.0 + else: + encoder_extended_attention_mask = None + + head_mask = self.get_head_mask(head_mask) + + if hasattr(self, "embeddings_project"): + hidden_states = self.embeddings_project(hidden_states, training=training) + + hidden_states = self.encoder( + hidden_states=hidden_states, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embeddings", None) is not None: + with tf.name_scope(self.embeddings.name): + self.embeddings.build(None) + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "embeddings_project", None) is not None: + with tf.name_scope(self.embeddings_project.name): + self.embeddings_project.build([None, None, self.config.embedding_size]) + + +@dataclass +class TFElectraForPreTrainingOutput(ModelOutput): + """ + Output type of [`TFElectraForPreTraining`]. + + Args: + loss (*optional*, returned when `labels` is provided, `tf.Tensor` of shape `(1,)`): + Total loss of the ELECTRA objective. + logits (`tf.Tensor` of shape `(batch_size, sequence_length)`): + Prediction scores of the head (scores for each token before SoftMax). + hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + logits: tf.Tensor | None = None + hidden_states: tuple[tf.Tensor] | None = None + attentions: tuple[tf.Tensor] | None = None + + +ELECTRA_START_DOCSTRING = r""" + + This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and + behavior. 
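For example (editorial sketch, not part of the original docstring; recent `transformers` releases let TF task models be compiled without an explicit loss, in which case the model's internal loss is used when `labels` are present in the data):

```python
>>> from transformers import TFElectraForSequenceClassification

>>> model = TFElectraForSequenceClassification.from_pretrained("google/electra-small-discriminator", num_labels=2)
>>> model.compile(optimizer="adam")  # falls back to the model's internal loss
>>> # model.fit(train_dataset, epochs=1)  # train_dataset: a tf.data.Dataset whose features include "labels"
```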
+ + + + TensorFlow models and layers in `transformers` accept two formats as input: + + - having all inputs as keyword arguments (like PyTorch models), or + - having all inputs as a list, tuple or dict in the first positional argument. + + The reason the second format is supported is that Keras methods prefer this format when passing inputs to models + and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just + pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second + format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with + the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first + positional argument: + + - a single Tensor with `input_ids` only and nothing else: `model(input_ids)` + - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: + `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])` + - a dictionary with one or several input Tensors associated to the input names given in the docstring: + `model({"input_ids": input_ids, "token_type_ids": token_type_ids})` + + Note that when creating models and layers with + [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry + about any of this, as you can just pass inputs like you would to any other Python function! + + + + Parameters: + config ([`ElectraConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +ELECTRA_INPUTS_DOCSTRING = r""" + Args: + input_ids (`Numpy array` or `tf.Tensor` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and + [`PreTrainedTokenizer.encode`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + position_ids (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + head_mask (`Numpy array` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`tf.Tensor` of shape `({0}, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. 
+ output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the + config will be used instead. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. This argument can be used only in eager mode, in graph mode the value in the config will be + used instead. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in + eager mode, in graph mode the value will always be set to True. + training (`bool`, *optional*, defaults to `False`): + Whether or not to use the model in training mode (some modules like dropout modules have different + behaviors between training and evaluation). +""" + + +@add_start_docstrings( + "The bare Electra Model transformer outputting raw hidden-states without any specific head on top. Identical to " + "the BERT model except that it uses an additional linear layer between the embedding layer and the encoder if the " + "hidden size and embedding size are different. " + "" + "Both the generator and discriminator checkpoints may be loaded into this model.", + ELECTRA_START_DOCSTRING, +) +class TFElectraModel(TFElectraPreTrainedModel): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.electra = TFElectraMainLayer(config, name="electra") + + @unpack_inputs + @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFBaseModelOutputWithPastAndCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + encoder_hidden_states: np.ndarray | tf.Tensor | None = None, + encoder_attention_mask: np.ndarray | tf.Tensor | None = None, + past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None, + use_cache: bool | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool | None = False, + ) -> TFBaseModelOutputWithPastAndCrossAttentions | tuple[tf.Tensor]: + r""" + encoder_hidden_states (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + past_key_values (`tuple[tuple[tf.Tensor]]` of length `config.n_layers`) + contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. 
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*, defaults to `True`): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). Set to `False` during training, `True` during generation + """ + outputs = self.electra( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "electra", None) is not None: + with tf.name_scope(self.electra.name): + self.electra.build(None) + + +@add_start_docstrings( + """ + Electra model with a binary classification head on top as used during pretraining for identifying generated tokens. + + Even though both the discriminator and generator may be loaded into this model, the discriminator is the only model + of the two to have the correct classification head to be used for this model. + """, + ELECTRA_START_DOCSTRING, +) +class TFElectraForPreTraining(TFElectraPreTrainedModel): + def __init__(self, config, **kwargs): + super().__init__(config, **kwargs) + + self.electra = TFElectraMainLayer(config, name="electra") + self.discriminator_predictions = TFElectraDiscriminatorPredictions(config, name="discriminator_predictions") + + @unpack_inputs + @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=TFElectraForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool | None = False, + ) -> TFElectraForPreTrainingOutput | tuple[tf.Tensor]: + r""" + Returns: + + Examples: + + ```python + >>> import tensorflow as tf + >>> from transformers import AutoTokenizer, TFElectraForPreTraining + + >>> tokenizer = AutoTokenizer.from_pretrained("google/electra-small-discriminator") + >>> model = TFElectraForPreTraining.from_pretrained("google/electra-small-discriminator") + >>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + >>> outputs = model(input_ids) + >>> scores = outputs[0] + ```""" + discriminator_hidden_states = self.electra( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + 
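# (Editorial note, not part of the original file: the logits computed below hold one
# score per input token; under the ELECTRA pretraining objective a convenient reading is
#     replaced = tf.cast(tf.nn.sigmoid(logits) > 0.5, tf.int32)
# where 1 marks tokens the discriminator believes were replaced by the generator.)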
discriminator_sequence_output = discriminator_hidden_states[0] + logits = self.discriminator_predictions(discriminator_sequence_output) + + if not return_dict: + return (logits,) + discriminator_hidden_states[1:] + + return TFElectraForPreTrainingOutput( + logits=logits, + hidden_states=discriminator_hidden_states.hidden_states, + attentions=discriminator_hidden_states.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "electra", None) is not None: + with tf.name_scope(self.electra.name): + self.electra.build(None) + if getattr(self, "discriminator_predictions", None) is not None: + with tf.name_scope(self.discriminator_predictions.name): + self.discriminator_predictions.build(None) + + +class TFElectraMaskedLMHead(keras.layers.Layer): + def __init__(self, config, input_embeddings, **kwargs): + super().__init__(**kwargs) + + self.config = config + self.embedding_size = config.embedding_size + self.input_embeddings = input_embeddings + + def build(self, input_shape): + self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias") + + super().build(input_shape) + + def get_output_embeddings(self): + return self.input_embeddings + + def set_output_embeddings(self, value): + self.input_embeddings.weight = value + self.input_embeddings.vocab_size = shape_list(value)[0] + + def get_bias(self): + return {"bias": self.bias} + + def set_bias(self, value): + self.bias = value["bias"] + self.config.vocab_size = shape_list(value["bias"])[0] + + def call(self, hidden_states): + seq_length = shape_list(tensor=hidden_states)[1] + hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.embedding_size]) + hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True) + hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size]) + hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias) + + return hidden_states + + +@add_start_docstrings( + """ + Electra model with a language modeling head on top. + + Even though both the discriminator and generator may be loaded into this model, the generator is the only model of + the two to have been trained for the masked language modeling task. + """, + ELECTRA_START_DOCSTRING, +) +class TFElectraForMaskedLM(TFElectraPreTrainedModel, TFMaskedLanguageModelingLoss): + def __init__(self, config, **kwargs): + super().__init__(config, **kwargs) + + self.config = config + self.electra = TFElectraMainLayer(config, name="electra") + self.generator_predictions = TFElectraGeneratorPredictions(config, name="generator_predictions") + + if isinstance(config.hidden_act, str): + self.activation = get_tf_activation(config.hidden_act) + else: + self.activation = config.hidden_act + + self.generator_lm_head = TFElectraMaskedLMHead(config, self.electra.embeddings, name="generator_lm_head") + + def get_lm_head(self): + return self.generator_lm_head + + def get_prefix_bias_name(self): + warnings.warn("The method get_prefix_bias_name is deprecated. 
Please use `get_bias` instead.", FutureWarning)
+ return self.name + "/" + self.generator_lm_head.name
+
+ @unpack_inputs
+ @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @add_code_sample_docstrings(
+ checkpoint="google/electra-small-generator",
+ output_type=TFMaskedLMOutput,
+ config_class=_CONFIG_FOR_DOC,
+ mask="[MASK]",
+ expected_output="'paris'",
+ expected_loss=1.22,
+ )
+ def call(
+ self,
+ input_ids: TFModelInputType | None = None,
+ attention_mask: np.ndarray | tf.Tensor | None = None,
+ token_type_ids: np.ndarray | tf.Tensor | None = None,
+ position_ids: np.ndarray | tf.Tensor | None = None,
+ head_mask: np.ndarray | tf.Tensor | None = None,
+ inputs_embeds: np.ndarray | tf.Tensor | None = None,
+ output_attentions: bool | None = None,
+ output_hidden_states: bool | None = None,
+ return_dict: bool | None = None,
+ labels: np.ndarray | tf.Tensor | None = None,
+ training: bool | None = False,
+ ) -> TFMaskedLMOutput | tuple[tf.Tensor]:
+ r"""
+ labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+ config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
+ loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+ """
+ generator_hidden_states = self.electra(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ training=training,
+ )
+ generator_sequence_output = generator_hidden_states[0]
+ prediction_scores = self.generator_predictions(generator_sequence_output, training=training)
+ prediction_scores = self.generator_lm_head(prediction_scores, training=training)
+ loss = None if labels is None else self.hf_compute_loss(labels, prediction_scores)
+
+ if not return_dict:
+ output = (prediction_scores,) + generator_hidden_states[1:]
+
+ return ((loss,) + output) if loss is not None else output
+
+ return TFMaskedLMOutput(
+ loss=loss,
+ logits=prediction_scores,
+ hidden_states=generator_hidden_states.hidden_states,
+ attentions=generator_hidden_states.attentions,
+ )
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "electra", None) is not None:
+ with tf.name_scope(self.electra.name):
+ self.electra.build(None)
+ if getattr(self, "generator_predictions", None) is not None:
+ with tf.name_scope(self.generator_predictions.name):
+ self.generator_predictions.build(None)
+ if getattr(self, "generator_lm_head", None) is not None:
+ with tf.name_scope(self.generator_lm_head.name):
+ self.generator_lm_head.build(None)
+
+
+class TFElectraClassificationHead(keras.layers.Layer):
+ """Head for sentence-level classification tasks."""
+
+ def __init__(self, config, **kwargs):
+ super().__init__(**kwargs)
+
+ self.dense = keras.layers.Dense(
+ config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+ )
+ classifier_dropout = (
+ config.classifier_dropout
+ if config.classifier_dropout is not None
+ else config.hidden_dropout_prob
+ )
+ self.dropout = keras.layers.Dropout(classifier_dropout)
+ self.out_proj = keras.layers.Dense(
+ config.num_labels,
kernel_initializer=get_initializer(config.initializer_range), name="out_proj" + ) + self.config = config + + def call(self, inputs, **kwargs): + x = inputs[:, 0, :] # take token (equiv. to [CLS]) + x = self.dropout(x) + x = self.dense(x) + x = get_tf_activation("gelu")(x) # although BERT uses tanh here, it seems Electra authors used gelu here + x = self.dropout(x) + x = self.out_proj(x) + + return x + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + if getattr(self, "out_proj", None) is not None: + with tf.name_scope(self.out_proj.name): + self.out_proj.build([None, None, self.config.hidden_size]) + + +@add_start_docstrings( + """ + ELECTRA Model transformer with a sequence classification/regression head on top (a linear layer on top of the + pooled output) e.g. for GLUE tasks. + """, + ELECTRA_START_DOCSTRING, +) +class TFElectraForSequenceClassification(TFElectraPreTrainedModel, TFSequenceClassificationLoss): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.num_labels = config.num_labels + self.electra = TFElectraMainLayer(config, name="electra") + self.classifier = TFElectraClassificationHead(config, name="classifier") + + @unpack_inputs + @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint="bhadresh-savani/electra-base-emotion", + output_type=TFSequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + expected_output="'joy'", + expected_loss=0.06, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + labels: np.ndarray | tf.Tensor | None = None, + training: bool | None = False, + ) -> TFSequenceClassifierOutput | tuple[tf.Tensor]: + r""" + labels (`tf.Tensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
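An illustrative end-to-end call (editorial sketch, not part of the original file; the checkpoint is the one named in the code-sample decorator above):

```python
>>> import tensorflow as tf
>>> from transformers import AutoTokenizer, TFElectraForSequenceClassification

>>> tokenizer = AutoTokenizer.from_pretrained("bhadresh-savani/electra-base-emotion")
>>> model = TFElectraForSequenceClassification.from_pretrained("bhadresh-savani/electra-base-emotion")
>>> inputs = tokenizer("I am thrilled with these results!", return_tensors="tf")
>>> logits = model(**inputs).logits                              # shape (1, model.config.num_labels)
>>> predicted = int(tf.argmax(logits, axis=-1)[0])
>>> outputs = model(**inputs, labels=tf.constant([predicted]))   # passing `labels` adds `loss` to the output
```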
+ """ + outputs = self.electra( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + logits = self.classifier(outputs[0]) + loss = None if labels is None else self.hf_compute_loss(labels, logits) + + if not return_dict: + output = (logits,) + outputs[1:] + + return ((loss,) + output) if loss is not None else output + + return TFSequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "electra", None) is not None: + with tf.name_scope(self.electra.name): + self.electra.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(None) + + +@add_start_docstrings( + """ + ELECTRA Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a + softmax) e.g. for RocStories/SWAG tasks. + """, + ELECTRA_START_DOCSTRING, +) +class TFElectraForMultipleChoice(TFElectraPreTrainedModel, TFMultipleChoiceLoss): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.electra = TFElectraMainLayer(config, name="electra") + self.sequence_summary = TFSequenceSummary( + config, initializer_range=config.initializer_range, name="sequence_summary" + ) + self.classifier = keras.layers.Dense( + 1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) + self.config = config + + @unpack_inputs + @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFMultipleChoiceModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + labels: np.ndarray | tf.Tensor | None = None, + training: bool | None = False, + ) -> TFMultipleChoiceModelOutput | tuple[tf.Tensor]: + r""" + labels (`tf.Tensor` of shape `(batch_size,)`, *optional*): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices]` + where `num_choices` is the size of the second dimension of the input tensors. 
(See `input_ids` above) + """ + + if input_ids is not None: + num_choices = shape_list(input_ids)[1] + seq_length = shape_list(input_ids)[2] + else: + num_choices = shape_list(inputs_embeds)[1] + seq_length = shape_list(inputs_embeds)[2] + + flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None + flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None + flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None + flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None + flat_inputs_embeds = ( + tf.reshape(inputs_embeds, (-1, seq_length, shape_list(inputs_embeds)[3])) + if inputs_embeds is not None + else None + ) + outputs = self.electra( + input_ids=flat_input_ids, + attention_mask=flat_attention_mask, + token_type_ids=flat_token_type_ids, + position_ids=flat_position_ids, + head_mask=head_mask, + inputs_embeds=flat_inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + logits = self.sequence_summary(outputs[0]) + logits = self.classifier(logits) + reshaped_logits = tf.reshape(logits, (-1, num_choices)) + loss = None if labels is None else self.hf_compute_loss(labels, reshaped_logits) + + if not return_dict: + output = (reshaped_logits,) + outputs[1:] + + return ((loss,) + output) if loss is not None else output + + return TFMultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "electra", None) is not None: + with tf.name_scope(self.electra.name): + self.electra.build(None) + if getattr(self, "sequence_summary", None) is not None: + with tf.name_scope(self.sequence_summary.name): + self.sequence_summary.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build([None, None, self.config.hidden_size]) + + +@add_start_docstrings( + """ + Electra model with a token classification head on top. + + Both the discriminator and generator may be loaded into this model. 
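A small usage sketch (editorial illustration, not part of the original file; the `num_labels` value is a placeholder and the freshly added classification head is untrained):

```python
>>> import tensorflow as tf
>>> from transformers import AutoTokenizer, TFElectraForTokenClassification

>>> tokenizer = AutoTokenizer.from_pretrained("google/electra-small-discriminator")
>>> model = TFElectraForTokenClassification.from_pretrained("google/electra-small-discriminator", num_labels=9)
>>> inputs = tokenizer("HuggingFace is based in New York City", return_tensors="tf")
>>> logits = model(**inputs).logits            # one row of `num_labels` scores per token
>>> predictions = tf.argmax(logits, axis=-1)   # predicted label id for every token
```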
+ """, + ELECTRA_START_DOCSTRING, +) +class TFElectraForTokenClassification(TFElectraPreTrainedModel, TFTokenClassificationLoss): + def __init__(self, config, **kwargs): + super().__init__(config, **kwargs) + + self.electra = TFElectraMainLayer(config, name="electra") + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = keras.layers.Dropout(classifier_dropout) + self.classifier = keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) + self.config = config + + @unpack_inputs + @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint="bhadresh-savani/electra-base-discriminator-finetuned-conll03-english", + output_type=TFTokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + expected_output="['B-LOC', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'B-LOC', 'I-LOC']", + expected_loss=0.11, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + labels: np.ndarray | tf.Tensor | None = None, + training: bool | None = False, + ) -> TFTokenClassifierOutput | tuple[tf.Tensor]: + r""" + labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. + """ + discriminator_hidden_states = self.electra( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + discriminator_sequence_output = discriminator_hidden_states[0] + discriminator_sequence_output = self.dropout(discriminator_sequence_output) + logits = self.classifier(discriminator_sequence_output) + loss = None if labels is None else self.hf_compute_loss(labels, logits) + + if not return_dict: + output = (logits,) + discriminator_hidden_states[1:] + + return ((loss,) + output) if loss is not None else output + + return TFTokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=discriminator_hidden_states.hidden_states, + attentions=discriminator_hidden_states.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "electra", None) is not None: + with tf.name_scope(self.electra.name): + self.electra.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build([None, None, self.config.hidden_size]) + + +@add_start_docstrings( + """ + Electra Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layers on top of the hidden-states output to compute `span start logits` and `span end logits`). 
+ """, + ELECTRA_START_DOCSTRING, +) +class TFElectraForQuestionAnswering(TFElectraPreTrainedModel, TFQuestionAnsweringLoss): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.num_labels = config.num_labels + self.electra = TFElectraMainLayer(config, name="electra") + self.qa_outputs = keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" + ) + self.config = config + + @unpack_inputs + @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint="bhadresh-savani/electra-base-squad2", + output_type=TFQuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + qa_target_start_index=11, + qa_target_end_index=12, + expected_output="'a nice puppet'", + expected_loss=2.64, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + start_positions: np.ndarray | tf.Tensor | None = None, + end_positions: np.ndarray | tf.Tensor | None = None, + training: bool | None = False, + ) -> TFQuestionAnsweringModelOutput | tuple[tf.Tensor]: + r""" + start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. 
+ """ + discriminator_hidden_states = self.electra( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + discriminator_sequence_output = discriminator_hidden_states[0] + logits = self.qa_outputs(discriminator_sequence_output) + start_logits, end_logits = tf.split(logits, 2, axis=-1) + start_logits = tf.squeeze(start_logits, axis=-1) + end_logits = tf.squeeze(end_logits, axis=-1) + loss = None + + if start_positions is not None and end_positions is not None: + labels = {"start_position": start_positions} + labels["end_position"] = end_positions + loss = self.hf_compute_loss(labels, (start_logits, end_logits)) + + if not return_dict: + output = ( + start_logits, + end_logits, + ) + discriminator_hidden_states[1:] + + return ((loss,) + output) if loss is not None else output + + return TFQuestionAnsweringModelOutput( + loss=loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=discriminator_hidden_states.hidden_states, + attentions=discriminator_hidden_states.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "electra", None) is not None: + with tf.name_scope(self.electra.name): + self.electra.build(None) + if getattr(self, "qa_outputs", None) is not None: + with tf.name_scope(self.qa_outputs.name): + self.qa_outputs.build([None, None, self.config.hidden_size]) + + +__all__ = [ + "TFElectraForMaskedLM", + "TFElectraForMultipleChoice", + "TFElectraForPreTraining", + "TFElectraForQuestionAnswering", + "TFElectraForSequenceClassification", + "TFElectraForTokenClassification", + "TFElectraModel", + "TFElectraPreTrainedModel", +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/electra/tokenization_electra.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/electra/tokenization_electra.py new file mode 100644 index 0000000000000000000000000000000000000000..d8971dd6f40374e1d8a6e8ec479cd9da79b64da3 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/electra/tokenization_electra.py @@ -0,0 +1,482 @@ +# coding=utf-8 +# Copyright 2020 The Google AI Team, Stanford University and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import collections +import os +import unicodedata +from typing import Optional + +from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace +from ...utils import logging + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} + + +# Copied from transformers.models.bert.tokenization_bert.load_vocab +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + with open(vocab_file, "r", encoding="utf-8") as reader: + tokens = reader.readlines() + for index, token in enumerate(tokens): + token = token.rstrip("\n") + vocab[token] = index + return vocab + + +# Copied from transformers.models.bert.tokenization_bert.whitespace_tokenize +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a piece of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +# Copied from transformers.models.bert.tokenization_bert.BertTokenizer with Bert->Electra,BERT->Electra +class ElectraTokenizer(PreTrainedTokenizer): + r""" + Construct a Electra tokenizer. Based on WordPiece. + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + File containing the vocabulary. + do_lower_case (`bool`, *optional*, defaults to `True`): + Whether or not to lowercase the input when tokenizing. + do_basic_tokenize (`bool`, *optional*, defaults to `True`): + Whether or not to do basic tokenization before WordPiece. + never_split (`Iterable`, *optional*): + Collection of tokens which will never be split during tokenization. Only has an effect when + `do_basic_tokenize=True` + unk_token (`str`, *optional*, defaults to `"[UNK]"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + sep_token (`str`, *optional*, defaults to `"[SEP]"`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + pad_token (`str`, *optional*, defaults to `"[PAD]"`): + The token used for padding, for example when batching sequences of different lengths. + cls_token (`str`, *optional*, defaults to `"[CLS]"`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + mask_token (`str`, *optional*, defaults to `"[MASK]"`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + tokenize_chinese_chars (`bool`, *optional*, defaults to `True`): + Whether or not to tokenize Chinese characters. + + This should likely be deactivated for Japanese (see this + [issue](https://github.com/huggingface/transformers/issues/328)). + strip_accents (`bool`, *optional*): + Whether or not to strip all accents. If this option is not specified, then it will be determined by the + value for `lowercase` (as in the original Electra). 
+ clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`): + Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like + extra spaces. + """ + + vocab_files_names = VOCAB_FILES_NAMES + + def __init__( + self, + vocab_file, + do_lower_case=True, + do_basic_tokenize=True, + never_split=None, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + tokenize_chinese_chars=True, + strip_accents=None, + clean_up_tokenization_spaces=True, + **kwargs, + ): + if not os.path.isfile(vocab_file): + raise ValueError( + f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained" + " model use `tokenizer = ElectraTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`" + ) + self.vocab = load_vocab(vocab_file) + self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()]) + self.do_basic_tokenize = do_basic_tokenize + if do_basic_tokenize: + self.basic_tokenizer = BasicTokenizer( + do_lower_case=do_lower_case, + never_split=never_split, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + ) + + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token)) + + super().__init__( + do_lower_case=do_lower_case, + do_basic_tokenize=do_basic_tokenize, + never_split=never_split, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + **kwargs, + ) + + @property + def do_lower_case(self): + return self.basic_tokenizer.do_lower_case + + @property + def vocab_size(self): + return len(self.vocab) + + def get_vocab(self): + return dict(self.vocab, **self.added_tokens_encoder) + + def _tokenize(self, text, split_special_tokens=False): + split_tokens = [] + if self.do_basic_tokenize: + for token in self.basic_tokenizer.tokenize( + text, never_split=self.all_special_tokens if not split_special_tokens else None + ): + # If the token is part of the never_split set + if token in self.basic_tokenizer.never_split: + split_tokens.append(token) + else: + split_tokens += self.wordpiece_tokenizer.tokenize(token) + else: + split_tokens = self.wordpiece_tokenizer.tokenize(text) + return split_tokens + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.vocab.get(token, self.vocab.get(self.unk_token)) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.ids_to_tokens.get(index, self.unk_token) + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + out_string = " ".join(tokens).replace(" ##", "").strip() + return out_string + + def build_inputs_with_special_tokens( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A Electra sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. 
+ token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + token_ids_1 + sep + + def get_special_tokens_mask( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False + ) -> list[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + index = 0 + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + else: + vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory + with open(vocab_file, "w", encoding="utf-8") as writer: + for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning( + f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive." + " Please check that the vocabulary is not corrupted!" + ) + index = token_index + writer.write(token + "\n") + index += 1 + return (vocab_file,) + + +# Copied from transformers.models.bert.tokenization_bert.BasicTokenizer +class BasicTokenizer: + """ + Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.). + + Args: + do_lower_case (`bool`, *optional*, defaults to `True`): + Whether or not to lowercase the input when tokenizing. + never_split (`Iterable`, *optional*): + Collection of tokens which will never be split during tokenization. Only has an effect when + `do_basic_tokenize=True` + tokenize_chinese_chars (`bool`, *optional*, defaults to `True`): + Whether or not to tokenize Chinese characters. + + This should likely be deactivated for Japanese (see this + [issue](https://github.com/huggingface/transformers/issues/328)). + strip_accents (`bool`, *optional*): + Whether or not to strip all accents. If this option is not specified, then it will be determined by the + value for `lowercase` (as in the original BERT). + do_split_on_punc (`bool`, *optional*, defaults to `True`): + In some instances we want to skip the basic punctuation splitting so that later tokenization can capture + the full context of the words, such as contractions. 
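A short doctest-style illustration of the options above (editorial sketch, not part of the original file):

```python
>>> tokenizer = BasicTokenizer(do_lower_case=True)
>>> tokenizer.tokenize("Hello, WORLD!")
['hello', ',', 'world', '!']
```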
+ """ + + def __init__( + self, + do_lower_case=True, + never_split=None, + tokenize_chinese_chars=True, + strip_accents=None, + do_split_on_punc=True, + ): + if never_split is None: + never_split = [] + self.do_lower_case = do_lower_case + self.never_split = set(never_split) + self.tokenize_chinese_chars = tokenize_chinese_chars + self.strip_accents = strip_accents + self.do_split_on_punc = do_split_on_punc + + def tokenize(self, text, never_split=None): + """ + Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer. + + Args: + never_split (`List[str]`, *optional*) + Kept for backward compatibility purposes. Now implemented directly at the base class level (see + [`PreTrainedTokenizer.tokenize`]) List of token not to split. + """ + # union() returns a new set by concatenating the two sets. + never_split = self.never_split.union(set(never_split)) if never_split else self.never_split + text = self._clean_text(text) + + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). + if self.tokenize_chinese_chars: + text = self._tokenize_chinese_chars(text) + # prevents treating the same character with different unicode codepoints as different characters + unicode_normalized_text = unicodedata.normalize("NFC", text) + orig_tokens = whitespace_tokenize(unicode_normalized_text) + split_tokens = [] + for token in orig_tokens: + if token not in never_split: + if self.do_lower_case: + token = token.lower() + if self.strip_accents is not False: + token = self._run_strip_accents(token) + elif self.strip_accents: + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token, never_split)) + + output_tokens = whitespace_tokenize(" ".join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text, never_split=None): + """Splits punctuation on a piece of text.""" + if not self.do_split_on_punc or (never_split is not None and text in never_split): + return [text] + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # 
despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. + if ( + (cp >= 0x4E00 and cp <= 0x9FFF) + or (cp >= 0x3400 and cp <= 0x4DBF) + or (cp >= 0x20000 and cp <= 0x2A6DF) + or (cp >= 0x2A700 and cp <= 0x2B73F) + or (cp >= 0x2B740 and cp <= 0x2B81F) + or (cp >= 0x2B820 and cp <= 0x2CEAF) + or (cp >= 0xF900 and cp <= 0xFAFF) + or (cp >= 0x2F800 and cp <= 0x2FA1F) + ): + return True + + return False + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xFFFD or _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + + +# Copied from transformers.models.bert.tokenization_bert.WordpieceTokenizer +class WordpieceTokenizer: + """Runs WordPiece tokenization.""" + + def __init__(self, vocab, unk_token, max_input_chars_per_word=100): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, text): + """ + Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform + tokenization using the given vocabulary. + + For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`. + + Args: + text: A single token or whitespace separated tokens. This should have + already been passed through *BasicTokenizer*. + + Returns: + A list of wordpiece tokens. + """ + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = "".join(chars[start:end]) + if start > 0: + substr = "##" + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens + + +__all__ = ["ElectraTokenizer"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/electra/tokenization_electra_fast.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/electra/tokenization_electra_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..db0285581ed1eea5b903a3bed573bbf6408e0167 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/electra/tokenization_electra_fast.py @@ -0,0 +1,143 @@ +# coding=utf-8 +# Copyright 2020 The Google AI Team, Stanford University and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import json +from typing import Optional + +from tokenizers import normalizers + +from ...tokenization_utils_fast import PreTrainedTokenizerFast +from .tokenization_electra import ElectraTokenizer + + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"} + + +# Copied from transformers.models.bert.tokenization_bert_fast.BertTokenizerFast with Bert->Electra , BERT->ELECTRA +class ElectraTokenizerFast(PreTrainedTokenizerFast): + r""" + Construct a "fast" ELECTRA tokenizer (backed by HuggingFace's *tokenizers* library). Based on WordPiece. + + This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should + refer to this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + File containing the vocabulary. + do_lower_case (`bool`, *optional*, defaults to `True`): + Whether or not to lowercase the input when tokenizing. + unk_token (`str`, *optional*, defaults to `"[UNK]"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + sep_token (`str`, *optional*, defaults to `"[SEP]"`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + pad_token (`str`, *optional*, defaults to `"[PAD]"`): + The token used for padding, for example when batching sequences of different lengths. + cls_token (`str`, *optional*, defaults to `"[CLS]"`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + mask_token (`str`, *optional*, defaults to `"[MASK]"`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + clean_text (`bool`, *optional*, defaults to `True`): + Whether or not to clean the text before tokenization by removing any control characters and replacing all + whitespaces by the classic one. + tokenize_chinese_chars (`bool`, *optional*, defaults to `True`): + Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see [this + issue](https://github.com/huggingface/transformers/issues/328)). + strip_accents (`bool`, *optional*): + Whether or not to strip all accents. If this option is not specified, then it will be determined by the + value for `lowercase` (as in the original ELECTRA). + wordpieces_prefix (`str`, *optional*, defaults to `"##"`): + The prefix for subwords. 
+ """ + + vocab_files_names = VOCAB_FILES_NAMES + slow_tokenizer_class = ElectraTokenizer + + def __init__( + self, + vocab_file=None, + tokenizer_file=None, + do_lower_case=True, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + tokenize_chinese_chars=True, + strip_accents=None, + **kwargs, + ): + super().__init__( + vocab_file, + tokenizer_file=tokenizer_file, + do_lower_case=do_lower_case, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + **kwargs, + ) + + normalizer_state = json.loads(self.backend_tokenizer.normalizer.__getstate__()) + if ( + normalizer_state.get("lowercase", do_lower_case) != do_lower_case + or normalizer_state.get("strip_accents", strip_accents) != strip_accents + or normalizer_state.get("handle_chinese_chars", tokenize_chinese_chars) != tokenize_chinese_chars + ): + normalizer_class = getattr(normalizers, normalizer_state.pop("type")) + normalizer_state["lowercase"] = do_lower_case + normalizer_state["strip_accents"] = strip_accents + normalizer_state["handle_chinese_chars"] = tokenize_chinese_chars + self.backend_tokenizer.normalizer = normalizer_class(**normalizer_state) + + self.do_lower_case = do_lower_case + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A ELECTRA sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. 
+ """ + output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + + if token_ids_1 is not None: + output += token_ids_1 + [self.sep_token_id] + + return output + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + files = self._tokenizer.model.save(save_directory, name=filename_prefix) + return tuple(files) + + +__all__ = ["ElectraTokenizerFast"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/encoder_decoder/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/encoder_decoder/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0450d22c921d0238b4f4b6021d5ff68a7db9cacd Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/encoder_decoder/__pycache__/__init__.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/encoder_decoder/__pycache__/configuration_encoder_decoder.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/encoder_decoder/__pycache__/configuration_encoder_decoder.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..07348bd0632fdef84b9add8008f0a281a5f90ef0 Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/encoder_decoder/__pycache__/configuration_encoder_decoder.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/encoder_decoder/__pycache__/modeling_encoder_decoder.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/encoder_decoder/__pycache__/modeling_encoder_decoder.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8885f14628544168fe7dd07af3a989c6484e03d6 Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/encoder_decoder/__pycache__/modeling_encoder_decoder.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/encoder_decoder/__pycache__/modeling_flax_encoder_decoder.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/encoder_decoder/__pycache__/modeling_flax_encoder_decoder.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d972949263ce3cccc13211c8929b5ab24be1d773 Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/encoder_decoder/__pycache__/modeling_flax_encoder_decoder.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/encoder_decoder/__pycache__/modeling_tf_encoder_decoder.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/encoder_decoder/__pycache__/modeling_tf_encoder_decoder.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f8d9c0533475e7a60cd4d7f90ede3568e544ba00 Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/encoder_decoder/__pycache__/modeling_tf_encoder_decoder.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/eomt/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/eomt/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9f4fe6327b312ff5f60ffb08c4b76566bf63f3f9 --- /dev/null +++ 
b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/eomt/__init__.py @@ -0,0 +1,29 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_eomt import * + from .image_processing_eomt import * + from .image_processing_eomt_fast import * + from .modeling_eomt import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/eomt/configuration_eomt.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/eomt/configuration_eomt.py new file mode 100644 index 0000000000000000000000000000000000000000..670250721150e60df3d5da9280197cdad461beef --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/eomt/configuration_eomt.py @@ -0,0 +1,168 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/eomt/modular_eomt.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_eomt.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2025 Mobile Perception Systems Lab at TU/e and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...configuration_utils import PretrainedConfig + + +class EomtConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`EomtForUniversalSegmentation`]. It is used to instantiate an EoMT model + according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the EoMT + [tue-mps/coco_panoptic_eomt_large_640](https://huggingface.co/tue-mps/coco_panoptic_eomt_large_640) + architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. 
+ + Args: + hidden_size (`int`, *optional*, defaults to 1024): + Dimensionality of the hidden representations. + num_hidden_layers (`int`, *optional*, defaults to 24): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads in each attention layer. + mlp_ratio (`int`, *optional*, defaults to 4): + Ratio of the MLP hidden dimensionality to the hidden size. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder. + hidden_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the embeddings and encoder. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the layer normalization layers. + image_size (`int`, *optional*, defaults to 640): + The size (resolution) of each input image. + patch_size (`int`, *optional*, defaults to 16): + The size (resolution) of each patch. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + layerscale_value (`float`, *optional*, defaults to 1.0): + Initial value for the LayerScale parameter. + drop_path_rate (`float`, *optional*, defaults to 0.0): + The stochastic depth rate (drop path) used during training. + num_upscale_blocks (`int`, *optional*, defaults to 2): + Number of upsampling blocks used in the decoder or segmentation head. + attention_dropout (`float`, *optional*, defaults to 0.0): + Dropout probability applied after attention projection. + use_swiglu_ffn (`bool`, *optional*, defaults to `False`): + Whether to use the SwiGLU feedforward neural network. + num_blocks (`int`, *optional*, defaults to 4): + Number of feature blocks or stages in the architecture. + no_object_weight (`float`, *optional*, defaults to 0.1): + Loss weight for the 'no object' class in panoptic/instance segmentation. + class_weight (`float`, *optional*, defaults to 2.0): + Loss weight for classification targets. + mask_weight (`float`, *optional*, defaults to 5.0): + Loss weight for mask prediction. + dice_weight (`float`, *optional*, defaults to 5.0): + Loss weight for the dice loss component. + train_num_points (`int`, *optional*, defaults to 12544): + Number of points to sample for mask loss computation during training. + oversample_ratio (`float`, *optional*, defaults to 3.0): + Oversampling ratio used in point sampling for mask training. + importance_sample_ratio (`float`, *optional*, defaults to 0.75): + Ratio of points to sample based on importance during training. + num_queries (`int`, *optional*, defaults to 200): + Number of object queries in the Transformer. + num_register_tokens (`int`, *optional*, defaults to 4): + Number of learnable register tokens added to the transformer input. 
+ + Example: + + ```python + >>> from transformers import EomtConfig, EomtForUniversalSegmentation + + >>> # Initialize configuration + >>> config = EomtConfig() + + >>> # Initialize model + >>> model = EomtForUniversalSegmentation(config) + + >>> # Access config + >>> config = model.config + ```""" + + model_type = "eomt" + + def __init__( + self, + hidden_size=1024, + num_hidden_layers=24, + num_attention_heads=16, + mlp_ratio=4, + hidden_act="gelu", + hidden_dropout_prob=0.0, + initializer_range=0.02, + layer_norm_eps=1e-6, + image_size=640, + patch_size=16, + num_channels=3, + layerscale_value=1.0, + drop_path_rate=0.0, + num_upscale_blocks=2, + attention_dropout=0.0, + use_swiglu_ffn=False, + num_blocks=4, + no_object_weight: float = 0.1, + class_weight: float = 2.0, + mask_weight: float = 5.0, + dice_weight: float = 5.0, + train_num_points: int = 12544, + oversample_ratio: float = 3.0, + importance_sample_ratio: float = 0.75, + num_queries=200, + num_register_tokens=4, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + + self.mlp_ratio = mlp_ratio + self.attention_dropout = attention_dropout + self.layerscale_value = layerscale_value + self.drop_path_rate = drop_path_rate + self.num_upscale_blocks = num_upscale_blocks + self.use_swiglu_ffn = use_swiglu_ffn + self.num_blocks = num_blocks + self.no_object_weight = no_object_weight + self.class_weight = class_weight + self.mask_weight = mask_weight + self.dice_weight = dice_weight + self.train_num_points = train_num_points + self.oversample_ratio = oversample_ratio + self.importance_sample_ratio = importance_sample_ratio + self.num_queries = num_queries + self.num_register_tokens = num_register_tokens + + +__all__ = ["EomtConfig"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/eomt/image_processing_eomt.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/eomt/image_processing_eomt.py new file mode 100644 index 0000000000000000000000000000000000000000..2b786ce39e71a8710fc46f053a4570e90444830f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/eomt/image_processing_eomt.py @@ -0,0 +1,973 @@ +# coding=utf-8 +# Copyright 2025 Mobile Perception Systems Lab at TU/e and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
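+# Processing overview: images are resized so their shorter side matches size["shortest_edge"]
+# (capped by size["longest_edge"]), then either split into overlapping square patches
+# (do_split_image=True, used for semantic segmentation) or zero-padded at the bottom/right to
+# the square target size, before optional rescaling and normalization.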
+"""Image processor class for EoMT.""" + +import math +from typing import Optional, Union + +import numpy as np + +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ...image_transforms import ( + PaddingMode, + pad, + resize, +) +from ...image_utils import ( + ChannelDimension, + ImageInput, + PILImageResampling, + get_image_size, + infer_channel_dimension_format, + make_flat_list_of_images, + to_numpy_array, + valid_images, + validate_preprocess_arguments, +) +from ...utils import ( + IMAGENET_DEFAULT_MEAN, + IMAGENET_DEFAULT_STD, + TensorType, + filter_out_non_signature_kwargs, + is_torch_available, + logging, +) + + +logger = logging.get_logger(__name__) + +if is_torch_available(): + import torch + import torch.nn.functional as F + + +# Adapted from transformers.models.maskformer.image_processing_maskformer.convert_segmentation_map_to_binary_masks +def convert_segmentation_map_to_binary_masks( + segmentation_map: np.ndarray, + instance_id_to_semantic_id: Optional[dict[int, int]] = None, + ignore_index: Optional[int] = None, +): + if ignore_index is not None: + segmentation_map = np.where(segmentation_map == 0, ignore_index, segmentation_map - 1) + + # Get unique ids (class or instance ids based on input) + all_labels = np.unique(segmentation_map) + + # Drop background label if applicable + if ignore_index is not None: + all_labels = all_labels[all_labels != ignore_index] + + # Generate a binary mask for each object instance + binary_masks = [(segmentation_map == i) for i in all_labels] + + # Stack the binary masks + if binary_masks: + binary_masks = np.stack(binary_masks, axis=0) + else: + binary_masks = np.zeros((0, *segmentation_map.shape)) + + # Convert instance ids to class ids + if instance_id_to_semantic_id is not None: + labels = np.zeros(all_labels.shape[0]) + + for label in all_labels: + class_id = instance_id_to_semantic_id[label + 1 if ignore_index is not None else label] + labels[all_labels == label] = class_id - 1 if ignore_index is not None else class_id + else: + labels = all_labels + + return binary_masks.astype(np.float32), labels.astype(np.int64) + + +def get_size_with_aspect_ratio(image_size, size, max_size=None) -> tuple[int, int]: + """ + Computes the output image size given the input image size and the desired output size. + + Args: + image_size (`tuple[int, int]`): + The input image size. + size (`int`): + The desired output size. + max_size (`int`, *optional*): + The maximum allowed output size. 
+ """ + height, width = image_size + raw_size = None + if max_size is not None: + min_original_size = float(min((height, width))) + max_original_size = float(max((height, width))) + if max_original_size / min_original_size * size > max_size: + raw_size = max_size * min_original_size / max_original_size + size = int(round(raw_size)) + + if (height <= width and height == size) or (width <= height and width == size): + oh, ow = height, width + elif width < height: + ow = size + if max_size is not None and raw_size is not None: + oh = round(raw_size * height / width) + else: + oh = round(size * height / width) + else: + oh = size + if max_size is not None and raw_size is not None: + ow = round(raw_size * width / height) + else: + ow = round(size * width / height) + + return (oh, ow) + + +# Copied from transformers.models.detr.image_processing_detr.remove_low_and_no_objects +def remove_low_and_no_objects(masks, scores, labels, object_mask_threshold, num_labels): + """ + Binarize the given masks using `object_mask_threshold`, it returns the associated values of `masks`, `scores` and + `labels`. + + Args: + masks (`torch.Tensor`): + A tensor of shape `(num_queries, height, width)`. + scores (`torch.Tensor`): + A tensor of shape `(num_queries)`. + labels (`torch.Tensor`): + A tensor of shape `(num_queries)`. + object_mask_threshold (`float`): + A number between 0 and 1 used to binarize the masks. + Raises: + `ValueError`: Raised when the first dimension doesn't match in all input tensors. + Returns: + `tuple[`torch.Tensor`, `torch.Tensor`, `torch.Tensor`]`: The `masks`, `scores` and `labels` without the region + < `object_mask_threshold`. + """ + if not (masks.shape[0] == scores.shape[0] == labels.shape[0]): + raise ValueError("mask, scores and labels must have the same shape!") + + to_keep = labels.ne(num_labels) & (scores > object_mask_threshold) + + return masks[to_keep], scores[to_keep], labels[to_keep] + + +def check_segment_validity(mask_labels, mask_probs, k, mask_threshold=0.5, overlap_mask_area_threshold=0.8): + # Get the mask associated with the k class + mask_k = mask_labels == k + mask_k_area = mask_k.sum() + + # Compute the area of all the stuff in query k + original_mask = mask_probs[k] >= mask_threshold + original_area = original_mask.sum() + + final_mask = mask_k & original_mask + final_mask_area = final_mask.sum() + + mask_exists = mask_k_area > 0 and original_area > 0 and final_mask_area > 0 + + if mask_exists: + area_ratio = mask_k_area / original_area + if not area_ratio.item() > overlap_mask_area_threshold: + mask_exists = False + + return mask_exists, final_mask + + +def compute_segments( + mask_probs, + pred_scores, + pred_labels, + stuff_classes, + mask_threshold: float = 0.5, + overlap_mask_area_threshold: float = 0.8, + target_size: Optional[tuple[int, int]] = None, +): + height = mask_probs.shape[1] if target_size is None else target_size[0] + width = mask_probs.shape[2] if target_size is None else target_size[1] + + segmentation = torch.zeros((height, width), dtype=torch.long, device=mask_probs.device) - 1 + segments: list[dict] = [] + + # Compute per-pixel assignment based on weighted mask scores + mask_probs = mask_probs.sigmoid() + mask_labels = (pred_scores[:, None, None] * mask_probs).argmax(0) + + # Keep track of instances of each class + current_segment_id = 0 + stuff_memory_list: dict[str, int] = {} + + for k in range(pred_labels.shape[0]): + pred_class = pred_labels[k].item() + + # Check if mask exists and large enough to be a segment + mask_exists, final_mask 
= check_segment_validity( + mask_labels, mask_probs, k, mask_threshold, overlap_mask_area_threshold + ) + + if not mask_exists: + continue + + if stuff_classes and pred_class in stuff_classes: + if pred_class in stuff_memory_list: + segmentation[final_mask] = stuff_memory_list[pred_class] + continue + else: + stuff_memory_list[pred_class] = current_segment_id + + segmentation[final_mask] = current_segment_id + segment_score = round(pred_scores[k].item(), 6) + segments.append( + { + "id": current_segment_id, + "label_id": pred_class, + "score": segment_score, + } + ) + current_segment_id += 1 + return segmentation, segments + + +def get_target_size(size_dict: dict[str, int]) -> tuple[int, int]: + """Returns the height and width from a size dict.""" + target_height = size_dict["shortest_edge"] + target_width = size_dict.get("longest_edge") or target_height + + return target_height, target_width + + +class EomtImageProcessor(BaseImageProcessor): + r""" + Constructs a EoMT image processor. The image processor can be used to prepare image(s) and optional targets + for the model. + + This image processor inherits from [`BaseImageProcessor`] which contains most of the main methods. Users should + refer to this superclass for more information regarding those methods. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the input to a certain `size`. + size (`int`, *optional*, defaults to 640): + Resize the input to the given size. Only has an effect if `do_resize` is set to `True`. If size is a + sequence like `(width, height)`, output size will be matched to this. If size is an int, smaller edge of + the image will be matched to this number. i.e, if `height > width`, then image will be rescaled to `(size * + height / width, size)`. + resample (`int`, *optional*, defaults to `Resampling.BILINEAR`): + An optional resampling filter. This can be one of `PIL.Image.Resampling.NEAREST`, + `PIL.Image.Resampling.BOX`, `PIL.Image.Resampling.BILINEAR`, `PIL.Image.Resampling.HAMMING`, + `PIL.Image.Resampling.BICUBIC` or `PIL.Image.Resampling.LANCZOS`. Only has an effect if `do_resize` is set + to `True`. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the input to a certain `scale`. + rescale_factor (`float`, *optional*, defaults to `1/ 255`): + Rescale the input by the given factor. Only has an effect if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether or not to normalize the input with mean and standard deviation. + do_split_image (`bool`, *optional*, defaults to `False`): + Whether to split the input images into overlapping patches for semantic segmentation. If set to `True`, the + input images will be split into patches of size `size["shortest_edge"]` with an overlap between patches. + Otherwise, the input images will be padded to the target size. + do_pad (`bool`, *optional*, defaults to `False`): + Whether to pad the image. If `True`, will pad the patch dimension of the images in the batch to the largest + number of patches in the batch. Padding will be applied to the bottom and right with zeros. + image_mean (`int`, *optional*, defaults to `[0.485, 0.456, 0.406]`): + The sequence of means for each channel, to be used when normalizing images. Defaults to the ImageNet mean. + image_std (`int`, *optional*, defaults to `[0.229, 0.224, 0.225]`): + The sequence of standard deviations for each channel, to be used when normalizing images. Defaults to the + ImageNet std. 
+ ignore_index (`int`, *optional*): + Label to be assigned to background pixels in segmentation maps. If provided, segmentation map pixels + denoted with 0 (background) will be replaced with `ignore_index`. + num_labels (`int`, *optional*): + The number of labels in the segmentation map. + """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: Optional[dict[str, int]] = None, + resample: PILImageResampling = PILImageResampling.BILINEAR, + do_rescale: bool = True, + rescale_factor: float = 1 / 255, + do_normalize: bool = True, + do_split_image: bool = False, + do_pad: bool = False, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + ignore_index: Optional[int] = None, + num_labels: Optional[int] = None, + **kwargs, + ): + super().__init__(**kwargs) + + size = size if size is not None else {"shortest_edge": 640, "longest_edge": 640} + size = get_size_dict(size, default_to_square=False) + + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.do_split_image = do_split_image + self.do_pad = do_pad + self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN + self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD + self.ignore_index = ignore_index + self.num_labels = num_labels + + def resize( + self, + image: np.ndarray, + size: dict, + resample: PILImageResampling = PILImageResampling.BILINEAR, + data_format=None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize an image. The shortest edge of the image is resized to size["shortest_edge"], with the longest edge + resized to keep the input aspect ratio. + + Args: + image (`np.ndarray`): + Image to resize. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): + Resampling filter to use when resizing the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred.
+ """ + image_size = get_image_size(image) + output_size = get_size_with_aspect_ratio(image_size, size["shortest_edge"], size["longest_edge"]) + + image = resize( + image=image, + size=output_size, + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + return_numpy=True, + **kwargs, + ) + + return image + + def _split_image(self, image: ImageInput, size: dict, image_index: int) -> tuple[list, list]: + """Slices an image into overlapping patches for semantic segmentation.""" + + patches, patch_offsets = [], [] + + image_size = get_image_size(image) + patch_size = size["shortest_edge"] + + longer_side = max(image_size) + num_patches = math.ceil(longer_side / patch_size) + total_overlap = num_patches * patch_size - longer_side + overlap_per_patch = total_overlap / (num_patches - 1) if num_patches > 1 else 0 + + for i in range(num_patches): + start = int(i * (patch_size - overlap_per_patch)) + end = start + patch_size + + if image_size[0] > image_size[1]: + patch = image[:, start:end, :] + else: + patch = image[:, :, start:end] + + patches.append(patch) + patch_offsets.append([image_index, start, end]) + + return patches, patch_offsets + + def _pad(self, image: ImageInput, size: dict) -> np.ndarray: + """Pads the image to the target size using zero padding.""" + height, width = get_image_size(image) + + target_height, target_width = get_target_size(size) + pad_h = max(0, target_height - height) + pad_w = max(0, target_width - width) + + padding = ((0, pad_h), (0, pad_w)) + + # Channel axis is last; default padding format is compatible + padded_image = pad(image=image, padding=padding, mode=PaddingMode.CONSTANT, constant_values=0.0) + return padded_image + + def _preprocess_images( + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Optional[dict[str, int]] = None, + resample: Optional[PILImageResampling] = None, + do_split_image: Optional[bool] = None, + do_pad: Optional[bool] = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> np.ndarray: + """Preprocesses a batch of images.""" + images = [to_numpy_array(image) for image in images] + + if do_resize: + images = [ + self.resize( + image, + size=size, + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + ) + for image in images + ] + + processed_images, patch_offsets = [], [] + + if do_split_image: + for idx, img in enumerate(images): + patches, offsets = self._split_image(img, size, idx) + processed_images.extend(patches) + patch_offsets.extend(offsets) + + images = processed_images + + if do_pad: + images = [self._pad(img, size) for img in images] + + if do_rescale: + images = [self.rescale(img, scale=rescale_factor, input_data_format=input_data_format) for img in images] + + if do_normalize: + images = [ + self.normalize( + image, + mean=image_mean, + std=image_std, + input_data_format=input_data_format, + ) + for image in images + ] + + return images, patch_offsets + + def _preprocess_mask( + self, + segmentation_map: ImageInput, + do_resize: Optional[bool] = False, + do_pad: Optional[bool] = False, + size: Optional[dict[str, int]] = None, + resample: Optional[PILImageResampling] = None, + data_format: Union[str, ChannelDimension] = 
None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> "torch.Tensor": + """Preprocesses a single mask.""" + # Add channel dimension if missing - needed for certain transformations + if segmentation_map.ndim == 2: + added_channel_dim = True + segmentation_map = segmentation_map[None, ...] + input_data_format = ChannelDimension.FIRST + else: + added_channel_dim = False + if input_data_format is None: + input_data_format = infer_channel_dimension_format(segmentation_map) + + if do_resize: + segmentation_map = self.resize( + segmentation_map, + size=size, + resample=resample, + data_format=data_format, + ) + + if do_pad: + segmentation_map = self._pad(segmentation_map, size) + + # Remove extra channel dimension if added for processing + if added_channel_dim: + segmentation_map = segmentation_map.squeeze(0) + return torch.from_numpy(segmentation_map) + + @filter_out_non_signature_kwargs() + def preprocess( + self, + images: ImageInput, + segmentation_maps: Optional[ImageInput] = None, + instance_id_to_semantic_id: Optional[dict[int, int]] = None, + do_split_image: Optional[bool] = None, + do_resize: Optional[bool] = None, + size: Optional[dict[str, int]] = None, + resample: Optional[PILImageResampling] = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + do_pad: Optional[bool] = None, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + ignore_index: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> BatchFeature: + """ + Preprocesses images or a batch of images. + + Args: + images (`ImageInput`): + Image or batch of images to preprocess. + segmentation_maps (`ImageInput`, *optional*): + The corresponding semantic segmentation maps with the pixel-wise annotations. + instance_id_to_semantic_id (`list[dict[int, int]]` or `dict[int, int]`, *optional*): + A mapping between object instance ids and class ids. + do_split_image (`bool`, *optional*, defaults to `self.do_split_image`): + Whether to split the input images into overlapping patches for semantic segmentation. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the input images. + size (`dict[str, int]`, *optional*, defaults to `self.size`): + Target size as a dictionary with `"shortest_edge"` and `"longest_edge"` keys. + resample (`PILImageResampling`, *optional*, defaults to `self.resample`): + Resampling filter to use when resizing. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the input images by `rescale_factor`. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Factor to scale image pixel values. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the input images. + do_pad (`bool`, *optional*, defaults to `self.do_pad`): + Whether to pad the image. If `True`, will pad the patch dimension of the images in the batch to the largest + number of patches in the batch. Padding will be applied to the bottom and right with zeros. + image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`): + Mean for normalization. Single value or list for each channel.
+ image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`): + Standard deviation for normalization. Single value or list for each channel. + ignore_index (`int`, *optional*): + Label to be assigned to background pixels in segmentation maps. If provided, segmentation map pixels + denoted with 0 (background) will be replaced with `ignore_index`. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be `"pt"`, `"tf"`, `"np"`, or `"jax"`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + Channel format of the output image. Either `"channels_first"` or `"channels_last"`. + input_data_format (`ChannelDimension` or `str`, *optional*): + Channel format of the input image. + """ + + do_split_image = do_split_image if do_split_image is not None else self.do_split_image + do_resize = do_resize if do_resize is not None else self.do_resize + size = size if size is not None else self.size + size = get_size_dict(size, default_to_square=False) + resample = resample if resample is not None else self.resample + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + do_pad = do_pad if do_pad is not None else self.do_pad + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + ignore_index = ignore_index if ignore_index is not None else self.ignore_index + + images = self.fetch_images(images) + images = make_flat_list_of_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." 
+ ) + + validate_preprocess_arguments( + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_resize=do_resize, + size=size, + resample=resample, + ) + + pixel_values_list, patch_offsets = self._preprocess_images( + images=images, + do_resize=do_resize, + size=size, + resample=resample, + do_split_image=do_split_image, + do_pad=do_pad, + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + data_format=data_format, + input_data_format=input_data_format, + ) + + if segmentation_maps is not None: + segmentation_maps = make_flat_list_of_images(segmentation_maps, expected_ndims=2) + segmentation_maps = [to_numpy_array(mask) for mask in segmentation_maps] + + segmentation_maps = [ + self._preprocess_mask( + segmentation_map, + do_resize=do_resize, + do_pad=do_pad, + size=size, + resample=PILImageResampling.NEAREST, + data_format=data_format, + input_data_format=input_data_format, + ) + for segmentation_map in segmentation_maps + ] + + encoded_inputs = self.encode_inputs( + pixel_values_list, + segmentation_maps, + instance_id_to_semantic_id, + ignore_index, + return_tensors, + input_data_format=data_format, + ) + + if do_split_image and patch_offsets: + encoded_inputs["patch_offsets"] = [torch.tensor(offsets) for offsets in patch_offsets] + + return encoded_inputs + + def encode_inputs( + self, + pixel_values_list: list[ImageInput], + segmentation_maps: Optional[ImageInput] = None, + instance_id_to_semantic_id: Optional[Union[list[dict[int, int]], dict[int, int]]] = None, + ignore_index: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ): + """ + Pad images up to the largest image in a batch and create a corresponding `pixel_mask`. + + EoMT addresses semantic segmentation with a mask classification paradigm, thus input segmentation maps + will be converted to lists of binary masks and their respective labels. Let's see an example, assuming + `segmentation_maps = [[2,6,7,9]]`, the output will contain `mask_labels = + [[1,0,0,0],[0,1,0,0],[0,0,1,0],[0,0,0,1]]` (four binary masks) and `class_labels = [2,6,7,9]`, the labels for + each mask. + + Args: + pixel_values_list (`list[ImageInput]`): + list of images (pixel values) to be padded. Each image should be a tensor of shape `(channels, height, + width)`. + + segmentation_maps (`ImageInput`, *optional*): + The corresponding semantic segmentation maps with the pixel-wise annotations. + + (`bool`, *optional*, defaults to `True`): + Whether or not to pad images up to the largest image in a batch and create a pixel mask. + + If left to the default, will return a pixel mask that is: + + - 1 for pixels that are real (i.e. **not masked**), + - 0 for pixels that are padding (i.e. **masked**). + + instance_id_to_semantic_id (`list[dict[int, int]]` or `dict[int, int]`, *optional*): + A mapping between object instance ids and class ids. If passed, `segmentation_maps` is treated as an + instance segmentation map where each pixel represents an instance id. Can be provided as a single + dictionary with a global/dataset-level mapping or as a list of dictionaries (one per image), to map + instance ids in each image separately. + + return_tensors (`str` or [`~file_utils.TensorType`], *optional*): + If set, will return tensors instead of NumPy arrays. 
If set to `'pt'`, return PyTorch `torch.Tensor` + objects. + + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **pixel_values** -- Pixel values to be fed to a model. + - **mask_labels** -- Optional list of mask labels of shape `(labels, height, width)` to be fed to a model + (when `annotations` are provided). + - **class_labels** -- Optional list of class labels of shape `(labels)` to be fed to a model (when + `annotations` are provided). They identify the labels of `mask_labels`, e.g. the label of + `mask_labels[i][j]` is `class_labels[i][j]`. + """ + ignore_index = self.ignore_index if ignore_index is None else ignore_index + + pixel_values_list = [to_numpy_array(pixel_values) for pixel_values in pixel_values_list] + + if input_data_format is None: + input_data_format = infer_channel_dimension_format(pixel_values_list[0]) + + encoded_inputs = BatchFeature({"pixel_values": pixel_values_list}, tensor_type=return_tensors) + + if segmentation_maps is not None: + mask_labels = [] + class_labels = [] + # Convert to list of binary masks and labels + for idx, segmentation_map in enumerate(segmentation_maps): + segmentation_map = to_numpy_array(segmentation_map) + if isinstance(instance_id_to_semantic_id, list): + instance_id = instance_id_to_semantic_id[idx] + else: + instance_id = instance_id_to_semantic_id + # Use instance2class_id mapping per image + masks, classes = convert_segmentation_map_to_binary_masks( + segmentation_map, + instance_id, + ignore_index=ignore_index, + ) + + mask_labels.append(torch.from_numpy(masks)) + class_labels.append(torch.from_numpy(classes)) + + # we cannot batch them since they don't share a common class size + encoded_inputs["mask_labels"] = mask_labels + encoded_inputs["class_labels"] = class_labels + + return encoded_inputs + + def merge_image_patches( + self, + segmentation_logits: torch.Tensor, + patch_offsets: list[tuple[int, int, int]], + target_sizes: list[tuple[int, int]], + size: dict[str, int], + ) -> list[torch.Tensor]: + """ + Reconstructs full-size semantic segmentation logits from patch predictions. + + Args: + segmentation_logits (`torch.Tensor`): + A tensor of shape `(num_patches, num_classes, patch_height, patch_width)` representing predicted logits + for each image patch. + patch_offsets (`list[tuple[int, int, int]]`): + A list of tuples where each tuple contains: + - `image_index` (int): Index of the original image this patch belongs to. + - `start` (int): Start pixel index of the patch along the long dimension (height or width). + - `end` (int): End pixel index of the patch along the long dimension. + target_sizes (`list[tuple[int, int]]`): + List of original (height, width) dimensions for each image before preprocessing. + size (`dict[str, int]`): + A size dict which was used to resize.
+ """ + num_classes = segmentation_logits.shape[1] + aggregated_logits = [] + patch_counts = [] + + for image_size in target_sizes: + height, width = get_size_with_aspect_ratio(image_size, size["shortest_edge"], size["longest_edge"]) + aggregated_logits.append(torch.zeros((num_classes, height, width), device=segmentation_logits.device)) + patch_counts.append(torch.zeros((num_classes, height, width), device=segmentation_logits.device)) + + # Stitch patches back into full-sized logit maps + for patch_idx, (image_idx, patch_start, patch_end) in enumerate(patch_offsets): + if target_sizes[image_idx][0] > target_sizes[image_idx][1]: + aggregated_logits[image_idx][:, patch_start:patch_end, :] += segmentation_logits[patch_idx] + patch_counts[image_idx][:, patch_start:patch_end, :] += 1 + else: + aggregated_logits[image_idx][:, :, patch_start:patch_end] += segmentation_logits[patch_idx] + patch_counts[image_idx][:, :, patch_start:patch_end] += 1 + + # Normalize and resize logits to original image size + reconstructed_logits = [] + for idx, (logit_sum, count) in enumerate(zip(aggregated_logits, patch_counts)): + averaged_logits = logit_sum / count.clamp(min=1) + resized_logits = F.interpolate( + averaged_logits[None, ...], + size=target_sizes[idx], + mode="bilinear", + align_corners=False, + )[0] + + reconstructed_logits.append(resized_logits) + + return reconstructed_logits + + def unpad_image( + self, + segmentation_logits: torch.Tensor, + target_sizes: list[tuple[int, int]], + size: dict[str, int], + ) -> list[torch.Tensor]: + """Restores panoptic segmentation logits to their original image resolutions.""" + + resized_logits = [] + + for idx, original_size in enumerate(target_sizes): + target_height, target_width = get_size_with_aspect_ratio( + original_size, size["shortest_edge"], size["longest_edge"] + ) + cropped_logits = segmentation_logits[idx][:, :target_height, :target_width] + upsampled_logits = F.interpolate( + cropped_logits[None, ...], size=original_size, mode="bilinear", align_corners=False + )[0] + resized_logits.append(upsampled_logits) + return resized_logits + + def post_process_semantic_segmentation( + self, + outputs, + target_sizes: list[tuple[int, int]], + size: Optional[dict[str, int]] = None, + ) -> np.ndarray: + """Post-processes model outputs into final semantic segmentation prediction.""" + + size = size if size is not None else self.size + + masks_queries_logits = outputs.masks_queries_logits # [batch_size, num_queries, height, width] + class_queries_logits = outputs.class_queries_logits # [batch_size, num_queries, num_classes+1] + patch_offsets = outputs.patch_offsets + + output_size = get_target_size(size) + masks_queries_logits = F.interpolate( + masks_queries_logits, + size=output_size, + mode="bilinear", + ) + + # Remove the null class `[..., :-1]` + masks_classes = class_queries_logits.softmax(dim=-1)[..., :-1] + masks_probs = masks_queries_logits.sigmoid() # [batch_size, num_queries, height, width] + + segmentation_logits = torch.einsum("bqc, bqhw -> bchw", masks_classes, masks_probs) + + output_logits = self.merge_image_patches(segmentation_logits, patch_offsets, target_sizes, size) + + preds = [logit.argmax(dim=0) for logit in output_logits] + return preds + + def post_process_panoptic_segmentation( + self, + outputs, + target_sizes: list[tuple[int, int]], + threshold: float = 0.8, + mask_threshold: float = 0.5, + overlap_mask_area_threshold: float = 0.8, + stuff_classes: Optional[list[int]] = None, + size: Optional[dict[str, int]] = None, + ): + 
"""Post-processes model outputs into final panoptic segmentation prediction.""" + + size = size if size is not None else self.size + + masks_queries_logits = outputs.masks_queries_logits # [batch_size, num_queries, height, width] + class_queries_logits = outputs.class_queries_logits # [batch_size, num_queries, num_classes+1] + + batch_size = class_queries_logits.shape[0] + num_labels = class_queries_logits.shape[-1] - 1 + + output_size = get_target_size(size) + masks_queries_logits = F.interpolate( + masks_queries_logits, + size=output_size, + mode="bilinear", + ) + + mask_probs_batch = self.unpad_image(masks_queries_logits, target_sizes, size) + pred_scores_batch, pred_labels_batch = class_queries_logits.softmax(dim=-1).max(-1) + + results: list = [] + + for i in range(batch_size): + mask_probs, pred_scores, pred_labels = remove_low_and_no_objects( + mask_probs_batch[i], pred_scores_batch[i], pred_labels_batch[i], threshold, num_labels + ) + + # No mask found + if mask_probs.shape[0] <= 0: + height, width = target_sizes[i] if target_sizes is not None else mask_probs.shape[1:] + segmentation = torch.zeros((height, width)) - 1 + results.append({"segmentation": segmentation, "segments_info": []}) + continue + + segmentation, segments = compute_segments( + mask_probs=mask_probs, + pred_scores=pred_scores, + pred_labels=pred_labels, + stuff_classes=stuff_classes, + mask_threshold=mask_threshold, + overlap_mask_area_threshold=overlap_mask_area_threshold, + target_size=target_sizes[i] if target_sizes is not None else None, + ) + + results.append({"segmentation": segmentation, "segments_info": segments}) + return results + + @filter_out_non_signature_kwargs() + def post_process_instance_segmentation( + self, + outputs, + target_sizes: list[tuple[int, int]], + threshold: float = 0.5, + size: Optional[dict[str, int]] = None, + ): + """Post-processes model outputs into Instance Segmentation Predictions.""" + + size = size if size is not None else self.size + + class_queries_logits = outputs.class_queries_logits + masks_queries_logits = outputs.masks_queries_logits + + output_size = get_target_size(size) + masks_queries_logits = F.interpolate( + masks_queries_logits, + size=output_size, + mode="bilinear", + ) + + mask_probs_batch = self.unpad_image(masks_queries_logits, target_sizes, size) + + device = masks_queries_logits.device + batch_size = class_queries_logits.shape[0] + num_queries = class_queries_logits.shape[-2] + + results = [] + + for i in range(batch_size): + mask_pred = mask_probs_batch[i] + mask_class = class_queries_logits[i] + + # Remove the null class `[..., :-1]` + scores, pred_classes = mask_class.softmax(dim=-1)[..., :-1].max(-1) + pred_masks = (mask_pred > 0).float() + + # Calculate average mask prob + mask_scores = (mask_pred.sigmoid().flatten(1) * pred_masks.flatten(1)).sum(1) / ( + pred_masks.flatten(1).sum(1) + 1e-6 + ) + pred_scores = scores * mask_scores + + segmentation = torch.zeros(target_sizes[i], device=device) - 1 + + instance_maps, segments = [], [] + current_segment_id = 0 + for j in range(num_queries): + score = pred_scores[j].item() + + if not torch.all(pred_masks[j] == 0) and score >= threshold: + segmentation[pred_masks[j] == 1] = current_segment_id + segments.append( + { + "id": current_segment_id, + "label_id": pred_classes[j].item(), + "score": round(score, 6), + } + ) + current_segment_id += 1 + instance_maps.append(pred_masks[j]) + + results.append({"segmentation": segmentation, "segments_info": segments}) + return results + + +__all__ = 
["EomtImageProcessor"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/eomt/image_processing_eomt_fast.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/eomt/image_processing_eomt_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..ca80231d3a7646411e1134e8e97986100da8b025 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/eomt/image_processing_eomt_fast.py @@ -0,0 +1,536 @@ +# coding=utf-8 +# Copyright 2025 Mobile Perception Systems Lab at TU/e and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Fast Image processor class for EoMT.""" + +import math +from typing import Optional, Union + +import numpy as np +import torch +from torchvision.transforms.v2 import functional as F + +from ...image_processing_utils import BatchFeature +from ...image_processing_utils_fast import ( + BaseImageProcessorFast, + DefaultFastImageProcessorKwargs, + group_images_by_shape, + reorder_images, +) +from ...image_utils import ( + IMAGENET_DEFAULT_MEAN, + IMAGENET_DEFAULT_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + SizeDict, +) +from ...processing_utils import Unpack +from ...utils import ( + TensorType, + auto_docstring, + filter_out_non_signature_kwargs, +) +from .image_processing_eomt import ( + compute_segments, + convert_segmentation_map_to_binary_masks, + get_size_with_aspect_ratio, + remove_low_and_no_objects, +) + + +class EomtImageProcessorFastKwargs(DefaultFastImageProcessorKwargs): + """ + do_split_image (`bool`, *optional*, defaults to `False`): + Whether to split the input images into overlapping patches for semantic segmentation. If set to `True`, the + input images will be split into patches of size `size["shortest_edge"]` with an overlap between patches. + Otherwise, the input images will be padded to the target size. + do_pad (`bool`, *optional*, defaults to `False`): + Whether to pad the image. If `True`, will pad the patch dimension of the images in the batch to the largest + number of patches in the batch. Padding will be applied to the bottom and right with zeros. + ignore_index (`int`, *optional*): + Label to be assigned to background pixels in segmentation maps. If provided, segmentation map pixels + denoted with 0 (background) will be replaced with `ignore_index`. 
+ """ + + do_split_image: bool + do_pad: bool + ignore_index: Optional[int] = None + + +def get_target_size(size_dict: dict[str, int]) -> tuple[int, int]: + """Returns the height and width from a size dict.""" + target_height = size_dict["shortest_edge"] + target_width = size_dict["longest_edge"] or target_height + + return target_height, target_width + + +def reorder_patches_and_offsets( + patches: list[torch.Tensor], offsets: list[list[int]] +) -> tuple[list[torch.Tensor], list[list[int]]]: + """Sorts patches and offsets according to the original image index.""" + + combined = list(zip(offsets, patches)) + combined.sort(key=lambda x: x[0][0]) + sorted_offsets, sorted_patches = zip(*combined) + + return list(sorted_patches), list(sorted_offsets) + + +@auto_docstring +class EomtImageProcessorFast(BaseImageProcessorFast): + resample = PILImageResampling.BILINEAR + image_mean = IMAGENET_DEFAULT_MEAN + image_std = IMAGENET_DEFAULT_STD + size = {"shortest_edge": 640, "longest_edge": 640} + default_to_square = False + do_resize = True + do_rescale = True + do_normalize = True + do_split_image = False + do_pad = False + ignore_index = None + valid_kwargs = EomtImageProcessorFastKwargs + + def __init__(self, **kwargs: Unpack[EomtImageProcessorFastKwargs]): + super().__init__(**kwargs) + + def _split_image(self, images: torch.Tensor, size: dict, image_indices: int) -> tuple[list, list]: + """Slices an image into overlapping patches for semantic segmentation.""" + + patches, patch_offsets = [], [] + + _, _, height, width = images.shape + patch_size = size["shortest_edge"] + + longer_side = max(height, width) + num_patches = math.ceil(longer_side / patch_size) + total_overlap = num_patches * patch_size - longer_side + overlap_per_patch = total_overlap / (num_patches - 1) if num_patches > 1 else 0 + + for i in range(num_patches): + start = int(i * (patch_size - overlap_per_patch)) + end = start + patch_size + + if height > width: + batch_patch = images[:, :, start:end, :] + else: + batch_patch = images[:, :, :, start:end] + + for batch_idx, single in enumerate(torch.unbind(batch_patch, dim=0)): + patches.append(single) + patch_offsets.append([image_indices[batch_idx], start, end]) + + return patches, patch_offsets + + def _pad(self, images: torch.Tensor, size: dict) -> torch.Tensor: + """Pads the image to the target size using zero padding.""" + _, _, height, width = images.shape + + target_height, target_width = get_target_size(size) + pad_h = max(0, target_height - height) + pad_w = max(0, target_width - width) + padding = (0, pad_w, 0, pad_h) + + padded_images = torch.nn.functional.pad(images, padding, mode="constant", value=0.0) + return padded_images + + @auto_docstring + def preprocess( + self, + images: ImageInput, + segmentation_maps: Optional[list[torch.Tensor]] = None, + instance_id_to_semantic_id: Optional[dict[int, int]] = None, + **kwargs: Unpack[EomtImageProcessorFastKwargs], + ) -> BatchFeature: + r""" + segmentation_maps (`ImageInput`, *optional*): + The segmentation maps to preprocess for corresponding images. + instance_id_to_semantic_id (`list[dict[int, int]]` or `dict[int, int]`, *optional*): + A mapping between object instance ids and class ids. 
+ """ + return super().preprocess(images, segmentation_maps, instance_id_to_semantic_id, **kwargs) + + def _preprocess_image_like_inputs( + self, + images: ImageInput, + segmentation_maps: Optional[ImageInput], + instance_id_to_semantic_id: Optional[dict[int, int]], + do_convert_rgb: bool, + input_data_format: ChannelDimension, + device: Optional[Union[str, "torch.device"]] = None, + **kwargs: Unpack[EomtImageProcessorFastKwargs], + ) -> BatchFeature: + """ + Preprocess image-like inputs. + """ + images = self._prepare_image_like_inputs( + images=images, do_convert_rgb=do_convert_rgb, input_data_format=input_data_format, device=device + ) + ignore_index = kwargs.pop("ignore_index", None) + images_kwargs = kwargs.copy() + processed_images, patch_offsets = self._preprocess(images, **images_kwargs) + outputs = BatchFeature({"pixel_values": processed_images}) + + if segmentation_maps is not None: + processed_segmentation_maps = self._prepare_image_like_inputs( + images=segmentation_maps, + expected_ndims=2, + do_convert_rgb=False, + input_data_format=ChannelDimension.FIRST, + ) + + segmentation_maps_kwargs = kwargs.copy() + segmentation_maps_kwargs.update( + { + "do_normalize": False, + "do_rescale": False, + # Nearest interpolation is used for segmentation maps instead of BILINEAR. + "interpolation": F.InterpolationMode.NEAREST_EXACT, + } + ) + + processed_segmentation_maps, _ = self._preprocess( + images=processed_segmentation_maps, **segmentation_maps_kwargs + ) + processed_segmentation_maps = processed_segmentation_maps.squeeze(1).to(torch.int64) + # Convert to list of binary masks and labels + mask_labels, class_labels = [], [] + for idx, segmentation_map in enumerate(processed_segmentation_maps): + if isinstance(instance_id_to_semantic_id, list): + instance_id = instance_id_to_semantic_id[idx] + else: + instance_id = instance_id_to_semantic_id + # Use instance2class_id mapping per image + masks, classes = convert_segmentation_map_to_binary_masks( + segmentation_map, + instance_id, + ignore_index=ignore_index, + ) + + mask_labels.append(torch.from_numpy(masks)) + class_labels.append(torch.from_numpy(classes)) + + # we cannot batch them since they don't share a common class size + outputs["mask_labels"] = mask_labels + outputs["class_labels"] = class_labels + + if patch_offsets: + outputs["patch_offsets"] = [torch.tensor(offsets) for offsets in patch_offsets] + + return outputs + + def _preprocess( + self, + images: list["torch.Tensor"], + do_resize: bool, + size: SizeDict, + interpolation: Optional["F.InterpolationMode"], + do_rescale: bool, + rescale_factor: float, + do_normalize: bool, + do_split_image: bool, + do_pad: bool, + image_mean: Optional[Union[float, list[float]]], + image_std: Optional[Union[float, list[float]]], + disable_grouping: Optional[bool], + return_tensors: Optional[Union[str, TensorType]], + **kwargs, + ): + """Preprocesses the input images and masks if provided.""" + processed_images, patch_offsets = [], [] + + grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping) + resized_images_grouped = {} + + for shape, stacked_images in grouped_images.items(): + if do_resize: + stacked_images = self.resize(image=stacked_images, size=size, interpolation=interpolation) + resized_images_grouped[shape] = stacked_images + images = reorder_images(resized_images_grouped, grouped_images_index) + + # Group images by size for batched resizing, Needed in case do_resize is False. 
+ grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping) + processed_images_grouped = {} + + for shape, stacked_images in grouped_images.items(): + original_indices = [ + original_idx for original_idx, (img_shape, _) in grouped_images_index.items() if img_shape == shape + ] + + if do_split_image: + patches, offsets = self._split_image(stacked_images, size, original_indices) + processed_images.extend(patches) + patch_offsets.extend(offsets) + + if do_pad: + stacked_images = self._pad(stacked_images, size) + processed_images_grouped[shape] = stacked_images + + if do_split_image: + images, patch_offsets = reorder_patches_and_offsets(processed_images, patch_offsets) + + if do_pad: + images = reorder_images(processed_images_grouped, grouped_images_index) + + grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping) + processed_images_grouped = {} + + for shape, stacked_images in grouped_images.items(): + stacked_images = self.rescale_and_normalize( + stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std + ) + processed_images_grouped[shape] = stacked_images + images = reorder_images(processed_images_grouped, grouped_images_index) + + processed_images = torch.stack(images, dim=0) if return_tensors else images + + return processed_images, patch_offsets + + def merge_image_patches( + self, + segmentation_logits: torch.Tensor, + patch_offsets: list[tuple[int, int, int]], + target_sizes: list[tuple[int, int]], + size: dict[str, int], + ) -> list[torch.Tensor]: + """ + Reconstructs full-size semantic segmentation logits from patch predictions. + + Args: + segmentation_logits (`torch.Tensor`): + A tensor of shape `(num_patches, num_classes, patch_height, patch_width)` representing predicted logits + for each image patch. + patch_offsets (`list[tuple[int, int, int]]`): + A list of tuples where each tuple contains: + - `image_index` (int): Index of the original image this patch belongs to. + - `start` (int): Start pixel index of the patch along the long dimension (height or width). + - `end` (int): End pixel index of the patch along the long dimension. + target_sizes (`list[tuple[int, int]]`): + list of original (height, width) dimensions for each image before preprocessing. + size (`dict[str, int]`): + A size dict which was used to resize. 
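+ 
+ Returns:
+ `list[torch.Tensor]`: One tensor per image of shape `(num_classes, height, width)`, obtained by averaging the logits of overlapping patches and resizing each result to its original image size.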
+ """ + num_classes = segmentation_logits.shape[1] + aggregated_logits = [] + patch_counts = [] + + for image_size in target_sizes: + height, width = get_size_with_aspect_ratio(image_size, size["shortest_edge"], size["longest_edge"]) + aggregated_logits.append(torch.zeros((num_classes, height, width), device=segmentation_logits.device)) + patch_counts.append(torch.zeros((num_classes, height, width), device=segmentation_logits.device)) + + # Stitch patches back into full-sized logit maps + for patch_idx, (image_idx, patch_start, patch_end) in enumerate(patch_offsets): + if target_sizes[image_idx][0] > target_sizes[image_idx][1]: + aggregated_logits[image_idx][:, patch_start:patch_end, :] += segmentation_logits[patch_idx] + patch_counts[image_idx][:, patch_start:patch_end, :] += 1 + else: + aggregated_logits[image_idx][:, :, patch_start:patch_end] += segmentation_logits[patch_idx] + patch_counts[image_idx][:, :, patch_start:patch_end] += 1 + + # Normalize and resize logits to original image size + reconstructed_logits = [] + for idx, (logit_sum, count) in enumerate(zip(aggregated_logits, patch_counts)): + averaged_logits = logit_sum / count.clamp(min=1) + resized_logits = torch.nn.functional.interpolate( + averaged_logits[None, ...], + size=target_sizes[idx], + mode="bilinear", + align_corners=False, + )[0] + + reconstructed_logits.append(resized_logits) + + return reconstructed_logits + + def unpad_image( + self, + segmentation_logits: torch.Tensor, + target_sizes: list[tuple[int, int]], + size: dict[str, int], + ) -> list[torch.Tensor]: + """Restores panoptic segmentation logits to their original image resolutions.""" + + resized_logits = [] + + for idx, original_size in enumerate(target_sizes): + target_height, target_width = get_size_with_aspect_ratio( + original_size, size["shortest_edge"], size["longest_edge"] + ) + cropped_logits = segmentation_logits[idx][:, :target_height, :target_width] + upsampled_logits = torch.nn.functional.interpolate( + cropped_logits[None, ...], size=original_size, mode="bilinear", align_corners=False + )[0] + resized_logits.append(upsampled_logits) + return resized_logits + + def post_process_semantic_segmentation( + self, + outputs, + target_sizes: list[tuple[int, int]], + size: Optional[dict[str, int]] = None, + ) -> np.ndarray: + """Post-processes model outputs into final semantic segmentation prediction.""" + + size = size if size is not None else self.size + + masks_queries_logits = outputs.masks_queries_logits # [batch_size, num_queries, height, width] + class_queries_logits = outputs.class_queries_logits # [batch_size, num_queries, num_classes+1] + patch_offsets = outputs.patch_offsets + + output_size = get_target_size(size) + masks_queries_logits = torch.nn.functional.interpolate( + masks_queries_logits, + size=output_size, + mode="bilinear", + ) + + # Remove the null class `[..., :-1]` + masks_classes = class_queries_logits.softmax(dim=-1)[..., :-1] + masks_probs = masks_queries_logits.sigmoid() # [batch_size, num_queries, height, width] + + segmentation_logits = torch.einsum("bqc, bqhw -> bchw", masks_classes, masks_probs) + + output_logits = self.merge_image_patches(segmentation_logits, patch_offsets, target_sizes, size) + + preds = [logit.argmax(dim=0) for logit in output_logits] + return preds + + def post_process_panoptic_segmentation( + self, + outputs, + target_sizes: list[tuple[int, int]], + threshold: float = 0.8, + mask_threshold: float = 0.5, + overlap_mask_area_threshold: float = 0.8, + stuff_classes: Optional[list[int]] = None, + size: 
Optional[dict[str, int]] = None, + ): + """Post-processes model outputs into final panoptic segmentation prediction.""" + + size = size if size is not None else self.size + + masks_queries_logits = outputs.masks_queries_logits # [batch_size, num_queries, height, width] + class_queries_logits = outputs.class_queries_logits # [batch_size, num_queries, num_classes+1] + + batch_size = class_queries_logits.shape[0] + num_labels = class_queries_logits.shape[-1] - 1 + + output_size = get_target_size(size) + masks_queries_logits = torch.nn.functional.interpolate( + masks_queries_logits, + size=output_size, + mode="bilinear", + ) + + mask_probs_batch = self.unpad_image(masks_queries_logits, target_sizes, size) + pred_scores_batch, pred_labels_batch = class_queries_logits.softmax(dim=-1).max(-1) + + results: list = [] + + for i in range(batch_size): + mask_probs, pred_scores, pred_labels = remove_low_and_no_objects( + mask_probs_batch[i], pred_scores_batch[i], pred_labels_batch[i], threshold, num_labels + ) + + # No mask found + if mask_probs.shape[0] <= 0: + height, width = target_sizes[i] if target_sizes is not None else mask_probs.shape[1:] + segmentation = torch.zeros((height, width)) - 1 + results.append({"segmentation": segmentation, "segments_info": []}) + continue + + segmentation, segments = compute_segments( + mask_probs=mask_probs, + pred_scores=pred_scores, + pred_labels=pred_labels, + stuff_classes=stuff_classes, + mask_threshold=mask_threshold, + overlap_mask_area_threshold=overlap_mask_area_threshold, + target_size=target_sizes[i] if target_sizes is not None else None, + ) + + results.append({"segmentation": segmentation, "segments_info": segments}) + return results + + @filter_out_non_signature_kwargs() + def post_process_instance_segmentation( + self, + outputs, + target_sizes: list[tuple[int, int]], + threshold: float = 0.8, + size: Optional[dict[str, int]] = None, + ): + """Post-processes model outputs into Instance Segmentation Predictions.""" + + size = size if size is not None else self.size + + masks_queries_logits = outputs.masks_queries_logits + class_queries_logits = outputs.class_queries_logits + + output_size = get_target_size(size) + masks_queries_logits = torch.nn.functional.interpolate( + masks_queries_logits, + size=output_size, + mode="bilinear", + ) + + mask_probs_batch = self.unpad_image(masks_queries_logits, target_sizes, size) + + device = masks_queries_logits.device + batch_size = class_queries_logits.shape[0] + num_queries = class_queries_logits.shape[-2] + + results = [] + + for i in range(batch_size): + mask_pred = mask_probs_batch[i] + mask_class = class_queries_logits[i] + + # Remove the null class `[..., :-1]` + scores, pred_classes = mask_class.softmax(dim=-1)[..., :-1].max(-1) + pred_masks = (mask_pred > 0).float() + + # Calculate average mask prob + mask_scores = (mask_pred.sigmoid().flatten(1) * pred_masks.flatten(1)).sum(1) / ( + pred_masks.flatten(1).sum(1) + 1e-6 + ) + pred_scores = scores * mask_scores + + segmentation = torch.zeros(target_sizes[i], device=device) - 1 + + instance_maps, segments = [], [] + current_segment_id = 0 + for j in range(num_queries): + score = pred_scores[j].item() + + if not torch.all(pred_masks[j] == 0) and score >= threshold: + segmentation[pred_masks[j] == 1] = current_segment_id + segments.append( + { + "id": current_segment_id, + "label_id": pred_classes[j].item(), + "score": round(score, 6), + } + ) + current_segment_id += 1 + instance_maps.append(pred_masks[j]) + + results.append({"segmentation": segmentation, 
"segments_info": segments}) + return results + + +__all__ = ["EomtImageProcessorFast"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/eomt/modeling_eomt.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/eomt/modeling_eomt.py new file mode 100644 index 0000000000000000000000000000000000000000..047baa1ff081ed2fd3b17bc94b972008cd157384 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/eomt/modeling_eomt.py @@ -0,0 +1,1226 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/eomt/modular_eomt.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_eomt.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2025 Mobile Perception Systems Lab at TU/e and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections.abc +import math +from dataclasses import dataclass +from typing import Callable, Optional + +import numpy as np +import torch +import torch.nn.functional as F +from torch import Tensor, nn + +from ...activations import ACT2FN +from ...file_utils import ModelOutput, is_scipy_available, requires_backends +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...processing_utils import Unpack +from ...utils import TransformersKwargs, auto_docstring, is_accelerate_available +from ...utils.generic import check_model_inputs +from .configuration_eomt import EomtConfig + + +if is_scipy_available(): + from scipy.optimize import linear_sum_assignment + +if is_accelerate_available(): + from accelerate import PartialState + from accelerate.utils import reduce + + +@dataclass +@auto_docstring( + custom_intro=""" + Class for outputs of [`EomtForUniversalSegmentationOutput`]. + + This output can be directly passed to [`~EomtImageProcessor.post_process_semantic_segmentation`] or + [`~EomtImageProcessor.post_process_instance_segmentation`] or + [`~EomtImageProcessor.post_process_panoptic_segmentation`] to compute final segmentation maps. Please, see + [`~EomtImageProcessor] for details regarding usage. + """ +) +class EomtForUniversalSegmentationOutput(ModelOutput): + r""" + loss (`torch.Tensor`, *optional*): + The computed loss, returned when labels are present. + class_queries_logits (`torch.FloatTensor`): + A tensor of shape `(batch_size, num_queries, num_labels + 1)` representing the proposed classes for each + query. Note the `+ 1` is needed because we incorporate the null class. + masks_queries_logits (`torch.FloatTensor`): + A tensor of shape `(batch_size, num_queries, height, width)` representing the proposed masks for each + query. 
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Last hidden states (final feature map) of the last layer. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states all layers of the model. + attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tuple(torch.FloatTensor)` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Self and Cross Attentions weights from transformer decoder. + patch_offsets (`list[torch.Tensor]`, *optional*): + list of tuples indicating the image index and start and end positions of patches for semantic segmentation. + """ + + loss: Optional[torch.FloatTensor] = None + class_queries_logits: Optional[torch.FloatTensor] = None + masks_queries_logits: Optional[torch.FloatTensor] = None + last_hidden_state: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + patch_offsets: Optional[list[torch.Tensor]] = None + + +# Adapted from https://github.com/facebookresearch/detectron2/blob/main/projects/PointRend/point_rend/point_features.py +def sample_point( + input_features: torch.Tensor, point_coordinates: torch.Tensor, add_dim=False, **kwargs +) -> torch.Tensor: + """ + A wrapper around `torch.nn.functional.grid_sample` to support 3D point_coordinates tensors. + + Args: + input_features (`torch.Tensor` of shape (batch_size, channels, height, width)): + A tensor that contains features map on a height * width grid + point_coordinates (`torch.Tensor` of shape (batch_size, num_points, 2) or (batch_size, grid_height, grid_width,: + 2)): + A tensor that contains [0, 1] * [0, 1] normalized point coordinates + add_dim (`bool`): + boolean value to keep track of added dimension + + Returns: + point_features (`torch.Tensor` of shape (batch_size, channels, num_points) or (batch_size, channels, + height_grid, width_grid): + A tensor that contains features for points in `point_coordinates`. + """ + if point_coordinates.dim() == 3: + add_dim = True + point_coordinates = point_coordinates.unsqueeze(2) + + # use nn.function.grid_sample to get features for points in `point_coordinates` via bilinear interpolation + point_features = torch.nn.functional.grid_sample(input_features, 2.0 * point_coordinates - 1.0, **kwargs) + if add_dim: + point_features = point_features.squeeze(3) + + return point_features + + +def pair_wise_dice_loss(inputs: Tensor, labels: Tensor) -> Tensor: + """ + A pair wise version of the dice loss, see `dice_loss` for usage. + + Args: + inputs (`torch.Tensor`): + A tensor representing a mask + labels (`torch.Tensor`): + A tensor with the same shape as inputs. Stores the binary classification labels for each element in inputs + (0 for the negative class and 1 for the positive class). + + Returns: + `torch.Tensor`: The computed loss between each pairs. 
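+ The returned tensor has shape `(num_queries, num_target_masks)`, one entry per prediction/target mask pair.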
+ """ + inputs = inputs.sigmoid().flatten(1) + numerator = 2 * torch.matmul(inputs, labels.T) + # using broadcasting to get a [num_queries, NUM_CLASSES] matrix + denominator = inputs.sum(-1)[:, None] + labels.sum(-1)[None, :] + loss = 1 - (numerator + 1) / (denominator + 1) + return loss + + +def pair_wise_sigmoid_cross_entropy_loss(inputs: torch.Tensor, labels: torch.Tensor) -> torch.Tensor: + r""" + A pair wise version of the cross entropy loss, see `sigmoid_cross_entropy_loss` for usage. + + Args: + inputs (`torch.Tensor`): + A tensor representing a mask. + labels (`torch.Tensor`): + A tensor with the same shape as inputs. Stores the binary classification labels for each element in inputs + (0 for the negative class and 1 for the positive class). + + Returns: + loss (`torch.Tensor`): The computed loss between each pairs. + """ + + height_and_width = inputs.shape[1] + + criterion = nn.BCEWithLogitsLoss(reduction="none") + cross_entropy_loss_pos = criterion(inputs, torch.ones_like(inputs)) + cross_entropy_loss_neg = criterion(inputs, torch.zeros_like(inputs)) + + loss_pos = torch.matmul(cross_entropy_loss_pos / height_and_width, labels.T) + loss_neg = torch.matmul(cross_entropy_loss_neg / height_and_width, (1 - labels).T) + loss = loss_pos + loss_neg + return loss + + +# Adapted from https://github.com/facebookresearch/Eomt/blob/main/eomt/modeling/matcher.py +class EomtHungarianMatcher(nn.Module): + """This class computes an assignment between the labels and the predictions of the network. + + For efficiency reasons, the labels don't include the no_object. Because of this, in general, there are more + predictions than labels. In this case, we do a 1-to-1 matching of the best predictions, while the others are + un-matched (and thus treated as non-objects). + """ + + def __init__( + self, cost_class: float = 1.0, cost_mask: float = 1.0, cost_dice: float = 1.0, num_points: int = 12544 + ): + """Creates the matcher + + Params: + cost_class (`float`, *optional*, defaults to 1.0): + Relative weight of the classification error in the matching cost. + cost_mask (`float`, *optional*, defaults to 1.0): + This is the relative weight of the focal loss of the binary mask in the matching cost. + cost_dice (`float`, *optional*, defaults to 1.0): + This is the relative weight of the dice loss of the binary mask in the matching cost. + num_points (`int`, *optional*, defaults to 12544): + No. of points to sample on which the mask loss will be calculated. The same set of K points are + uniformly sampled for all prediction and ground truth masks to construct the cost matrix for bipartite + matching. + """ + super().__init__() + if cost_class == 0 and cost_mask == 0 and cost_dice == 0: + raise ValueError("All costs can't be 0") + + self.num_points = num_points + self.cost_class = cost_class + self.cost_mask = cost_mask + self.cost_dice = cost_dice + + @torch.no_grad() + def forward( + self, + masks_queries_logits: torch.Tensor, + class_queries_logits: torch.Tensor, + mask_labels: torch.Tensor, + class_labels: torch.Tensor, + ) -> list[tuple[Tensor]]: + """ + Params: + masks_queries_logits (`torch.Tensor`): + A tensor of dim `batch_size, num_queries, num_labels` with the classification logits. + class_queries_logits (`torch.Tensor`): + A tensor of dim `batch_size, num_queries, height, width` with the predicted masks. + class_labels (`torch.Tensor`): + A tensor of dim `num_target_boxes` (where num_target_boxes is the number of ground-truth objects in the + target) containing the class labels. 
+ mask_labels (`torch.Tensor`): + A tensor of dim `num_target_boxes, height, width` containing the target masks. + + Returns: + matched_indices (`list[tuple[Tensor]]`): A list of size batch_size, containing tuples of (index_i, index_j) + where: + - index_i is the indices of the selected predictions (in order) + - index_j is the indices of the corresponding selected labels (in order) + For each batch element, it holds: + len(index_i) = len(index_j) = min(num_queries, num_target_boxes). + """ + indices: list[tuple[np.array]] = [] + + # iterate through batch size + batch_size = masks_queries_logits.shape[0] + for i in range(batch_size): + pred_probs = class_queries_logits[i].softmax(-1) + pred_mask = masks_queries_logits[i] + + # Compute the classification cost. Contrary to the loss, we don't use the NLL, but approximate it in 1 - proba[target class]. The 1 is a constant that doesn't change the matching, it can be omitted. + cost_class = -pred_probs[:, class_labels[i]] + target_mask = mask_labels[i].to(pred_mask) + target_mask = target_mask[:, None] + pred_mask = pred_mask[:, None] + + # Sample ground truth and predicted masks + point_coordinates = torch.rand(1, self.num_points, 2, device=pred_mask.device) + + target_coordinates = point_coordinates.repeat(target_mask.shape[0], 1, 1) + target_mask = sample_point(target_mask, target_coordinates, align_corners=False).squeeze(1) + + pred_coordinates = point_coordinates.repeat(pred_mask.shape[0], 1, 1) + pred_mask = sample_point(pred_mask, pred_coordinates, align_corners=False).squeeze(1) + + # compute the cross entropy loss between each mask pairs -> shape (num_queries, num_labels) + cost_mask = pair_wise_sigmoid_cross_entropy_loss(pred_mask, target_mask) + # Compute the dice loss between each mask pairs -> shape (num_queries, num_labels) + cost_dice = pair_wise_dice_loss(pred_mask, target_mask) + # final cost matrix + cost_matrix = self.cost_mask * cost_mask + self.cost_class * cost_class + self.cost_dice * cost_dice + # eliminate infinite values in cost_matrix to avoid the error ``ValueError: cost matrix is infeasible`` + cost_matrix = torch.minimum(cost_matrix, torch.tensor(1e10)) + cost_matrix = torch.maximum(cost_matrix, torch.tensor(-1e10)) + cost_matrix = torch.nan_to_num(cost_matrix, 0) + # do the assignment using the hungarian algorithm in scipy + assigned_indices: tuple[np.array] = linear_sum_assignment(cost_matrix.cpu()) + indices.append(assigned_indices) + + # It could be stacked in one tensor + matched_indices = [ + (torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices + ] + return matched_indices + + +def dice_loss(inputs: Tensor, labels: Tensor, num_masks: int) -> Tensor: + r""" + Compute the DICE loss, similar to generalized IOU for masks as follows: + + $$ \mathcal{L}_{\text{dice}(x, y) = 1 - \frac{2 * x \cap y }{x \cup y + 1}} $$ + + In practice, since `labels` is a binary mask, (only 0s and 1s), dice can be computed as follow + + $$ \mathcal{L}_{\text{dice}(x, y) = 1 - \frac{2 * x * y }{x + y + 1}} $$ + + Args: + inputs (`torch.Tensor`): + A tensor representing a mask. + labels (`torch.Tensor`): + A tensor with the same shape as inputs. Stores the binary classification labels for each element in inputs + (0 for the negative class and 1 for the positive class). + num_masks (`int`): + The number of masks present in the current batch, used for normalization. + + Returns: + `torch.Tensor`: The computed loss. 
+ """ + probs = inputs.sigmoid().flatten(1) + numerator = 2 * (probs * labels).sum(-1) + denominator = probs.sum(-1) + labels.sum(-1) + loss = 1 - (numerator + 1) / (denominator + 1) + loss = loss.sum() / num_masks + return loss + + +def sigmoid_cross_entropy_loss(inputs: torch.Tensor, labels: torch.Tensor, num_masks: int) -> torch.Tensor: + r""" + Args: + inputs (`torch.Tensor`): + A float tensor of arbitrary shape. + labels (`torch.Tensor`): + A tensor with the same shape as inputs. Stores the binary classification labels for each element in inputs + (0 for the negative class and 1 for the positive class). + + Returns: + loss (`torch.Tensor`): The computed loss. + """ + criterion = nn.BCEWithLogitsLoss(reduction="none") + cross_entropy_loss = criterion(inputs, labels) + + loss = cross_entropy_loss.mean(1).sum() / num_masks + return loss + + +# Adapted from https://github.com/facebookresearch/Eomt/blob/main/eomt/modeling/criterion.py +class EomtLoss(nn.Module): + def __init__(self, config: EomtConfig, weight_dict: dict[str, float]): + """ + The Eomt Loss. The loss is computed very similar to DETR. The process happens in two steps: 1) we + compute hungarian assignment between ground truth masks and the outputs of the model 2) we supervise each pair + of matched ground-truth / prediction (supervise class and mask) + + Args: + config (`EomtConfig`): + The configuration for Eomt model also containing loss calculation specific parameters. + weight_dict (`dict[str, float]`): + A dictionary of weights to be applied to the different losses. + """ + super().__init__() + requires_backends(self, ["scipy"]) + self.num_labels = config.num_labels + self.weight_dict = weight_dict + + # Weight to apply to the null class + self.eos_coef = config.no_object_weight + empty_weight = torch.ones(self.num_labels + 1) + empty_weight[-1] = self.eos_coef + self.register_buffer("empty_weight", empty_weight) + + # pointwise mask loss parameters + self.num_points = config.train_num_points + self.oversample_ratio = config.oversample_ratio + self.importance_sample_ratio = config.importance_sample_ratio + + self.matcher = EomtHungarianMatcher( + cost_class=config.class_weight, + cost_dice=config.dice_weight, + cost_mask=config.mask_weight, + num_points=self.num_points, + ) + + def _max_by_axis(self, sizes: list[list[int]]) -> list[int]: + maxes = sizes[0] + for sublist in sizes[1:]: + for index, item in enumerate(sublist): + maxes[index] = max(maxes[index], item) + return maxes + + # Adapted from nested_tensor_from_tensor_list() in original implementation + def _pad_images_to_max_in_batch(self, tensors: list[Tensor]) -> tuple[Tensor, Tensor]: + # get the maximum size in the batch + max_size = self._max_by_axis([list(tensor.shape) for tensor in tensors]) + # compute final size + batch_shape = [len(tensors)] + max_size + batch_size, _, height, width = batch_shape + dtype = tensors[0].dtype + device = tensors[0].device + padded_tensors = torch.zeros(batch_shape, dtype=dtype, device=device) + padding_masks = torch.ones((batch_size, height, width), dtype=torch.bool, device=device) + # pad the tensors to the size of the biggest one + for tensor, padded_tensor, padding_mask in zip(tensors, padded_tensors, padding_masks): + padded_tensor[: tensor.shape[0], : tensor.shape[1], : tensor.shape[2]].copy_(tensor) + padding_mask[: tensor.shape[1], : tensor.shape[2]] = False + + return padded_tensors, padding_masks + + def loss_labels( + self, class_queries_logits: Tensor, class_labels: list[Tensor], indices: tuple[np.array] + ) -> 
dict[str, Tensor]: + """Compute the losses related to the labels using cross entropy. + + Args: + class_queries_logits (`torch.Tensor`): + A tensor of shape `batch_size, num_queries, num_labels` + class_labels (`list[torch.Tensor]`): + List of class labels of shape `(labels)`. + indices (`tuple[np.array])`: + The indices computed by the Hungarian matcher. + + Returns: + `dict[str, Tensor]`: A dict of `torch.Tensor` containing the following key: + - **loss_cross_entropy** -- The loss computed using cross entropy on the predicted and ground truth labels. + """ + pred_logits = class_queries_logits + batch_size, num_queries, _ = pred_logits.shape + criterion = nn.CrossEntropyLoss(weight=self.empty_weight) + idx = self._get_predictions_permutation_indices(indices) # shape of (batch_size, num_queries) + target_classes_o = torch.cat( + [target[j] for target, (_, j) in zip(class_labels, indices)] + ) # shape of (batch_size, num_queries) + target_classes = torch.full( + (batch_size, num_queries), fill_value=self.num_labels, dtype=torch.int64, device=pred_logits.device + ) + target_classes[idx] = target_classes_o + # Permute target_classes (batch_size, num_queries, num_labels) -> (batch_size, num_labels, num_queries) + pred_logits_transposed = pred_logits.transpose(1, 2) + loss_ce = criterion(pred_logits_transposed, target_classes) + losses = {"loss_cross_entropy": loss_ce} + return losses + + def loss_masks( + self, + masks_queries_logits: torch.Tensor, + mask_labels: list[torch.Tensor], + indices: tuple[np.array], + num_masks: int, + ) -> dict[str, torch.Tensor]: + """Compute the losses related to the masks using sigmoid_cross_entropy_loss and dice loss. + + Args: + masks_queries_logits (`torch.Tensor`): + A tensor of shape `(batch_size, num_queries, height, width)`. + mask_labels (`torch.Tensor`): + List of mask labels of shape `(labels, height, width)`. + indices (`tuple[np.array])`: + The indices computed by the Hungarian matcher. + num_masks (`int)`: + The number of masks, used for normalization. + + Returns: + losses (`dict[str, Tensor]`): A dict of `torch.Tensor` containing two keys: + - **loss_mask** -- The loss computed using sigmoid cross entropy loss on the predicted and ground truth. + masks. + - **loss_dice** -- The loss computed using dice loss on the predicted on the predicted and ground truth, + masks. 
+ """ + src_idx = self._get_predictions_permutation_indices(indices) + tgt_idx = self._get_targets_permutation_indices(indices) + # shape (batch_size * num_queries, height, width) + pred_masks = masks_queries_logits[src_idx] + # shape (batch_size, num_queries, height, width) + # pad all and stack the targets to the num_labels dimension + target_masks, _ = self._pad_images_to_max_in_batch(mask_labels) + target_masks = target_masks[tgt_idx] + + # No need to upsample predictions as we are using normalized coordinates + pred_masks = pred_masks[:, None] + target_masks = target_masks[:, None] + + # Sample point coordinates + with torch.no_grad(): + point_coordinates = self.sample_points_using_uncertainty( + pred_masks, + lambda logits: self.calculate_uncertainty(logits), + self.num_points, + self.oversample_ratio, + self.importance_sample_ratio, + ) + + point_labels = sample_point(target_masks, point_coordinates, align_corners=False).squeeze(1) + + point_logits = sample_point(pred_masks, point_coordinates, align_corners=False).squeeze(1) + + losses = { + "loss_mask": sigmoid_cross_entropy_loss(point_logits, point_labels, num_masks), + "loss_dice": dice_loss(point_logits, point_labels, num_masks), + } + + del pred_masks + del target_masks + return losses + + def _get_predictions_permutation_indices(self, indices): + # Permute predictions following indices + batch_indices = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)]) + predictions_indices = torch.cat([src for (src, _) in indices]) + return batch_indices, predictions_indices + + def _get_targets_permutation_indices(self, indices): + # Permute labels following indices + batch_indices = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)]) + target_indices = torch.cat([tgt for (_, tgt) in indices]) + return batch_indices, target_indices + + def calculate_uncertainty(self, logits: torch.Tensor) -> torch.Tensor: + """ + In Eomt paper, uncertainty is estimated as L1 distance between 0.0 and the logit prediction in 'logits' + for the foreground class in `classes`. + + Args: + logits (`torch.Tensor`): + A tensor of shape (R, 1, ...) for class-specific or class-agnostic, where R is the total number of predicted masks in all images and C is: + the number of foreground classes. The values are logits. + + Returns: + scores (`torch.Tensor`): A tensor of shape (R, 1, ...) that contains uncertainty scores with the most + uncertain locations having the highest uncertainty score. + """ + uncertainty_scores = -(torch.abs(logits)) + return uncertainty_scores + + def sample_points_using_uncertainty( + self, + logits: torch.Tensor, + uncertainty_function, + num_points: int, + oversample_ratio: int, + importance_sample_ratio: float, + ) -> torch.Tensor: + """ + This function is meant for sampling points in [0, 1] * [0, 1] coordinate space based on their uncertainty. The + uncertainty is calculated for each point using the passed `uncertainty function` that takes points logit + prediction as input. + + Args: + logits (`float`): + Logit predictions for P points. + uncertainty_function: + A function that takes logit predictions for P points and returns their uncertainties. + num_points (`int`): + The number of points P to sample. + oversample_ratio (`int`): + Oversampling parameter. + importance_sample_ratio (`float`): + Ratio of points that are sampled via importance sampling. + + Returns: + point_coordinates (`torch.Tensor`): + Coordinates for P sampled points. 
+ """ + + num_boxes = logits.shape[0] + num_points_sampled = int(num_points * oversample_ratio) + + # Get random point coordinates + point_coordinates = torch.rand(num_boxes, num_points_sampled, 2, device=logits.device) + # Get sampled prediction value for the point coordinates + point_logits = sample_point(logits, point_coordinates, align_corners=False) + # Calculate the uncertainties based on the sampled prediction values of the points + point_uncertainties = uncertainty_function(point_logits) + + num_uncertain_points = int(importance_sample_ratio * num_points) + num_random_points = num_points - num_uncertain_points + + idx = torch.topk(point_uncertainties[:, 0, :], k=num_uncertain_points, dim=1)[1] + shift = num_points_sampled * torch.arange(num_boxes, dtype=torch.long, device=logits.device) + idx += shift[:, None] + point_coordinates = point_coordinates.view(-1, 2)[idx.view(-1), :].view(num_boxes, num_uncertain_points, 2) + + if num_random_points > 0: + point_coordinates = torch.cat( + [point_coordinates, torch.rand(num_boxes, num_random_points, 2, device=logits.device)], + dim=1, + ) + return point_coordinates + + def forward( + self, + masks_queries_logits: torch.Tensor, + class_queries_logits: torch.Tensor, + mask_labels: list[torch.Tensor], + class_labels: list[torch.Tensor], + auxiliary_predictions: Optional[dict[str, torch.Tensor]] = None, + ) -> dict[str, torch.Tensor]: + """ + This performs the loss computation. + + Args: + masks_queries_logits (`torch.Tensor`): + A tensor of shape `(batch_size, num_queries, height, width)`. + class_queries_logits (`torch.Tensor`): + A tensor of shape `(batch_size, num_queries, num_labels)`. + mask_labels (`torch.Tensor`): + List of mask labels of shape `(labels, height, width)`. + class_labels (`list[torch.Tensor]`): + List of class labels of shape `(labels)`. + auxiliary_predictions (`dict[str, torch.Tensor]`, *optional*): + if `use_auxiliary_loss` was set to `true` in [`EomtConfig`], then it contains the logits from + the inner layers of the EomtMaskedAttentionDecoder. + + Returns: + losses (`dict[str, Tensor]`): A dict of `torch.Tensor` containing three keys: + - **loss_cross_entropy** -- The loss computed using cross entropy on the predicted and ground truth labels. + - **loss_mask** -- The loss computed using sigmoid cross_entropy loss on the predicted and ground truth + masks. + - **loss_dice** -- The loss computed using dice loss on the predicted on the predicted and ground truth + masks. + if `use_auxiliary_loss` was set to `true` in [`EomtConfig`], the dictionary contains additional + losses for each auxiliary predictions. + """ + + # retrieve the matching between the outputs of the last layer and the labels + indices = self.matcher(masks_queries_logits, class_queries_logits, mask_labels, class_labels) + # compute the average number of target masks for normalization purposes + num_masks = self.get_num_masks(class_labels, device=class_labels[0].device) + # get all the losses + losses: dict[str, Tensor] = { + **self.loss_masks(masks_queries_logits, mask_labels, indices, num_masks), + **self.loss_labels(class_queries_logits, class_labels, indices), + } + # in case of auxiliary losses, we repeat this process with the output of each intermediate layer. 
+ if auxiliary_predictions is not None: + for idx, aux_outputs in enumerate(auxiliary_predictions): + masks_queries_logits = aux_outputs["masks_queries_logits"] + class_queries_logits = aux_outputs["class_queries_logits"] + loss_dict = self.forward(masks_queries_logits, class_queries_logits, mask_labels, class_labels) + loss_dict = {f"{key}_{idx}": value for key, value in loss_dict.items()} + losses.update(loss_dict) + + return losses + + def get_num_masks(self, class_labels: torch.Tensor, device: torch.device) -> torch.Tensor: + """ + Computes the average number of target masks across the batch, for normalization purposes. + """ + num_masks = sum(len(classes) for classes in class_labels) + num_masks = torch.as_tensor(num_masks, dtype=torch.float, device=device) + world_size = 1 + if is_accelerate_available(): + if PartialState._shared_state != {}: + num_masks = reduce(num_masks) + world_size = PartialState().num_processes + + num_masks = torch.clamp(num_masks / world_size, min=1) + return num_masks + + +class EomtPatchEmbeddings(nn.Module): + """ + This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial + `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a + Transformer. + """ + + def __init__(self, config): + super().__init__() + image_size, patch_size = config.image_size, config.patch_size + num_channels, hidden_size = config.num_channels, config.hidden_size + + image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size) + patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.num_patches = num_patches + + self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size) + + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + num_channels = pixel_values.shape[1] + if num_channels != self.num_channels: + raise ValueError( + "Make sure that the channel dimension of the pixel values match with the one set in the configuration." + f" Expected {self.num_channels} but got {num_channels}." + ) + embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2) + return embeddings + + +class EomtEmbeddings(nn.Module): + """ + Construct the CLS token, mask token, position and patch embeddings. 
+ """ + + def __init__(self, config: EomtConfig) -> None: + super().__init__() + + self.config = config + self.patch_size = config.patch_size + + self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size)) + self.register_tokens = nn.Parameter(torch.zeros(1, config.num_register_tokens, config.hidden_size)) + + self.patch_embeddings = EomtPatchEmbeddings(config) + num_patches = self.patch_embeddings.num_patches + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.num_prefix_tokens = 1 + config.num_register_tokens # 1 for [CLS] + self.position_embeddings = nn.Embedding(num_patches, config.hidden_size) + self.register_buffer("position_ids", torch.arange(num_patches).expand((1, -1)), persistent=False) + + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + batch_size, _, _, _ = pixel_values.shape + target_dtype = self.patch_embeddings.projection.weight.dtype + embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype)) + + cls_tokens = self.cls_token.expand(batch_size, -1, -1) + register_tokens = self.register_tokens.expand(batch_size, -1, -1) + + embeddings = embeddings + self.position_embeddings(self.position_ids) + embeddings = torch.cat([cls_tokens, register_tokens, embeddings], dim=1) + + embeddings = self.dropout(embeddings) + + return embeddings + + +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + dropout: float = 0.0, + **kwargs, +): + attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling + if attention_mask is not None: + attn_weights = attn_weights + attention_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + + attn_output = torch.matmul(attn_weights, value) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +class EomtAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." 
+ ) + self.scale = self.head_dim**-0.5 + self.dropout = config.attention_dropout + self.is_causal = False + + self.k_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.v_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.q_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.out_proj = nn.Linear(self.embed_dim, self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + **kwargs, + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + """Input shape: Batch x Time x Channel""" + + batch_size, seq_length, embed_dim = hidden_states.shape + + queries = self.q_proj(hidden_states) + keys = self.k_proj(hidden_states) + values = self.v_proj(hidden_states) + + queries = queries.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2) + keys = keys.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2) + values = values.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + queries, + keys, + values, + attention_mask, + is_causal=self.is_causal, + scaling=self.scale, + dropout=0.0 if not self.training else self.dropout, + ) + + attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous() + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights + + +class EomtLayerScale(nn.Module): + def __init__(self, config) -> None: + super().__init__() + self.lambda1 = nn.Parameter(config.layerscale_value * torch.ones(config.hidden_size)) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + return hidden_state * self.lambda1 + + +def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor: + """ + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks, + however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the + layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the + argument. 
+ """ + if drop_prob == 0.0 or not training: + return input + keep_prob = 1 - drop_prob + shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device) + random_tensor.floor_() # binarize + output = input.div(keep_prob) * random_tensor + return output + + +class EomtDropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob: Optional[float] = None) -> None: + super().__init__() + self.drop_prob = drop_prob + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + return drop_path(hidden_states, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return f"p={self.drop_prob}" + + +class EomtMLP(nn.Module): + def __init__(self, config) -> None: + super().__init__() + in_features = out_features = config.hidden_size + hidden_features = int(config.hidden_size * config.mlp_ratio) + self.fc1 = nn.Linear(in_features, hidden_features, bias=True) + if isinstance(config.hidden_act, str): + self.activation = ACT2FN[config.hidden_act] + else: + self.activation = config.hidden_act + self.fc2 = nn.Linear(hidden_features, out_features, bias=True) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + hidden_state = self.fc1(hidden_state) + hidden_state = self.activation(hidden_state) + hidden_state = self.fc2(hidden_state) + return hidden_state + + +class EomtSwiGLUFFN(nn.Module): + def __init__(self, config) -> None: + super().__init__() + in_features = out_features = config.hidden_size + hidden_features = int(config.hidden_size * config.mlp_ratio) + hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8 + + self.weights_in = nn.Linear(in_features, 2 * hidden_features, bias=True) + self.weights_out = nn.Linear(hidden_features, out_features, bias=True) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + hidden_state = self.weights_in(hidden_state) + x1, x2 = hidden_state.chunk(2, dim=-1) + hidden = nn.functional.silu(x1) * x2 + return self.weights_out(hidden) + + +class EomtLayer(GradientCheckpointingLayer): + """This corresponds to the Block class in the original implementation.""" + + def __init__(self, config: EomtConfig) -> None: + super().__init__() + + self.norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.attention = EomtAttention(config) + self.layer_scale1 = EomtLayerScale(config) + self.drop_path = EomtDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity() + + self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + if config.use_swiglu_ffn: + self.mlp = EomtSwiGLUFFN(config) + else: + self.mlp = EomtMLP(config) + self.layer_scale2 = EomtLayerScale(config) + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + hidden_states_norm = self.norm1(hidden_states) + self_attention_output, _ = self.attention(hidden_states_norm, head_mask) + self_attention_output = self.layer_scale1(self_attention_output) + + # first residual connection + hidden_states = self.drop_path(self_attention_output) + hidden_states + + # in Eomt, layernorm is also applied after self-attention + layer_output = self.norm2(hidden_states) + layer_output = self.mlp(layer_output) + layer_output = self.layer_scale2(layer_output) + + # second residual connection + layer_output = self.drop_path(layer_output) + 
hidden_states + + return layer_output + + +class EomtLayerNorm2d(nn.LayerNorm): + def __init__(self, num_channels, eps=1e-6, affine=True): + super().__init__(num_channels, eps=eps, elementwise_affine=affine) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + hidden_state = hidden_state.permute(0, 2, 3, 1) + hidden_state = F.layer_norm(hidden_state, self.normalized_shape, self.weight, self.bias, self.eps) + hidden_state = hidden_state.permute(0, 3, 1, 2) + return hidden_state + + +class EomtScaleLayer(nn.Module): + def __init__(self, config: EomtConfig): + super().__init__() + hidden_size = config.hidden_size + self.conv1 = nn.ConvTranspose2d(hidden_size, hidden_size, kernel_size=2, stride=2) + self.activation = ACT2FN[config.hidden_act] + self.conv2 = nn.Conv2d( + hidden_size, + hidden_size, + kernel_size=3, + padding=1, + groups=hidden_size, + bias=False, + ) + + self.layernorm2d = EomtLayerNorm2d(hidden_size) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.conv1(hidden_states) + hidden_states = self.activation(hidden_states) + hidden_states = self.conv2(hidden_states) + hidden_states = self.layernorm2d(hidden_states) + return hidden_states + + +class EomtScaleBlock(nn.Module): + def __init__(self, config: EomtConfig): + super().__init__() + self.num_blocks = config.num_upscale_blocks + self.block = nn.ModuleList([EomtScaleLayer(config) for _ in range(self.num_blocks)]) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + for block in self.block: + hidden_states = block(hidden_states) + return hidden_states + + +class EomtMaskHead(nn.Module): + def __init__(self, config: EomtConfig): + super().__init__() + + hidden_size = config.hidden_size + self.fc1 = nn.Linear(hidden_size, hidden_size) + self.fc2 = nn.Linear(hidden_size, hidden_size) + self.fc3 = nn.Linear(hidden_size, hidden_size) + self.activation = ACT2FN[config.hidden_act] + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.activation(self.fc1(hidden_states)) + hidden_states = self.activation(self.fc2(hidden_states)) + hidden_states = self.fc3(hidden_states) + return hidden_states + + +@auto_docstring +class EomtPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config: EomtConfig + base_model_prefix = "eomt" + main_input_name = "pixel_values" + supports_gradient_checkpointing = False + _no_split_modules = ["EomtLayer"] + _supports_sdpa = True + _can_record_outputs = { + "hidden_states": EomtLayer, + "attentions": EomtAttention, + } + + def _init_weights(self, module: nn.Module) -> None: + std = self.config.initializer_range + if isinstance(module, (nn.Linear, nn.Conv2d, nn.ConvTranspose2d)): + nn.init.kaiming_uniform_(module.weight, a=math.sqrt(5)) + if module.bias is not None: + fan_in, _ = nn.init._calculate_fan_in_and_fan_out(module.weight) + bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 + nn.init.uniform_(module.bias, -bound, bound) + elif isinstance(module, nn.LayerNorm): + module.weight.data.fill_(1.0) + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=1) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, EomtLayerScale): + if hasattr(module, "lambda1"): + module.lambda1.data.fill_(self.config.layerscale_value) + elif isinstance(module, EomtEmbeddings): + module.cls_token.data = nn.init.trunc_normal_( + module.cls_token.data.to(torch.float32), mean=0.0, std=std + ).to(module.cls_token.dtype) + module.register_tokens.data.zero_() + + +@auto_docstring( + custom_intro=""" + The EoMT Model with head on top for instance/semantic/panoptic segmentation. + """ +) +class EomtForUniversalSegmentation(EomtPreTrainedModel): + main_input_name = "pixel_values" + + def __init__(self, config: EomtConfig): + super().__init__(config) + self.config = config + self.num_hidden_layers = config.num_hidden_layers + self.embeddings = EomtEmbeddings(config) + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + self.query = nn.Embedding(config.num_queries, config.hidden_size) + self.layers = nn.ModuleList([EomtLayer(config) for _ in range(config.num_hidden_layers)]) + + self.upscale_block = EomtScaleBlock(config) + self.mask_head = EomtMaskHead(config) + + self.class_predictor = nn.Linear(config.hidden_size, config.num_labels + 1) + + self.grid_size = (config.image_size // config.patch_size, config.image_size // config.patch_size) + self.weight_dict: dict[str, float] = { + "loss_cross_entropy": config.class_weight, + "loss_mask": config.mask_weight, + "loss_dice": config.dice_weight, + } + + self.criterion = EomtLoss(config=config, weight_dict=self.weight_dict) + + self.register_buffer("attn_mask_probs", torch.ones(config.num_blocks)) + + self.post_init() + + def get_loss_dict( + self, + masks_queries_logits: Tensor, + class_queries_logits: Tensor, + mask_labels: Tensor, + class_labels: Tensor, + auxiliary_predictions: dict[str, Tensor], + ) -> dict[str, Tensor]: + loss_dict: dict[str, Tensor] = self.criterion( + masks_queries_logits=masks_queries_logits, + class_queries_logits=class_queries_logits, + mask_labels=mask_labels, + class_labels=class_labels, + auxiliary_predictions=auxiliary_predictions, + ) + + # weight each loss by `self.weight_dict[]` including auxiliary losses + for key, weight in self.weight_dict.items(): + for loss_key, loss in loss_dict.items(): + if key in loss_key: + loss *= weight + + return loss_dict + + def get_loss(self, loss_dict: dict[str, Tensor]) -> Tensor: + return sum(loss_dict.values()) + + @check_model_inputs + @auto_docstring + def forward( + self, + pixel_values: Tensor, + mask_labels: Optional[list[Tensor]] = None, + class_labels: Optional[list[Tensor]] = None, + 
patch_offsets: Optional[list[Tensor]] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> EomtForUniversalSegmentationOutput: + r""" + mask_labels (`list[torch.Tensor]`, *optional*): + list of mask labels of shape `(num_labels, height, width)` to be fed to a model + class_labels (`list[torch.LongTensor]`, *optional*): + list of target class labels of shape `(num_labels, height, width)` to be fed to a model. They identify the + labels of `mask_labels`, e.g. the label of `mask_labels[i][j]` if `class_labels[i][j]`. + patch_offsets (`list[torch.Tensor]`, *optional*): + list of tuples indicating the image index and start and end positions of patches for semantic segmentation. + """ + + masks_queries_logits_per_layer, class_queries_logits_per_layer = (), () + attention_mask = None + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + hidden_states = self.embeddings(pixel_values) + + for idx, layer_module in enumerate(self.layers): + if idx == self.num_hidden_layers - self.config.num_blocks: + query = self.query.weight[None, :, :].expand(hidden_states.shape[0], -1, -1).to(hidden_states.device) + hidden_states = torch.cat((query, hidden_states), dim=1) + + if idx >= self.num_hidden_layers - self.config.num_blocks and ( + self.training or self.attn_mask_probs[idx - self.num_hidden_layers + self.config.num_blocks] > 0 + ): + norm_hidden_states = self.layernorm(hidden_states) + masks_queries_logits, class_queries_logits = self.predict(norm_hidden_states) + + masks_queries_logits_per_layer += (masks_queries_logits,) + class_queries_logits_per_layer += (class_queries_logits,) + + attention_mask = torch.ones( + hidden_states.shape[0], + hidden_states.shape[1], + hidden_states.shape[1], + device=hidden_states.device, + dtype=torch.bool, + ) + + interpolated_logits = F.interpolate(masks_queries_logits, size=self.grid_size, mode="bilinear") + interpolated_logits = interpolated_logits.view( + interpolated_logits.size(0), interpolated_logits.size(1), -1 + ) + + num_query_tokens = self.config.num_queries + encoder_start_tokens = num_query_tokens + self.embeddings.num_prefix_tokens + + # Set attention mask for queries to focus on encoder tokens based on interpolated logits + attention_mask[:, :num_query_tokens, encoder_start_tokens:] = interpolated_logits > 0 + + # Disable attention mask for random query tokens. + attention_mask = self._disable_attention_mask( + attention_mask, + prob=self.attn_mask_probs[idx - self.num_hidden_layers + self.config.num_blocks], + num_query_tokens=num_query_tokens, + encoder_start_tokens=encoder_start_tokens, + device=attention_mask.device, + ) + + # Expand attention mask to 4d mask. 
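+            # At this point `attention_mask` is boolean with shape (batch, seq_len, seq_len): each query row may
+            # attend to a patch token only where its interpolated mask logit is positive, except for the randomly
+            # re-enabled queries handled above. The two lines below broadcast the mask per attention head to
+            # (batch, num_attention_heads, seq_len, seq_len), cast it to float, and fill disallowed (False)
+            # positions with -1e9 so the attention softmax assigns them effectively zero weight.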
+ attention_mask = attention_mask[:, None, ...].expand(-1, self.config.num_attention_heads, -1, -1) + attention_mask = attention_mask.float().masked_fill(~attention_mask, -1e9) + + hidden_states = layer_module(hidden_states, attention_mask) + + sequence_output = self.layernorm(hidden_states) + + masks_queries_logits, class_queries_logits = self.predict(sequence_output) + masks_queries_logits_per_layer += (masks_queries_logits,) + class_queries_logits_per_layer += (class_queries_logits,) + + loss = None + if mask_labels is not None and class_labels is not None: + loss = 0.0 + for masks_queries_logits, class_queries_logits in zip( + masks_queries_logits_per_layer, class_queries_logits_per_layer + ): + loss_dict = self.get_loss_dict( + masks_queries_logits=masks_queries_logits, + class_queries_logits=class_queries_logits, + mask_labels=mask_labels, + class_labels=class_labels, + auxiliary_predictions=None, + ) + loss += self.get_loss(loss_dict) + + return EomtForUniversalSegmentationOutput( + loss=loss, + masks_queries_logits=masks_queries_logits, + class_queries_logits=class_queries_logits, + last_hidden_state=sequence_output, + patch_offsets=patch_offsets, + ) + + def get_input_embeddings(self): + return self.embeddings.patch_embeddings + + def predict(self, logits: torch.Tensor): + query_tokens = logits[:, : self.config.num_queries, :] + class_logits = self.class_predictor(query_tokens) + + prefix_tokens = logits[:, self.config.num_queries + self.embeddings.num_prefix_tokens :, :] + prefix_tokens = prefix_tokens.transpose(1, 2) + + prefix_tokens = prefix_tokens.reshape(prefix_tokens.shape[0], -1, *self.grid_size) + + query_tokens = self.mask_head(query_tokens) + prefix_tokens = self.upscale_block(prefix_tokens) + + mask_logits = torch.einsum("bqc, bchw -> bqhw", query_tokens, prefix_tokens) + + return mask_logits, class_logits + + @staticmethod + def _disable_attention_mask(attn_mask, prob, num_query_tokens, encoder_start_tokens, device): + if prob < 1: + # Generate random queries to disable based on the probs + random_queries = torch.rand(attn_mask.shape[0], num_query_tokens, device=device) > prob + + # Disable attention to the query tokens, considering the prefix tokens + attn_mask[:, :num_query_tokens, encoder_start_tokens:][random_queries] = 1 + + return attn_mask + + +__all__ = ["EomtPreTrainedModel", "EomtForUniversalSegmentation"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/eomt/modular_eomt.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/eomt/modular_eomt.py new file mode 100644 index 0000000000000000000000000000000000000000..17fb96ac60aae7d57b510d7e19830d8d1cc0ed9f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/eomt/modular_eomt.py @@ -0,0 +1,601 @@ +# coding=utf-8 +# Copyright 2025 Mobile Perception Systems Lab at TU/e and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""PyTorch EoMT model.""" + +import math +from dataclasses import dataclass +from typing import Optional + +import torch +import torch.nn.functional as F +from torch import Tensor, nn + +from ...activations import ACT2FN +from ...file_utils import ( + ModelOutput, +) +from ...modeling_utils import PreTrainedModel +from ...processing_utils import Unpack +from ...utils import ( + TransformersKwargs, + auto_docstring, + logging, +) +from ...utils.generic import check_model_inputs +from ..dinov2.modeling_dinov2 import ( + Dinov2Embeddings, + Dinov2Layer, + Dinov2LayerScale, + Dinov2PatchEmbeddings, +) +from ..mask2former.modeling_mask2former import Mask2FormerForUniversalSegmentation, Mask2FormerLoss +from ..siglip.modeling_siglip import SiglipAttention +from ..vit.configuration_vit import ViTConfig + + +logger = logging.get_logger(__name__) + + +class EomtConfig(ViTConfig): + r""" + This is the configuration class to store the configuration of a [`EomtForUniversalSegmentation`]. It is used to instantiate an EoMT model + according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the EoMT + [tue-mps/coco_panoptic_eomt_large_640](https://huggingface.co/tue-mps/coco_panoptic_eomt_large_640) + architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + hidden_size (`int`, *optional*, defaults to 1024): + Dimensionality of the hidden representations. + num_hidden_layers (`int`, *optional*, defaults to 24): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads in each attention layer. + mlp_ratio (`int`, *optional*, defaults to 4): + Ratio of the MLP hidden dimensionality to the hidden size. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder. + hidden_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the embeddings and encoder. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the layer normalization layers. + image_size (`int`, *optional*, defaults to 640): + The size (resolution) of each input image. + patch_size (`int`, *optional*, defaults to 16): + The size (resolution) of each patch. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + layerscale_value (`float`, *optional*, defaults to 1.0): + Initial value for the LayerScale parameter. + drop_path_rate (`float`, *optional*, defaults to 0.0): + The stochastic depth rate (drop path) used during training. + num_upscale_blocks (`int`, *optional*, defaults to 2): + Number of upsampling blocks used in the decoder or segmentation head. + attention_dropout (`float`, *optional*, defaults to 0.0): + Dropout probability applied after attention projection. + use_swiglu_ffn (`bool`, *optional*, defaults to `False`): + Whether to use the SwiGLU feedforward neural network. + num_blocks (`int`, *optional*, defaults to 4): + Number of feature blocks or stages in the architecture. 
+ no_object_weight (`float`, *optional*, defaults to 0.1): + Loss weight for the 'no object' class in panoptic/instance segmentation. + class_weight (`float`, *optional*, defaults to 2.0): + Loss weight for classification targets. + mask_weight (`float`, *optional*, defaults to 5.0): + Loss weight for mask prediction. + dice_weight (`float`, *optional*, defaults to 5.0): + Loss weight for the dice loss component. + train_num_points (`int`, *optional*, defaults to 12544): + Number of points to sample for mask loss computation during training. + oversample_ratio (`float`, *optional*, defaults to 3.0): + Oversampling ratio used in point sampling for mask training. + importance_sample_ratio (`float`, *optional*, defaults to 0.75): + Ratio of points to sample based on importance during training. + num_queries (`int`, *optional*, defaults to 200): + Number of object queries in the Transformer. + num_register_tokens (`int`, *optional*, defaults to 4): + Number of learnable register tokens added to the transformer input. + + Example: + + ```python + >>> from transformers import EomtConfig, EomtForUniversalSegmentation + + >>> # Initialize configuration + >>> config = EomtConfig() + + >>> # Initialize model + >>> model = EomtForUniversalSegmentation(config) + + >>> # Access config + >>> config = model.config + ```""" + + model_type = "eomt" + + def __init__( + self, + hidden_size=1024, + num_hidden_layers=24, + num_attention_heads=16, + mlp_ratio=4, + hidden_act="gelu", + hidden_dropout_prob=0.0, + initializer_range=0.02, + layer_norm_eps=1e-6, + image_size=640, + patch_size=16, + num_channels=3, + layerscale_value=1.0, + drop_path_rate=0.0, + num_upscale_blocks=2, + attention_dropout=0.0, + use_swiglu_ffn=False, + num_blocks=4, + no_object_weight: float = 0.1, + class_weight: float = 2.0, + mask_weight: float = 5.0, + dice_weight: float = 5.0, + train_num_points: int = 12544, + oversample_ratio: float = 3.0, + importance_sample_ratio: float = 0.75, + num_queries=200, + num_register_tokens=4, + **kwargs, + ): + super().__init__( + hidden_size=hidden_size, + num_hidden_layers=num_hidden_layers, + num_attention_heads=num_attention_heads, + hidden_dropout_prob=hidden_dropout_prob, + hidden_act=hidden_act, + initializer_range=initializer_range, + layer_norm_eps=layer_norm_eps, + image_size=image_size, + patch_size=patch_size, + num_channels=num_channels, + **kwargs, + ) + + del self.intermediate_size + del self.qkv_bias + del self.pooler_act + del self.pooler_output_size + del self.encoder_stride + del self.attention_probs_dropout_prob + + self.mlp_ratio = mlp_ratio + self.attention_dropout = attention_dropout + self.layerscale_value = layerscale_value + self.drop_path_rate = drop_path_rate + self.num_upscale_blocks = num_upscale_blocks + self.use_swiglu_ffn = use_swiglu_ffn + self.num_blocks = num_blocks + self.no_object_weight = no_object_weight + self.class_weight = class_weight + self.mask_weight = mask_weight + self.dice_weight = dice_weight + self.train_num_points = train_num_points + self.oversample_ratio = oversample_ratio + self.importance_sample_ratio = importance_sample_ratio + self.num_queries = num_queries + self.num_register_tokens = num_register_tokens + + +@dataclass +@auto_docstring( + custom_intro=""" + Class for outputs of [`EomtForUniversalSegmentationOutput`]. 
+ + This output can be directly passed to [`~EomtImageProcessor.post_process_semantic_segmentation`] or + [`~EomtImageProcessor.post_process_instance_segmentation`] or + [`~EomtImageProcessor.post_process_panoptic_segmentation`] to compute final segmentation maps. Please see + [`~EomtImageProcessor`] for details regarding usage. + """ +) +class EomtForUniversalSegmentationOutput(ModelOutput): + r""" + loss (`torch.Tensor`, *optional*): + The computed loss, returned when labels are present. + class_queries_logits (`torch.FloatTensor`): + A tensor of shape `(batch_size, num_queries, num_labels + 1)` representing the proposed classes for each + query. Note the `+ 1` is needed because we incorporate the null class. + masks_queries_logits (`torch.FloatTensor`): + A tensor of shape `(batch_size, num_queries, height, width)` representing the proposed masks for each + query. + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Last hidden states (final feature map) of the last layer. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of all layers of the model. + attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tuple(torch.FloatTensor)` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Self and cross attention weights from the transformer decoder. + patch_offsets (`list[torch.Tensor]`, *optional*): + list of tuples indicating the image index and start and end positions of patches for semantic segmentation.
+ """ + + loss: Optional[torch.FloatTensor] = None + class_queries_logits: Optional[torch.FloatTensor] = None + masks_queries_logits: Optional[torch.FloatTensor] = None + last_hidden_state: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + patch_offsets: Optional[list[torch.Tensor]] = None + + +class EomtLoss(Mask2FormerLoss): + pass + + +class EomtPatchEmbeddings(Dinov2PatchEmbeddings): + pass + + +class EomtEmbeddings(Dinov2Embeddings): + def __init__(self, config: EomtConfig) -> None: + nn.Module.__init__(self) + + self.config = config + self.patch_size = config.patch_size + + self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size)) + self.register_tokens = nn.Parameter(torch.zeros(1, config.num_register_tokens, config.hidden_size)) + + self.patch_embeddings = EomtPatchEmbeddings(config) + num_patches = self.patch_embeddings.num_patches + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.num_prefix_tokens = 1 + config.num_register_tokens # 1 for [CLS] + self.position_embeddings = nn.Embedding(num_patches, config.hidden_size) + self.register_buffer("position_ids", torch.arange(num_patches).expand((1, -1)), persistent=False) + + def interpolate_pos_encoding(self): + raise AttributeError("Not needed for Eomt Model") + + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + batch_size, _, _, _ = pixel_values.shape + target_dtype = self.patch_embeddings.projection.weight.dtype + embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype)) + + cls_tokens = self.cls_token.expand(batch_size, -1, -1) + register_tokens = self.register_tokens.expand(batch_size, -1, -1) + + embeddings = embeddings + self.position_embeddings(self.position_ids) + embeddings = torch.cat([cls_tokens, register_tokens, embeddings], dim=1) + + embeddings = self.dropout(embeddings) + + return embeddings + + +class EomtAttention(SiglipAttention): + pass + + +class EomtLayerScale(Dinov2LayerScale): + pass + + +class EomtLayer(Dinov2Layer): + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + hidden_states_norm = self.norm1(hidden_states) + self_attention_output, _ = self.attention(hidden_states_norm, head_mask) + self_attention_output = self.layer_scale1(self_attention_output) + + # first residual connection + hidden_states = self.drop_path(self_attention_output) + hidden_states + + # in Eomt, layernorm is also applied after self-attention + layer_output = self.norm2(hidden_states) + layer_output = self.mlp(layer_output) + layer_output = self.layer_scale2(layer_output) + + # second residual connection + layer_output = self.drop_path(layer_output) + hidden_states + + return layer_output + + +class EomtLayerNorm2d(nn.LayerNorm): + def __init__(self, num_channels, eps=1e-6, affine=True): + super().__init__(num_channels, eps=eps, elementwise_affine=affine) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + hidden_state = hidden_state.permute(0, 2, 3, 1) + hidden_state = F.layer_norm(hidden_state, self.normalized_shape, self.weight, self.bias, self.eps) + hidden_state = hidden_state.permute(0, 3, 1, 2) + return hidden_state + + +class EomtScaleLayer(nn.Module): + def __init__(self, config: EomtConfig): + super().__init__() + hidden_size = config.hidden_size + self.conv1 = nn.ConvTranspose2d(hidden_size, hidden_size, kernel_size=2, stride=2) + self.activation = ACT2FN[config.hidden_act] + self.conv2 = 
nn.Conv2d( + hidden_size, + hidden_size, + kernel_size=3, + padding=1, + groups=hidden_size, + bias=False, + ) + + self.layernorm2d = EomtLayerNorm2d(hidden_size) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.conv1(hidden_states) + hidden_states = self.activation(hidden_states) + hidden_states = self.conv2(hidden_states) + hidden_states = self.layernorm2d(hidden_states) + return hidden_states + + +class EomtScaleBlock(nn.Module): + def __init__(self, config: EomtConfig): + super().__init__() + self.num_blocks = config.num_upscale_blocks + self.block = nn.ModuleList([EomtScaleLayer(config) for _ in range(self.num_blocks)]) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + for block in self.block: + hidden_states = block(hidden_states) + return hidden_states + + +class EomtMaskHead(nn.Module): + def __init__(self, config: EomtConfig): + super().__init__() + + hidden_size = config.hidden_size + self.fc1 = nn.Linear(hidden_size, hidden_size) + self.fc2 = nn.Linear(hidden_size, hidden_size) + self.fc3 = nn.Linear(hidden_size, hidden_size) + self.activation = ACT2FN[config.hidden_act] + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.activation(self.fc1(hidden_states)) + hidden_states = self.activation(self.fc2(hidden_states)) + hidden_states = self.fc3(hidden_states) + return hidden_states + + +@auto_docstring +class EomtPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config: EomtConfig + base_model_prefix = "eomt" + main_input_name = "pixel_values" + supports_gradient_checkpointing = False + _no_split_modules = ["EomtLayer"] + _supports_sdpa = True + _can_record_outputs = { + "hidden_states": EomtLayer, + "attentions": EomtAttention, + } + + def _init_weights(self, module: nn.Module) -> None: + std = self.config.initializer_range + if isinstance(module, (nn.Linear, nn.Conv2d, nn.ConvTranspose2d)): + nn.init.kaiming_uniform_(module.weight, a=math.sqrt(5)) + if module.bias is not None: + fan_in, _ = nn.init._calculate_fan_in_and_fan_out(module.weight) + bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 + nn.init.uniform_(module.bias, -bound, bound) + elif isinstance(module, nn.LayerNorm): + module.weight.data.fill_(1.0) + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=1) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, EomtLayerScale): + if hasattr(module, "lambda1"): + module.lambda1.data.fill_(self.config.layerscale_value) + elif isinstance(module, EomtEmbeddings): + module.cls_token.data = nn.init.trunc_normal_( + module.cls_token.data.to(torch.float32), mean=0.0, std=std + ).to(module.cls_token.dtype) + module.register_tokens.data.zero_() + + +@auto_docstring( + custom_intro=""" + The EoMT Model with head on top for instance/semantic/panoptic segmentation. 
+ """ +) +class EomtForUniversalSegmentation(Mask2FormerForUniversalSegmentation): + def __init__(self, config: EomtConfig): + PreTrainedModel.__init__(self, config) + self.config = config + self.num_hidden_layers = config.num_hidden_layers + self.embeddings = EomtEmbeddings(config) + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + self.query = nn.Embedding(config.num_queries, config.hidden_size) + self.layers = nn.ModuleList([EomtLayer(config) for _ in range(config.num_hidden_layers)]) + + self.upscale_block = EomtScaleBlock(config) + self.mask_head = EomtMaskHead(config) + + self.class_predictor = nn.Linear(config.hidden_size, config.num_labels + 1) + + self.grid_size = (config.image_size // config.patch_size, config.image_size // config.patch_size) + self.weight_dict: dict[str, float] = { + "loss_cross_entropy": config.class_weight, + "loss_mask": config.mask_weight, + "loss_dice": config.dice_weight, + } + + self.criterion = EomtLoss(config=config, weight_dict=self.weight_dict) + + self.register_buffer("attn_mask_probs", torch.ones(config.num_blocks)) + + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.patch_embeddings + + def get_auxiliary_logits(self): + raise AttributeError("Note needed for Eomt Model.") + + def predict(self, logits: torch.Tensor): + query_tokens = logits[:, : self.config.num_queries, :] + class_logits = self.class_predictor(query_tokens) + + prefix_tokens = logits[:, self.config.num_queries + self.embeddings.num_prefix_tokens :, :] + prefix_tokens = prefix_tokens.transpose(1, 2) + + prefix_tokens = prefix_tokens.reshape(prefix_tokens.shape[0], -1, *self.grid_size) + + query_tokens = self.mask_head(query_tokens) + prefix_tokens = self.upscale_block(prefix_tokens) + + mask_logits = torch.einsum("bqc, bchw -> bqhw", query_tokens, prefix_tokens) + + return mask_logits, class_logits + + @staticmethod + def _disable_attention_mask(attn_mask, prob, num_query_tokens, encoder_start_tokens, device): + if prob < 1: + # Generate random queries to disable based on the probs + random_queries = torch.rand(attn_mask.shape[0], num_query_tokens, device=device) > prob + + # Disable attention to the query tokens, considering the prefix tokens + attn_mask[:, :num_query_tokens, encoder_start_tokens:][random_queries] = 1 + + return attn_mask + + @check_model_inputs + @auto_docstring + def forward( + self, + pixel_values: Tensor, + mask_labels: Optional[list[Tensor]] = None, + class_labels: Optional[list[Tensor]] = None, + patch_offsets: Optional[list[Tensor]] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> EomtForUniversalSegmentationOutput: + r""" + mask_labels (`list[torch.Tensor]`, *optional*): + list of mask labels of shape `(num_labels, height, width)` to be fed to a model + class_labels (`list[torch.LongTensor]`, *optional*): + list of target class labels of shape `(num_labels, height, width)` to be fed to a model. They identify the + labels of `mask_labels`, e.g. the label of `mask_labels[i][j]` if `class_labels[i][j]`. + patch_offsets (`list[torch.Tensor]`, *optional*): + list of tuples indicating the image index and start and end positions of patches for semantic segmentation. 
+ """ + + masks_queries_logits_per_layer, class_queries_logits_per_layer = (), () + attention_mask = None + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + hidden_states = self.embeddings(pixel_values) + + for idx, layer_module in enumerate(self.layers): + if idx == self.num_hidden_layers - self.config.num_blocks: + query = self.query.weight[None, :, :].expand(hidden_states.shape[0], -1, -1).to(hidden_states.device) + hidden_states = torch.cat((query, hidden_states), dim=1) + + if idx >= self.num_hidden_layers - self.config.num_blocks and ( + self.training or self.attn_mask_probs[idx - self.num_hidden_layers + self.config.num_blocks] > 0 + ): + norm_hidden_states = self.layernorm(hidden_states) + masks_queries_logits, class_queries_logits = self.predict(norm_hidden_states) + + masks_queries_logits_per_layer += (masks_queries_logits,) + class_queries_logits_per_layer += (class_queries_logits,) + + attention_mask = torch.ones( + hidden_states.shape[0], + hidden_states.shape[1], + hidden_states.shape[1], + device=hidden_states.device, + dtype=torch.bool, + ) + + interpolated_logits = F.interpolate(masks_queries_logits, size=self.grid_size, mode="bilinear") + interpolated_logits = interpolated_logits.view( + interpolated_logits.size(0), interpolated_logits.size(1), -1 + ) + + num_query_tokens = self.config.num_queries + encoder_start_tokens = num_query_tokens + self.embeddings.num_prefix_tokens + + # Set attention mask for queries to focus on encoder tokens based on interpolated logits + attention_mask[:, :num_query_tokens, encoder_start_tokens:] = interpolated_logits > 0 + + # Disable attention mask for random query tokens. + attention_mask = self._disable_attention_mask( + attention_mask, + prob=self.attn_mask_probs[idx - self.num_hidden_layers + self.config.num_blocks], + num_query_tokens=num_query_tokens, + encoder_start_tokens=encoder_start_tokens, + device=attention_mask.device, + ) + + # Expand attention mask to 4d mask. 
+ attention_mask = attention_mask[:, None, ...].expand(-1, self.config.num_attention_heads, -1, -1) + attention_mask = attention_mask.float().masked_fill(~attention_mask, -1e9) + + hidden_states = layer_module(hidden_states, attention_mask) + + sequence_output = self.layernorm(hidden_states) + + masks_queries_logits, class_queries_logits = self.predict(sequence_output) + masks_queries_logits_per_layer += (masks_queries_logits,) + class_queries_logits_per_layer += (class_queries_logits,) + + loss = None + if mask_labels is not None and class_labels is not None: + loss = 0.0 + for masks_queries_logits, class_queries_logits in zip( + masks_queries_logits_per_layer, class_queries_logits_per_layer + ): + loss_dict = self.get_loss_dict( + masks_queries_logits=masks_queries_logits, + class_queries_logits=class_queries_logits, + mask_labels=mask_labels, + class_labels=class_labels, + auxiliary_predictions=None, + ) + loss += self.get_loss(loss_dict) + + return EomtForUniversalSegmentationOutput( + loss=loss, + masks_queries_logits=masks_queries_logits, + class_queries_logits=class_queries_logits, + last_hidden_state=sequence_output, + patch_offsets=patch_offsets, + ) + + +__all__ = ["EomtConfig", "EomtPreTrainedModel", "EomtForUniversalSegmentation"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/ernie4_5/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/ernie4_5/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5d6e69432c9a8ac0471c3dd91660eaf912dc149c --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/ernie4_5/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_ernie4_5 import * + from .modeling_ernie4_5 import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/ernie4_5/configuration_ernie4_5.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/ernie4_5/configuration_ernie4_5.py new file mode 100644 index 0000000000000000000000000000000000000000..e6e2795b5daa2bcd2a873997ed8890fa5703b115 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/ernie4_5/configuration_ernie4_5.py @@ -0,0 +1,202 @@ +# Copyright (c) 2025 Baidu, Inc. and HuggingFace Inc. team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Ernie 4.5 model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...modeling_rope_utils import rope_config_validation + + +class Ernie4_5Config(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Ernie4_5Model`]. It is used to instantiate an Ernie 4.5 + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the Ernie 4.5 0.3B. + e.g. [baidu/ERNIE-4.5-0.3B-PT](https://huggingface.co/baidu/ERNIE-4.5-0.3B-PT) + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 103424): + Vocabulary size of the Ernie 4.5 model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`Ernie4_5Model`] + hidden_size (`int`, *optional*, defaults to 1024): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 18): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer decoder. + num_key_value_heads (`int`, *optional*, defaults to 2): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details, check out [this + paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to + `num_attention_heads`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 131072): + The maximum sequence length that this model might ever be used with. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions. + pad_token_id (`int`, *optional*, defaults to 0): + Padding token id. + bos_token_id (`int`, *optional*, defaults to 1): + Beginning of stream token id. + eos_token_id (`int`, *optional*, defaults to 2): + End of stream token id. 
+ tie_word_embeddings (`bool`, *optional*, defaults to `True`): + Whether to tie weight embeddings + rope_theta (`float`, *optional*, defaults to 500000.0): + The base period of the RoPE embeddings. + rope_scaling (`Dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type + and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value + accordingly. + Expected contents: + `rope_type` (`str`): + The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', + 'llama3'], with 'default' being the original RoPE implementation. + `factor` (`float`, *optional*): + Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In + most scaling types, a `factor` of x will enable the model to handle sequences of length x * + original maximum pre-trained length. + `original_max_position_embeddings` (`int`, *optional*): + Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during + pretraining. + `attention_factor` (`float`, *optional*): + Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention + computation. If unspecified, it defaults to value recommended by the implementation, using the + `factor` field to infer the suggested value. + `beta_fast` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear + ramp function. If unspecified, it defaults to 32. + `beta_slow` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear + ramp function. If unspecified, it defaults to 1. + `short_factor` (`list[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to short contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `long_factor` (`list[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to long contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `low_freq_factor` (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE + `high_freq_factor` (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + use_bias (`bool`, *optional*, defaults to `False`): + Whether to use a bias in any of the projections including mlp and attention for example. + head_dim (`int`, *optional*, defaults to 128): + The attention head dimension. 
If None, it will default to hidden_size // num_attention_heads + + ```python + >>> from transformers import Ernie4_5Model, Ernie4_5Config + + >>> # Initializing a Ernie4_5 0.3B style configuration + >>> configuration = Ernie4_5Config() + + >>> # Initializing a model from the 0.3B style configuration + >>> model = Ernie4_5Model(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "ernie4_5" + keys_to_ignore_at_inference = ["past_key_values"] + # Default tensor parallel plan for base model `Ernie4_5Model` + base_model_tp_plan = { + "layers.*.self_attn.q_proj": "colwise", + "layers.*.self_attn.k_proj": "colwise", + "layers.*.self_attn.v_proj": "colwise", + "layers.*.self_attn.o_proj": "rowwise", + "layers.*.mlp.gate_proj": "colwise", + "layers.*.mlp.up_proj": "colwise", + "layers.*.mlp.down_proj": "rowwise", + } + base_model_pp_plan = { + "embed_tokens": (["input_ids"], ["inputs_embeds"]), + "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), + "norm": (["hidden_states"], ["hidden_states"]), + } + + def __init__( + self, + vocab_size=103424, + hidden_size=1024, + intermediate_size=3072, + num_hidden_layers=18, + num_attention_heads=16, + num_key_value_heads=2, + hidden_act="silu", + max_position_embeddings=131072, + initializer_range=0.02, + rms_norm_eps=1e-05, + use_cache=True, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + tie_word_embeddings=True, + rope_theta=500000.0, + rope_scaling=None, + use_bias=False, + head_dim=128, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self.use_bias = use_bias + self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads + # Validate the correctness of rotary position embeddings parameters + # BC: if there is a 'type' field, copy it it to 'rope_type'. + if self.rope_scaling is not None and "type" in self.rope_scaling: + self.rope_scaling["rope_type"] = self.rope_scaling["type"] + rope_config_validation(self) + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + +__all__ = ["Ernie4_5Config"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/ernie4_5/modeling_ernie4_5.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/ernie4_5/modeling_ernie4_5.py new file mode 100644 index 0000000000000000000000000000000000000000..13ec6fb3a3b619e2e6315745ae3a6f37038fdbcd --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/ernie4_5/modeling_ernie4_5.py @@ -0,0 +1,471 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/ernie4_5/modular_ernie4_5.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. 
If any change should be done, please apply the change to the +# modular_ernie4_5.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# Copyright (c) 2025 Baidu, Inc. and HuggingFace Inc. team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Callable, Optional, Union + +import torch +from torch import nn + +from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache +from ...generation import GenerationMixin +from ...integrations import use_kernel_forward_from_hub +from ...masking_utils import create_causal_mask +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast +from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...processing_utils import Unpack +from ...utils import TransformersKwargs, auto_docstring, can_return_tuple +from ...utils.deprecation import deprecate_kwarg +from ...utils.generic import check_model_inputs +from .configuration_ernie4_5 import Ernie4_5Config + + +class Ernie4_5RotaryEmbedding(nn.Module): + inv_freq: torch.Tensor # fix linting for `register_buffer` + + def __init__(self, config: Ernie4_5Config, device=None): + super().__init__() + # BC: "rope_type" was originally "type" + if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): + self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) + else: + self.rope_type = "default" + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + + inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = self.inv_freq + + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) + def forward(self, x, position_ids): + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) + position_ids_expanded = position_ids[:, None, :].float() + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() * self.attention_scaling + sin = emb.sin() * self.attention_scaling + + # keeping it in full precision + return cos, sin + + +class Ernie4_5MLP(nn.Module): + def __init__(self, config: Ernie4_5Config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.use_bias) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.use_bias) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.use_bias) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + return down_proj + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., 0::2] + x2 = x[..., 1::2] + return torch.stack((-x2, x1), dim=-1).flatten(-2) + + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + dropout: float = 0.0, + **kwargs: Unpack[TransformersKwargs], +): + key_states = repeat_kv(key, module.num_key_value_groups) + value_states = repeat_kv(value, module.num_key_value_groups) + + attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling + if attention_mask is not None: + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + attn_output = torch.matmul(attn_weights, value_states) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. 
+ unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + # glm rope style (with full dim) and full precision + original_dtype = q.dtype + + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + + # Interleave them instead of usual shape + cos = cos[..., : cos.shape[-1] // 2].repeat_interleave(2, dim=-1) + sin = sin[..., : sin.shape[-1] // 2].repeat_interleave(2, dim=-1) + + q_embed = (q.float() * cos) + (rotate_half(q).float() * sin) + k_embed = (k.float() * cos) + (rotate_half(k).float() * sin) + + return q_embed.to(original_dtype), k_embed.to(original_dtype) + + +class Ernie4_5Attention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: Ernie4_5Config, layer_idx: int): + super().__init__() + self.config = config + self.layer_idx = layer_idx + self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) + self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads + self.scaling = self.head_dim**-0.5 + + self.attention_dropout = 0.0 + self.is_causal = True + + self.q_proj = nn.Linear(config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.use_bias) + self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.use_bias) + self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.use_bias) + self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.use_bias) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + attention_mask: Optional[torch.Tensor], + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> tuple[torch.Tensor, torch.Tensor]: + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, self.head_dim) + + query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2) + key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2) + value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_values is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs) + + attention_interface: Callable = eager_attention_forward + if 
self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=0.0 if not self.training else self.attention_dropout, + scaling=self.scaling, + **kwargs, + ) + + attn_output = attn_output.reshape(*input_shape, -1).contiguous() + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights + + +@use_kernel_forward_from_hub("RMSNorm") +class Ernie4_5RMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + Ernie4_5RMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + def extra_repr(self): + return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" + + +class Ernie4_5DecoderLayer(GradientCheckpointingLayer): + def __init__(self, config: Ernie4_5Config, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + + self.self_attn = Ernie4_5Attention(config=config, layer_idx=layer_idx) + + self.mlp = Ernie4_5MLP(config) + self.input_layernorm = Ernie4_5RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = Ernie4_5RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + **kwargs: Unpack[TransformersKwargs], + ) -> torch.Tensor: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + # Self Attention + hidden_states, _ = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + **kwargs, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + return hidden_states + + +@auto_docstring +class Ernie4_5PreTrainedModel(PreTrainedModel): + config: Ernie4_5Config + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["Ernie4_5DecoderLayer"] + _skip_keys_device_placement = ["past_key_values"] + _supports_flash_attn = True + _supports_sdpa = True + _supports_flex_attn = True + + _can_compile_fullgraph = True + _supports_attention_backend = True + _can_record_outputs = { + "hidden_states": Ernie4_5DecoderLayer, + "attentions": Ernie4_5Attention, + } + + +@auto_docstring +class Ernie4_5Model(Ernie4_5PreTrainedModel): + def __init__(self, config: Ernie4_5Config): + super().__init__(config) + self.padding_idx = 
config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList( + [Ernie4_5DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.norm = Ernie4_5RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.rotary_emb = Ernie4_5RotaryEmbedding(config=config) + self.gradient_checkpointing = False + + # Initialize weights and apply final processing + self.post_init() + + @check_model_inputs + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + cache_position: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> BaseModelOutputWithPast: + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds: torch.Tensor = self.embed_tokens(input_ids) + + if use_cache and past_key_values is None: + past_key_values = DynamicCache(config=self.config) + + if cache_position is None: + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + cache_position: torch.Tensor = torch.arange( + past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device + ) + + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + causal_mask = create_causal_mask( + config=self.config, + input_embeds=inputs_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + past_key_values=past_key_values, + position_ids=position_ids, + ) + + hidden_states = inputs_embeds + position_embeddings = self.rotary_emb(hidden_states, position_ids) + + for decoder_layer in self.layers[: self.config.num_hidden_layers]: + hidden_states = decoder_layer( + hidden_states, + attention_mask=causal_mask, + position_ids=position_ids, + past_key_values=past_key_values, + cache_position=cache_position, + position_embeddings=position_embeddings, + **kwargs, + ) + + hidden_states = self.norm(hidden_states) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=past_key_values, + ) + + +@auto_docstring +class Ernie4_5ForCausalLM(Ernie4_5PreTrainedModel, GenerationMixin): + _tied_weights_keys = ["lm_head.weight"] + _tp_plan = {"lm_head": "colwise_rep"} + _pp_plan = {"lm_head": (["hidden_states"], ["logits"])} + + def __init__(self, config): + super().__init__(config) + self.model = Ernie4_5Model(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **kwargs: Unpack[TransformersKwargs], + ) -> CausalLMOutputWithPast: + r""" + labels 
(`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + """ + outputs: BaseModelOutputWithPast = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs.last_hidden_state + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) + + loss = None + if labels is not None: + loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs) + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +__all__ = ["Ernie4_5ForCausalLM", "Ernie4_5Model", "Ernie4_5PreTrainedModel"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/ernie4_5/modular_ernie4_5.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/ernie4_5/modular_ernie4_5.py new file mode 100644 index 0000000000000000000000000000000000000000..7cec0232ca6842e72b453f53560ff5da017f794d --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/ernie4_5/modular_ernie4_5.py @@ -0,0 +1,123 @@ +# Copyright (c) 2025 Baidu, Inc. and HuggingFace Inc. team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch Ernie 4.5 model""" + +import torch +from torch import nn + +from ...modeling_rope_utils import dynamic_rope_update +from ...utils import auto_docstring, can_return_tuple +from ..glm.modeling_glm import rotate_half +from ..llama.modeling_llama import ( + LlamaAttention, + LlamaForCausalLM, + LlamaMLP, + LlamaRotaryEmbedding, +) +from .configuration_ernie4_5 import Ernie4_5Config + + +class Ernie4_5RotaryEmbedding(LlamaRotaryEmbedding): + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) + def forward(self, x, position_ids): + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) + position_ids_expanded = position_ids[:, None, :].float() + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() * self.attention_scaling + sin = emb.sin() * self.attention_scaling + + # keeping it in full precision + return cos, sin + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
+ """ + # glm rope style (with full dim) and full precision + original_dtype = q.dtype + + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + + # Interleave them instead of usual shape + cos = cos[..., : cos.shape[-1] // 2].repeat_interleave(2, dim=-1) + sin = sin[..., : sin.shape[-1] // 2].repeat_interleave(2, dim=-1) + + q_embed = (q.float() * cos) + (rotate_half(q).float() * sin) + k_embed = (k.float() * cos) + (rotate_half(k).float() * sin) + + return q_embed.to(original_dtype), k_embed.to(original_dtype) + + +class Ernie4_5MLP(LlamaMLP): + def __init__(self, config: Ernie4_5Config): + super().__init__(config) + + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.use_bias) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.use_bias) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.use_bias) + + +class Ernie4_5Attention(LlamaAttention): + def __init__(self, config: Ernie4_5Config, layer_idx: int): + super().__init__(config, layer_idx) + + self.attention_dropout = 0.0 + + self.q_proj = nn.Linear(config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.use_bias) + self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.use_bias) + self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.use_bias) + self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.use_bias) + + +class Ernie4_5ForCausalLM(LlamaForCausalLM): + @can_return_tuple + @auto_docstring + def forward(self, **super_kwargs): + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. 
+ """ + super().forward(**super_kwargs) + + +__all__ = [ + "Ernie4_5ForCausalLM", + "Ernie4_5Model", # noqa: F822 + "Ernie4_5PreTrainedModel", # noqa: F822 +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3ab11d9f8a3509c9a678cd913157334278f3c418 Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/__pycache__/__init__.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/__pycache__/configuration_esm.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/__pycache__/configuration_esm.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4f629677dd7a1c3d53e79fdbd0e0d5aa7cf68ada Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/__pycache__/configuration_esm.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/__pycache__/modeling_esm.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/__pycache__/modeling_esm.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..95c1a9504f37002607d148df6501c16d9d7e7cb9 Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/__pycache__/modeling_esm.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/__pycache__/modeling_tf_esm.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/__pycache__/modeling_tf_esm.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c56befb7da9dfacdefacb561b9a5f4325ac84e78 Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/__pycache__/modeling_tf_esm.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/__pycache__/tokenization_esm.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/__pycache__/tokenization_esm.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5449563e5e2ced022ad0b644fcb7c8808970e535 Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/__pycache__/tokenization_esm.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..02a8c149ae320dd9b045edc5df31760a4eebefd9 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/__init__.py @@ -0,0 +1,8 @@ +from .chunk_utils import chunk_layer +from .data_transforms import make_atom14_masks +from .feats import atom14_to_atom37, frames_and_literature_positions_to_atom14_pos, torsion_angles_to_frames +from .loss import compute_predicted_aligned_error, compute_tm +from .protein import Protein as OFProtein +from .protein import to_pdb +from .rigid_utils import Rigid, Rotation +from .tensor_utils import dict_multimap, flatten_final_dims, permute_final_dims diff --git 
a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3eb1ff468775242849b5687882d925c0440a109f Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/__pycache__/__init__.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/__pycache__/chunk_utils.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/__pycache__/chunk_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..afeee1c04ffa5d455041aae287f1a05d77c567fa Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/__pycache__/chunk_utils.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/__pycache__/data_transforms.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/__pycache__/data_transforms.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..32bb42319620139ff3713b76a3a761449b46eb58 Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/__pycache__/data_transforms.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/__pycache__/feats.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/__pycache__/feats.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9fc2ed2f2d30a354bae2ffcc099b3fb44b459b39 Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/__pycache__/feats.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/__pycache__/loss.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/__pycache__/loss.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1d8bb5782483d13539993e89058d3f89bae6318d Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/__pycache__/loss.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/__pycache__/protein.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/__pycache__/protein.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c52527e0b77eed0a581a8c9fc1c5a7a2e10abbde Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/__pycache__/protein.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/__pycache__/residue_constants.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/__pycache__/residue_constants.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4c8c8236cc64546b7c344ef40ceb7fa09b4240fe Binary files /dev/null and 
b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/__pycache__/residue_constants.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/__pycache__/rigid_utils.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/__pycache__/rigid_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3beaa8fec407a95cbaabd08fce57b4dd63c977ab Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/__pycache__/rigid_utils.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/__pycache__/tensor_utils.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/__pycache__/tensor_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..11b8ec9ba7b33427c12873860f68658af00029e3 Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/__pycache__/tensor_utils.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/chunk_utils.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/chunk_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..a735fcee001a0d94215c7c101ecad523b4fee3fd --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/chunk_utils.py @@ -0,0 +1,398 @@ +# Copyright 2021 AlQuraishi Laboratory +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
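The openfold_utils/__init__.py added earlier in this diff re-exports the helpers defined in the modules that follow. As a hedged orientation sketch (assuming the transformers package in this venv is importable), the public surface can be pulled in directly from the package:

from transformers.models.esm.openfold_utils import (
    chunk_layer,         # chunked application of a layer over flattened batch dims
    make_atom14_masks,   # dense atom14 <-> atom37 index and mask construction
    atom14_to_atom37,    # scatter atom14 coordinates into the atom37 layout
    compute_tm,          # predicted TM-score from pair logits
    to_pdb,              # serialize an OFProtein dataclass to a PDB string
    Rigid, Rotation,     # rigid-body transform utilities
)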
+import logging +import math +from collections.abc import Iterable, Sequence +from functools import partial +from typing import Any, Callable, Optional, Union + +import torch + +from .tensor_utils import tensor_tree_map, tree_map + + +def _fetch_dims(tree: Union[dict, list, tuple, torch.Tensor]) -> list[tuple[int, ...]]: + shapes = [] + if isinstance(tree, dict): + for v in tree.values(): + shapes.extend(_fetch_dims(v)) + elif isinstance(tree, (list, tuple)): + for t in tree: + shapes.extend(_fetch_dims(t)) + elif isinstance(tree, torch.Tensor): + shapes.append(tree.shape) + else: + raise TypeError("Not supported") + + return shapes + + +@torch.jit.ignore +def _flat_idx_to_idx(flat_idx: int, dims: tuple[int, ...]) -> tuple[int, ...]: + idx = [] + for d in reversed(dims): + idx.append(flat_idx % d) + flat_idx = flat_idx // d + + return tuple(reversed(idx)) + + +@torch.jit.ignore +def _get_minimal_slice_set( + start: Sequence[int], + end: Sequence[int], + dims: Sequence[int], + start_edges: Optional[Sequence[bool]] = None, + end_edges: Optional[Sequence[bool]] = None, +) -> list[tuple[slice, ...]]: + """ + Produces an ordered sequence of tensor slices that, when used in sequence on a tensor with shape dims, yields + tensors that contain every leaf in the contiguous range [start, end]. Care is taken to yield a short sequence of + slices, and perhaps even the shortest possible (I'm pretty sure it's the latter). + + end is INCLUSIVE. + """ + + # start_edges and end_edges both indicate whether, starting from any given + # dimension, the start/end index is at the top/bottom edge of the + # corresponding tensor, modeled as a tree + def reduce_edge_list(l: list[bool]) -> None: + tally = True + for i in range(len(l)): + reversed_idx = -1 * (i + 1) + l[reversed_idx] &= tally + tally = l[reversed_idx] + + if start_edges is None: + start_edges = [s == 0 for s in start] + reduce_edge_list(start_edges) + if end_edges is None: + end_edges = [e == (d - 1) for e, d in zip(end, dims)] + reduce_edge_list(end_edges) + + # Base cases. Either start/end are empty and we're done, or the final, + # one-dimensional tensor can be simply sliced + if len(start) == 0: + return [()] + elif len(start) == 1: + return [(slice(start[0], end[0] + 1),)] + + slices: list[tuple[slice, ...]] = [] + path_list: list[slice] = [] + + # Dimensions common to start and end can be selected directly + for s, e in zip(start, end): + if s == e: + path_list.append(slice(s, s + 1)) + else: + break + + path: tuple[slice, ...] 
= tuple(path_list) + divergence_idx = len(path) + + # start == end, and we're done + if divergence_idx == len(dims): + return [path] + + def upper() -> tuple[tuple[slice, ...], ...]: + assert start_edges is not None + assert end_edges is not None + + sdi = start[divergence_idx] + return tuple( + path + (slice(sdi, sdi + 1),) + s + for s in _get_minimal_slice_set( + start[divergence_idx + 1 :], + [d - 1 for d in dims[divergence_idx + 1 :]], + dims[divergence_idx + 1 :], + start_edges=start_edges[divergence_idx + 1 :], + end_edges=[True for _ in end_edges[divergence_idx + 1 :]], + ) + ) + + def lower() -> tuple[tuple[slice, ...], ...]: + assert start_edges is not None + assert end_edges is not None + + edi = end[divergence_idx] + return tuple( + path + (slice(edi, edi + 1),) + s + for s in _get_minimal_slice_set( + [0 for _ in start[divergence_idx + 1 :]], + end[divergence_idx + 1 :], + dims[divergence_idx + 1 :], + start_edges=[True for _ in start_edges[divergence_idx + 1 :]], + end_edges=end_edges[divergence_idx + 1 :], + ) + ) + + # If both start and end are at the edges of the subtree rooted at + # divergence_idx, we can just select the whole subtree at once + if start_edges[divergence_idx] and end_edges[divergence_idx]: + slices.append(path + (slice(start[divergence_idx], end[divergence_idx] + 1),)) + # If just start is at the edge, we can grab almost all of the subtree, + # treating only the ragged bottom edge as an edge case + elif start_edges[divergence_idx]: + slices.append(path + (slice(start[divergence_idx], end[divergence_idx]),)) + slices.extend(lower()) + # Analogous to the previous case, but the top is ragged this time + elif end_edges[divergence_idx]: + slices.extend(upper()) + slices.append(path + (slice(start[divergence_idx] + 1, end[divergence_idx] + 1),)) + # If both sides of the range are ragged, we need to handle both sides + # separately. If there's contiguous meat in between them, we can index it + # in one big chunk + else: + slices.extend(upper()) + middle_ground = end[divergence_idx] - start[divergence_idx] + if middle_ground > 1: + slices.append(path + (slice(start[divergence_idx] + 1, end[divergence_idx]),)) + slices.extend(lower()) + + return slices + + +@torch.jit.ignore +def _chunk_slice(t: torch.Tensor, flat_start: int, flat_end: int, no_batch_dims: int) -> torch.Tensor: + """ + Equivalent to + + t.reshape((-1,) + t.shape[no_batch_dims:])[flat_start:flat_end] + + but without the need for the initial reshape call, which can be memory-intensive in certain situations. The only + reshape operations in this function are performed on sub-tensors that scale with (flat_end - flat_start), the chunk + size. + """ + + batch_dims = t.shape[:no_batch_dims] + start_idx = list(_flat_idx_to_idx(flat_start, batch_dims)) + # _get_minimal_slice_set is inclusive + end_idx = list(_flat_idx_to_idx(flat_end - 1, batch_dims)) + + # Get an ordered list of slices to perform + slices = _get_minimal_slice_set( + start_idx, + end_idx, + batch_dims, + ) + + sliced_tensors = [t[s] for s in slices] + + return torch.cat([s.view((-1,) + t.shape[no_batch_dims:]) for s in sliced_tensors]) + + +def chunk_layer( + layer: Callable, + inputs: dict[str, Any], + chunk_size: int, + no_batch_dims: int, + low_mem: bool = False, + _out: Any = None, + _add_into_out: bool = False, +) -> Any: + """ + Implements the "chunking" procedure described in section 1.11.8. 
+ + Layer outputs and inputs are assumed to be simple "pytrees," consisting only of (arbitrarily nested) lists, tuples, + and dicts with torch.Tensor leaves. + + Args: + layer: + The layer to be applied chunk-wise + inputs: + A (non-nested) dictionary of keyworded inputs. All leaves must be tensors and must share the same batch + dimensions. + chunk_size: + The number of sub-batches per chunk. If multiple batch dimensions are specified, a "sub-batch" is defined + as a single indexing of all batch dimensions simultaneously (s.t. the number of sub-batches is the product + of the batch dimensions). + no_batch_dims: + How many of the initial dimensions of each input tensor can be considered batch dimensions. + low_mem: + Avoids flattening potentially large input tensors. Unnecessary in most cases, and is ever so slightly + slower than the default setting. + Returns: + The reassembled output of the layer on the inputs. + """ + if not (len(inputs) > 0): + raise ValueError("Must provide at least one input") + + initial_dims = [shape[:no_batch_dims] for shape in _fetch_dims(inputs)] + orig_batch_dims = tuple(max(s) for s in zip(*initial_dims)) + + def _prep_inputs(t: torch.Tensor) -> torch.Tensor: + if not low_mem: + if sum(t.shape[:no_batch_dims]) != no_batch_dims: + t = t.expand(orig_batch_dims + t.shape[no_batch_dims:]) + t = t.reshape(-1, *t.shape[no_batch_dims:]) + else: + t = t.expand(orig_batch_dims + t.shape[no_batch_dims:]) + return t + + prepped_inputs: dict[str, Any] = tensor_tree_map(_prep_inputs, inputs) + prepped_outputs = None + if _out is not None: + prepped_outputs = tensor_tree_map(lambda t: t.view([-1] + list(t.shape[no_batch_dims:])), _out) + + flat_batch_dim = 1 + for d in orig_batch_dims: + flat_batch_dim *= d + + no_chunks = flat_batch_dim // chunk_size + (flat_batch_dim % chunk_size != 0) + + def _select_chunk(t: torch.Tensor) -> torch.Tensor: + return t[i : i + chunk_size] if t.shape[0] != 1 else t + + i = 0 + out = prepped_outputs + for _ in range(no_chunks): + # Chunk the input + if not low_mem: + select_chunk = _select_chunk + else: + select_chunk = partial( + _chunk_slice, + flat_start=i, + flat_end=min(flat_batch_dim, i + chunk_size), + no_batch_dims=len(orig_batch_dims), + ) + + chunks: dict[str, Any] = tensor_tree_map(select_chunk, prepped_inputs) + + # Run the layer on the chunk + output_chunk = layer(**chunks) + + # Allocate space for the output + if out is None: + out = tensor_tree_map(lambda t: t.new_zeros((flat_batch_dim,) + t.shape[1:]), output_chunk) + + # Put the chunk in its pre-allocated space + if isinstance(output_chunk, dict): + + def assign(d1: dict, d2: dict) -> None: + for k, v in d1.items(): + if isinstance(v, dict): + assign(v, d2[k]) + else: + if _add_into_out: + v[i : i + chunk_size] += d2[k] + else: + v[i : i + chunk_size] = d2[k] + + assign(out, output_chunk) + elif isinstance(output_chunk, tuple): + for x1, x2 in zip(out, output_chunk): + if _add_into_out: + x1[i : i + chunk_size] += x2 + else: + x1[i : i + chunk_size] = x2 + elif isinstance(output_chunk, torch.Tensor): + if _add_into_out: + out[i : i + chunk_size] += output_chunk + else: + out[i : i + chunk_size] = output_chunk + else: + raise TypeError("Not supported") + + i += chunk_size + + out = tensor_tree_map(lambda t: t.view(orig_batch_dims + t.shape[1:]), out) + + return out + + +class ChunkSizeTuner: + def __init__( + self, + # Heuristically, runtimes for most of the modules in the network + # plateau earlier than this on all GPUs I've run the model on. 
+ max_chunk_size: int = 512, + ): + self.max_chunk_size = max_chunk_size + self.cached_chunk_size: Optional[int] = None + self.cached_arg_data: Optional[tuple] = None + + def _determine_favorable_chunk_size(self, fn: Callable, args: tuple, min_chunk_size: int) -> int: + logging.info("Tuning chunk size...") + + if min_chunk_size >= self.max_chunk_size: + return min_chunk_size + + candidates: list[int] = [2**l for l in range(int(math.log2(self.max_chunk_size)) + 1)] + candidates = [c for c in candidates if c > min_chunk_size] + candidates = [min_chunk_size] + candidates + candidates[-1] += 4 + + def test_chunk_size(chunk_size: int) -> bool: + try: + with torch.no_grad(): + fn(*args, chunk_size=chunk_size) + return True + except RuntimeError: + return False + + min_viable_chunk_size_index = 0 + i = len(candidates) - 1 + while i > min_viable_chunk_size_index: + viable = test_chunk_size(candidates[i]) + if not viable: + i = (min_viable_chunk_size_index + i) // 2 + else: + min_viable_chunk_size_index = i + i = (i + len(candidates) - 1) // 2 + + return candidates[min_viable_chunk_size_index] + + def _compare_arg_caches(self, ac1: Iterable, ac2: Iterable) -> bool: + consistent = True + for a1, a2 in zip(ac1, ac2): + assert type(ac1) is type(ac2) + if isinstance(ac1, (list, tuple)): + consistent &= self._compare_arg_caches(a1, a2) + elif isinstance(ac1, dict): + a1_items = [v for _, v in sorted(a1.items(), key=lambda x: x[0])] + a2_items = [v for _, v in sorted(a2.items(), key=lambda x: x[0])] + consistent &= self._compare_arg_caches(a1_items, a2_items) + else: + consistent &= a1 == a2 + + return consistent + + def tune_chunk_size( + self, + representative_fn: Callable, + args: tuple, + min_chunk_size: int, + ) -> int: + consistent = True + arg_data: tuple = tree_map(lambda a: a.shape if isinstance(a, torch.Tensor) else a, args, object) + if self.cached_arg_data is not None: + # If args have changed shape/value, we need to re-tune + assert len(self.cached_arg_data) == len(arg_data) + consistent = self._compare_arg_caches(self.cached_arg_data, arg_data) + else: + # Otherwise, we can reuse the precomputed value + consistent = False + + if not consistent: + self.cached_chunk_size = self._determine_favorable_chunk_size( + representative_fn, + args, + min_chunk_size, + ) + self.cached_arg_data = arg_data + + assert self.cached_chunk_size is not None + + return self.cached_chunk_size diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/data_transforms.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/data_transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..bcd67aacab8e5ee6d141166d1ec7fa94cba84e6d --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/data_transforms.py @@ -0,0 +1,90 @@ +# Copyright 2021 AlQuraishi Laboratory +# Copyright 2021 DeepMind Technologies Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
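chunk_layer, defined above, applies a callable over fixed-size sub-batches of its flattened batch dimensions to bound peak memory. A minimal hedged sketch (the toy pair_bias layer, tensor shapes, and chunk size are illustrative and not taken from the source):

import torch
from transformers.models.esm.openfold_utils import chunk_layer

def pair_bias(x, bias):
    # Any callable whose tensor inputs share the leading batch dimensions works here.
    return x + bias

x = torch.randn(2, 64, 32)
bias = torch.randn(2, 64, 32)
# Treat the first two dims (2, 64) as batch dims and process 16 sub-batches at a time.
out = chunk_layer(pair_bias, {"x": x, "bias": bias}, chunk_size=16, no_batch_dims=2)
assert out.shape == (2, 64, 32)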
+ + +import numpy as np +import torch + +from . import residue_constants as rc +from .tensor_utils import tensor_tree_map, tree_map + + +def make_atom14_masks(protein: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]: + """Construct denser atom positions (14 dimensions instead of 37).""" + restype_atom14_to_atom37_list = [] + restype_atom37_to_atom14_list = [] + restype_atom14_mask_list = [] + + for rt in rc.restypes: + atom_names = rc.restype_name_to_atom14_names[rc.restype_1to3[rt]] + restype_atom14_to_atom37_list.append([(rc.atom_order[name] if name else 0) for name in atom_names]) + atom_name_to_idx14 = {name: i for i, name in enumerate(atom_names)} + restype_atom37_to_atom14_list.append([(atom_name_to_idx14.get(name, 0)) for name in rc.atom_types]) + + restype_atom14_mask_list.append([(1.0 if name else 0.0) for name in atom_names]) + + # Add dummy mapping for restype 'UNK' + restype_atom14_to_atom37_list.append([0] * 14) + restype_atom37_to_atom14_list.append([0] * 37) + restype_atom14_mask_list.append([0.0] * 14) + + restype_atom14_to_atom37 = torch.tensor( + restype_atom14_to_atom37_list, + dtype=torch.int32, + device=protein["aatype"].device, + ) + restype_atom37_to_atom14 = torch.tensor( + restype_atom37_to_atom14_list, + dtype=torch.int32, + device=protein["aatype"].device, + ) + restype_atom14_mask = torch.tensor( + restype_atom14_mask_list, + dtype=torch.float32, + device=protein["aatype"].device, + ) + protein_aatype = protein["aatype"].to(torch.long) + + # create the mapping for (residx, atom14) --> atom37, i.e. an array + # with shape (num_res, 14) containing the atom37 indices for this protein + residx_atom14_to_atom37 = restype_atom14_to_atom37[protein_aatype] + residx_atom14_mask = restype_atom14_mask[protein_aatype] + + protein["atom14_atom_exists"] = residx_atom14_mask + protein["residx_atom14_to_atom37"] = residx_atom14_to_atom37.long() + + # create the gather indices for mapping back + residx_atom37_to_atom14 = restype_atom37_to_atom14[protein_aatype] + protein["residx_atom37_to_atom14"] = residx_atom37_to_atom14.long() + + # create the corresponding mask + restype_atom37_mask = torch.zeros([21, 37], dtype=torch.float32, device=protein["aatype"].device) + for restype, restype_letter in enumerate(rc.restypes): + restype_name = rc.restype_1to3[restype_letter] + atom_names = rc.residue_atoms[restype_name] + for atom_name in atom_names: + atom_type = rc.atom_order[atom_name] + restype_atom37_mask[restype, atom_type] = 1 + + residx_atom37_mask = restype_atom37_mask[protein_aatype] + protein["atom37_atom_exists"] = residx_atom37_mask + + return protein + + +def make_atom14_masks_np(batch: dict[str, torch.Tensor]) -> dict[str, np.ndarray]: + batch = tree_map(lambda n: torch.tensor(n, device=batch["aatype"].device), batch, np.ndarray) + out = tensor_tree_map(lambda t: np.array(t), make_atom14_masks(batch)) + return out diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/feats.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/feats.py new file mode 100644 index 0000000000000000000000000000000000000000..366f3c47d9466d0c5556b6eb7c365393785a82aa --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/feats.py @@ -0,0 +1,253 @@ +# Copyright 2021 AlQuraishi Laboratory +# Copyright 2021 DeepMind Technologies Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the 
License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import overload + +import torch +import torch.types +from torch import nn + +from . import residue_constants as rc +from .rigid_utils import Rigid, Rotation +from .tensor_utils import batched_gather + + +@overload +def pseudo_beta_fn(aatype: torch.Tensor, all_atom_positions: torch.Tensor, all_atom_masks: None) -> torch.Tensor: ... + + +@overload +def pseudo_beta_fn( + aatype: torch.Tensor, all_atom_positions: torch.Tensor, all_atom_masks: torch.Tensor +) -> tuple[torch.Tensor, torch.Tensor]: ... + + +def pseudo_beta_fn(aatype, all_atom_positions, all_atom_masks): + is_gly = aatype == rc.restype_order["G"] + ca_idx = rc.atom_order["CA"] + cb_idx = rc.atom_order["CB"] + pseudo_beta = torch.where( + is_gly[..., None].expand(*((-1,) * len(is_gly.shape)), 3), + all_atom_positions[..., ca_idx, :], + all_atom_positions[..., cb_idx, :], + ) + + if all_atom_masks is not None: + pseudo_beta_mask = torch.where( + is_gly, + all_atom_masks[..., ca_idx], + all_atom_masks[..., cb_idx], + ) + return pseudo_beta, pseudo_beta_mask + else: + return pseudo_beta + + +def atom14_to_atom37(atom14: torch.Tensor, batch: dict[str, torch.Tensor]) -> torch.Tensor: + atom37_data = batched_gather( + atom14, + batch["residx_atom37_to_atom14"], + dim=-2, + no_batch_dims=len(atom14.shape[:-2]), + ) + + atom37_data = atom37_data * batch["atom37_atom_exists"][..., None] + + return atom37_data + + +def build_template_angle_feat(template_feats: dict[str, torch.Tensor]) -> torch.Tensor: + template_aatype = template_feats["template_aatype"] + torsion_angles_sin_cos = template_feats["template_torsion_angles_sin_cos"] + alt_torsion_angles_sin_cos = template_feats["template_alt_torsion_angles_sin_cos"] + torsion_angles_mask = template_feats["template_torsion_angles_mask"] + template_angle_feat = torch.cat( + [ + nn.functional.one_hot(template_aatype, 22), + torsion_angles_sin_cos.reshape(*torsion_angles_sin_cos.shape[:-2], 14), + alt_torsion_angles_sin_cos.reshape(*alt_torsion_angles_sin_cos.shape[:-2], 14), + torsion_angles_mask, + ], + dim=-1, + ) + + return template_angle_feat + + +def build_template_pair_feat( + batch: dict[str, torch.Tensor], + min_bin: torch.types.Number, + max_bin: torch.types.Number, + no_bins: int, + use_unit_vector: bool = False, + eps: float = 1e-20, + inf: float = 1e8, +) -> torch.Tensor: + template_mask = batch["template_pseudo_beta_mask"] + template_mask_2d = template_mask[..., None] * template_mask[..., None, :] + + # Compute distogram (this seems to differ slightly from Alg. 
5) + tpb = batch["template_pseudo_beta"] + dgram = torch.sum((tpb[..., None, :] - tpb[..., None, :, :]) ** 2, dim=-1, keepdim=True) + lower = torch.linspace(min_bin, max_bin, no_bins, device=tpb.device) ** 2 + upper = torch.cat([lower[1:], lower.new_tensor([inf])], dim=-1) + dgram = ((dgram > lower) * (dgram < upper)).type(dgram.dtype) + + to_concat = [dgram, template_mask_2d[..., None]] + + aatype_one_hot: torch.LongTensor = nn.functional.one_hot( + batch["template_aatype"], + rc.restype_num + 2, + ) + + n_res = batch["template_aatype"].shape[-1] + to_concat.append(aatype_one_hot[..., None, :, :].expand(*aatype_one_hot.shape[:-2], n_res, -1, -1)) + to_concat.append(aatype_one_hot[..., None, :].expand(*aatype_one_hot.shape[:-2], -1, n_res, -1)) + + n, ca, c = [rc.atom_order[a] for a in ["N", "CA", "C"]] + rigids = Rigid.make_transform_from_reference( + n_xyz=batch["template_all_atom_positions"][..., n, :], + ca_xyz=batch["template_all_atom_positions"][..., ca, :], + c_xyz=batch["template_all_atom_positions"][..., c, :], + eps=eps, + ) + points = rigids.get_trans()[..., None, :, :] + rigid_vec = rigids[..., None].invert_apply(points) + + inv_distance_scalar = torch.rsqrt(eps + torch.sum(rigid_vec**2, dim=-1)) + + t_aa_masks = batch["template_all_atom_mask"] + template_mask = t_aa_masks[..., n] * t_aa_masks[..., ca] * t_aa_masks[..., c] + template_mask_2d = template_mask[..., None] * template_mask[..., None, :] + + inv_distance_scalar = inv_distance_scalar * template_mask_2d + unit_vector = rigid_vec * inv_distance_scalar[..., None] + + if not use_unit_vector: + unit_vector = unit_vector * 0.0 + + to_concat.extend(torch.unbind(unit_vector[..., None, :], dim=-1)) + to_concat.append(template_mask_2d[..., None]) + + act = torch.cat(to_concat, dim=-1) + act = act * template_mask_2d[..., None] + + return act + + +def build_extra_msa_feat(batch: dict[str, torch.Tensor]) -> torch.Tensor: + msa_1hot: torch.LongTensor = nn.functional.one_hot(batch["extra_msa"], 23) + msa_feat = [ + msa_1hot, + batch["extra_has_deletion"].unsqueeze(-1), + batch["extra_deletion_value"].unsqueeze(-1), + ] + return torch.cat(msa_feat, dim=-1) + + +def torsion_angles_to_frames( + r: Rigid, + alpha: torch.Tensor, + aatype: torch.Tensor, + rrgdf: torch.Tensor, +) -> Rigid: + # [*, N, 8, 4, 4] + default_4x4 = rrgdf[aatype, ...] + + # [*, N, 8] transformations, i.e. + # One [*, N, 8, 3, 3] rotation matrix and + # One [*, N, 8, 3] translation matrix + default_r = r.from_tensor_4x4(default_4x4) + + bb_rot = alpha.new_zeros((*((1,) * len(alpha.shape[:-1])), 2)) + bb_rot[..., 1] = 1 + + # [*, N, 8, 2] + alpha = torch.cat([bb_rot.expand(*alpha.shape[:-2], -1, -1), alpha], dim=-2) + + # [*, N, 8, 3, 3] + # Produces rotation matrices of the form: + # [ + # [1, 0 , 0 ], + # [0, a_2,-a_1], + # [0, a_1, a_2] + # ] + # This follows the original code rather than the supplement, which uses + # different indices. 
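    # (Editorial gloss, hedged) In this parameterization alpha[..., 0] holds sin(chi) and
    # alpha[..., 1] holds cos(chi), so each entry of all_rots constructed below is a rotation
    # about the group's local x-axis by the corresponding torsion angle.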
+ + all_rots = alpha.new_zeros(default_r.get_rots().get_rot_mats().shape) + all_rots[..., 0, 0] = 1 + all_rots[..., 1, 1] = alpha[..., 1] + all_rots[..., 1, 2] = -alpha[..., 0] + all_rots[..., 2, 1:] = alpha + + all_frames = default_r.compose(Rigid(Rotation(rot_mats=all_rots), None)) + + chi2_frame_to_frame = all_frames[..., 5] + chi3_frame_to_frame = all_frames[..., 6] + chi4_frame_to_frame = all_frames[..., 7] + + chi1_frame_to_bb = all_frames[..., 4] + chi2_frame_to_bb = chi1_frame_to_bb.compose(chi2_frame_to_frame) + chi3_frame_to_bb = chi2_frame_to_bb.compose(chi3_frame_to_frame) + chi4_frame_to_bb = chi3_frame_to_bb.compose(chi4_frame_to_frame) + + all_frames_to_bb = Rigid.cat( + [ + all_frames[..., :5], + chi2_frame_to_bb.unsqueeze(-1), + chi3_frame_to_bb.unsqueeze(-1), + chi4_frame_to_bb.unsqueeze(-1), + ], + dim=-1, + ) + + all_frames_to_global = r[..., None].compose(all_frames_to_bb) + + return all_frames_to_global + + +def frames_and_literature_positions_to_atom14_pos( + r: Rigid, + aatype: torch.Tensor, + default_frames: torch.Tensor, + group_idx: torch.Tensor, + atom_mask: torch.Tensor, + lit_positions: torch.Tensor, +) -> torch.Tensor: + # [*, N, 14] + group_mask = group_idx[aatype, ...] + + # [*, N, 14, 8] + group_mask_one_hot: torch.LongTensor = nn.functional.one_hot( + group_mask, + num_classes=default_frames.shape[-3], + ) + + # [*, N, 14, 8] + t_atoms_to_global = r[..., None, :] * group_mask_one_hot + + # [*, N, 14] + t_atoms_to_global = t_atoms_to_global.map_tensor_fn(lambda x: torch.sum(x, dim=-1)) + + # [*, N, 14, 1] + atom_mask = atom_mask[aatype, ...].unsqueeze(-1) + + # [*, N, 14, 3] + lit_positions = lit_positions[aatype, ...] + pred_positions = t_atoms_to_global.apply(lit_positions) + pred_positions = pred_positions * atom_mask + + return pred_positions diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/loss.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/loss.py new file mode 100644 index 0000000000000000000000000000000000000000..3cd9f8c6b42f3091b05bcb2fb6550c36b40e9943 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/loss.py @@ -0,0 +1,105 @@ +# Copyright 2021 AlQuraishi Laboratory +# Copyright 2021 DeepMind Technologies Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
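The feats.py helpers above convert between the dense 14-atom representation and the padded 37-atom layout. A small hedged sketch of that round trip (the batch shape and the all-alanine aatype are illustrative only):

import torch
from transformers.models.esm.openfold_utils import atom14_to_atom37, make_atom14_masks

protein = {"aatype": torch.zeros(1, 5, dtype=torch.long)}  # five residues, all index 0 (ALA)
protein = make_atom14_masks(protein)                       # adds residx_* index maps and masks

atom14_xyz = torch.randn(1, 5, 14, 3)                      # dense per-residue coordinates
atom37_xyz = atom14_to_atom37(atom14_xyz, protein)         # scattered into the atom37 layout
assert atom37_xyz.shape == (1, 5, 37, 3)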
+ +from typing import Optional + +import torch + + +def _calculate_bin_centers(boundaries: torch.Tensor) -> torch.Tensor: + step = boundaries[1] - boundaries[0] + bin_centers = boundaries + step / 2 + bin_centers = torch.cat([bin_centers, (bin_centers[-1] + step).unsqueeze(-1)], dim=0) + return bin_centers + + +def _calculate_expected_aligned_error( + alignment_confidence_breaks: torch.Tensor, + aligned_distance_error_probs: torch.Tensor, +) -> tuple[torch.Tensor, torch.Tensor]: + bin_centers = _calculate_bin_centers(alignment_confidence_breaks) + return ( + torch.sum(aligned_distance_error_probs * bin_centers, dim=-1), + bin_centers[-1], + ) + + +def compute_predicted_aligned_error( + logits: torch.Tensor, + max_bin: int = 31, + no_bins: int = 64, + **kwargs, +) -> dict[str, torch.Tensor]: + """Computes aligned confidence metrics from logits. + + Args: + logits: [*, num_res, num_res, num_bins] the logits output from + PredictedAlignedErrorHead. + max_bin: Maximum bin value + no_bins: Number of bins + Returns: + aligned_confidence_probs: [*, num_res, num_res, num_bins] the predicted + aligned error probabilities over bins for each residue pair. + predicted_aligned_error: [*, num_res, num_res] the expected aligned distance + error for each pair of residues. + max_predicted_aligned_error: [*] the maximum predicted error possible. + """ + boundaries = torch.linspace(0, max_bin, steps=(no_bins - 1), device=logits.device) + + aligned_confidence_probs = torch.nn.functional.softmax(logits, dim=-1) + predicted_aligned_error, max_predicted_aligned_error = _calculate_expected_aligned_error( + alignment_confidence_breaks=boundaries, + aligned_distance_error_probs=aligned_confidence_probs, + ) + + return { + "aligned_confidence_probs": aligned_confidence_probs, + "predicted_aligned_error": predicted_aligned_error, + "max_predicted_aligned_error": max_predicted_aligned_error, + } + + +def compute_tm( + logits: torch.Tensor, + residue_weights: Optional[torch.Tensor] = None, + max_bin: int = 31, + no_bins: int = 64, + eps: float = 1e-8, + **kwargs, +) -> torch.Tensor: + if residue_weights is None: + residue_weights = logits.new_ones(logits.shape[-2]) + + boundaries = torch.linspace(0, max_bin, steps=(no_bins - 1), device=logits.device) + + bin_centers = _calculate_bin_centers(boundaries) + torch.sum(residue_weights) + n = logits.shape[-2] + clipped_n = max(n, 19) + + d0 = 1.24 * (clipped_n - 15) ** (1.0 / 3) - 1.8 + + probs = torch.nn.functional.softmax(logits, dim=-1) + + tm_per_bin = 1.0 / (1 + (bin_centers**2) / (d0**2)) + predicted_tm_term = torch.sum(probs * tm_per_bin, dim=-1) + + normed_residue_mask = residue_weights / (eps + residue_weights.sum()) + per_alignment = torch.sum(predicted_tm_term * normed_residue_mask, dim=-1) + + weighted = per_alignment * residue_weights + + argmax = (weighted == torch.max(weighted)).nonzero()[0] + return per_alignment[tuple(argmax)] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/protein.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/protein.py new file mode 100644 index 0000000000000000000000000000000000000000..e9701ca07114fb6cea95ad331d062243656bd35f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/protein.py @@ -0,0 +1,331 @@ +# Copyright 2021 AlQuraishi Laboratory +# Copyright 2021 DeepMind Technologies Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except 
in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Protein data type.""" + +import dataclasses +import re +import string +from collections.abc import Iterator, Mapping, Sequence +from typing import Any, Optional + +import numpy as np + +from . import residue_constants + + +FeatureDict = Mapping[str, np.ndarray] +ModelOutput = Mapping[str, Any] # Is a nested dict. +PICO_TO_ANGSTROM = 0.01 + + +@dataclasses.dataclass(frozen=True) +class Protein: + """Protein structure representation.""" + + # Cartesian coordinates of atoms in angstroms. The atom types correspond to + # residue_constants.atom_types, i.e. the first three are N, CA, CB. + atom_positions: np.ndarray # [num_res, num_atom_type, 3] + + # Amino-acid type for each residue represented as an integer between 0 and + # 20, where 20 is 'X'. + aatype: np.ndarray # [num_res] + + # Binary float mask to indicate presence of a particular atom. 1.0 if an atom + # is present and 0.0 if not. This should be used for loss masking. + atom_mask: np.ndarray # [num_res, num_atom_type] + + # Residue index as used in PDB. It is not necessarily continuous or 0-indexed. + residue_index: np.ndarray # [num_res] + + # B-factors, or temperature factors, of each residue (in sq. angstroms units), + # representing the displacement of the residue from its ground truth mean + # value. + b_factors: np.ndarray # [num_res, num_atom_type] + + # Chain indices for multi-chain predictions + chain_index: Optional[np.ndarray] = None + + # Optional remark about the protein. 
Included as a comment in output PDB + # files + remark: Optional[str] = None + + # Templates used to generate this protein (prediction-only) + parents: Optional[Sequence[str]] = None + + # Chain corresponding to each parent + parents_chain_index: Optional[Sequence[int]] = None + + +def from_proteinnet_string(proteinnet_str: str) -> Protein: + tag_re = r"(\[[A-Z]+\]\n)" + tags: list[str] = [tag.strip() for tag in re.split(tag_re, proteinnet_str) if len(tag) > 0] + groups: Iterator[tuple[str, list[str]]] = zip(tags[0::2], [l.split("\n") for l in tags[1::2]]) + + atoms: list[str] = ["N", "CA", "C"] + aatype = None + atom_positions = None + atom_mask = None + for g in groups: + if "[PRIMARY]" == g[0]: + seq = g[1][0].strip() + for i in range(len(seq)): + if seq[i] not in residue_constants.restypes: + seq[i] = "X" # FIXME: strings are immutable + aatype = np.array( + [residue_constants.restype_order.get(res_symbol, residue_constants.restype_num) for res_symbol in seq] + ) + elif "[TERTIARY]" == g[0]: + tertiary: list[list[float]] = [] + for axis in range(3): + tertiary.append(list(map(float, g[1][axis].split()))) + tertiary_np = np.array(tertiary) + atom_positions = np.zeros((len(tertiary[0]) // 3, residue_constants.atom_type_num, 3)).astype(np.float32) + for i, atom in enumerate(atoms): + atom_positions[:, residue_constants.atom_order[atom], :] = np.transpose(tertiary_np[:, i::3]) + atom_positions *= PICO_TO_ANGSTROM + elif "[MASK]" == g[0]: + mask = np.array(list(map({"-": 0, "+": 1}.get, g[1][0].strip()))) + atom_mask = np.zeros( + ( + len(mask), + residue_constants.atom_type_num, + ) + ).astype(np.float32) + for i, atom in enumerate(atoms): + atom_mask[:, residue_constants.atom_order[atom]] = 1 + atom_mask *= mask[..., None] + + assert aatype is not None + + return Protein( + atom_positions=atom_positions, + atom_mask=atom_mask, + aatype=aatype, + residue_index=np.arange(len(aatype)), + b_factors=None, + ) + + +def get_pdb_headers(prot: Protein, chain_id: int = 0) -> list[str]: + pdb_headers: list[str] = [] + + remark = prot.remark + if remark is not None: + pdb_headers.append(f"REMARK {remark}") + + parents = prot.parents + parents_chain_index = prot.parents_chain_index + if parents is not None and parents_chain_index is not None: + parents = [p for i, p in zip(parents_chain_index, parents) if i == chain_id] + + if parents is None or len(parents) == 0: + parents = ["N/A"] + + pdb_headers.append(f"PARENT {' '.join(parents)}") + + return pdb_headers + + +def add_pdb_headers(prot: Protein, pdb_str: str) -> str: + """Add pdb headers to an existing PDB string. 
Useful during multi-chain + recycling + """ + out_pdb_lines: list[str] = [] + lines = pdb_str.split("\n") + + remark = prot.remark + if remark is not None: + out_pdb_lines.append(f"REMARK {remark}") + + parents_per_chain: list[list[str]] + if prot.parents is not None and len(prot.parents) > 0: + parents_per_chain = [] + if prot.parents_chain_index is not None: + parent_dict: dict[str, list[str]] = {} + for p, i in zip(prot.parents, prot.parents_chain_index): + parent_dict.setdefault(str(i), []) + parent_dict[str(i)].append(p) + + max_idx = max(int(chain_idx) for chain_idx in parent_dict) + for i in range(max_idx + 1): + chain_parents = parent_dict.get(str(i), ["N/A"]) + parents_per_chain.append(chain_parents) + else: + parents_per_chain.append(list(prot.parents)) + else: + parents_per_chain = [["N/A"]] + + def make_parent_line(p: Sequence[str]) -> str: + return f"PARENT {' '.join(p)}" + + out_pdb_lines.append(make_parent_line(parents_per_chain[0])) + + chain_counter = 0 + for i, l in enumerate(lines): + if "PARENT" not in l and "REMARK" not in l: + out_pdb_lines.append(l) + if "TER" in l and "END" not in lines[i + 1]: + chain_counter += 1 + if not chain_counter >= len(parents_per_chain): + chain_parents = parents_per_chain[chain_counter] + else: + chain_parents = ["N/A"] + + out_pdb_lines.append(make_parent_line(chain_parents)) + + return "\n".join(out_pdb_lines) + + +def to_pdb(prot: Protein) -> str: + """Converts a `Protein` instance to a PDB string. + + Args: + prot: The protein to convert to PDB. + + Returns: + PDB string. + """ + restypes = residue_constants.restypes + ["X"] + + def res_1to3(r: int) -> str: + return residue_constants.restype_1to3.get(restypes[r], "UNK") + + atom_types = residue_constants.atom_types + + pdb_lines: list[str] = [] + + atom_mask = prot.atom_mask + aatype = prot.aatype + atom_positions = prot.atom_positions + residue_index = prot.residue_index.astype(np.int32) + b_factors = prot.b_factors + chain_index = prot.chain_index + + if np.any(aatype > residue_constants.restype_num): + raise ValueError("Invalid aatypes.") + + headers = get_pdb_headers(prot) + if len(headers) > 0: + pdb_lines.extend(headers) + + n = aatype.shape[0] + atom_index = 1 + prev_chain_index = 0 + chain_tags = string.ascii_uppercase + chain_tag = None + # Add all atom sites. + for i in range(n): + res_name_3 = res_1to3(aatype[i]) + for atom_name, pos, mask, b_factor in zip(atom_types, atom_positions[i], atom_mask[i], b_factors[i]): + if mask < 0.5: + continue + + record_type = "ATOM" + name = atom_name if len(atom_name) == 4 else f" {atom_name}" + alt_loc = "" + insertion_code = "" + occupancy = 1.00 + element = atom_name[0] # Protein supports only C, N, O, S, this works. + charge = "" + + chain_tag = "A" + if chain_index is not None: + chain_tag = chain_tags[chain_index[i]] + + # PDB is a columnar format, every space matters here! + atom_line = ( + f"{record_type:<6}{atom_index:>5} {name:<4}{alt_loc:>1}" + f"{res_name_3:>3} {chain_tag:>1}" + f"{residue_index[i]:>4}{insertion_code:>1} " + f"{pos[0]:>8.3f}{pos[1]:>8.3f}{pos[2]:>8.3f}" + f"{occupancy:>6.2f}{b_factor:>6.2f} " + f"{element:>2}{charge:>2}" + ) + pdb_lines.append(atom_line) + atom_index += 1 + + should_terminate = i == n - 1 + if chain_index is not None: + if i != n - 1 and chain_index[i + 1] != prev_chain_index: + should_terminate = True + prev_chain_index = chain_index[i + 1] + + if should_terminate: + # Close the chain. 
+ chain_end = "TER" + chain_termination_line = ( + f"{chain_end:<6}{atom_index:>5} {res_1to3(aatype[i]):>3} {chain_tag:>1}{residue_index[i]:>4}" + ) + pdb_lines.append(chain_termination_line) + atom_index += 1 + + if i != n - 1: + # "prev" is a misnomer here. This happens at the beginning of + # each new chain. + pdb_lines.extend(get_pdb_headers(prot, prev_chain_index)) + + pdb_lines.append("END") + pdb_lines.append("") + return "\n".join(pdb_lines) + + +def ideal_atom_mask(prot: Protein) -> np.ndarray: + """Computes an ideal atom mask. + + `Protein.atom_mask` typically is defined according to the atoms that are reported in the PDB. This function + computes a mask according to heavy atoms that should be present in the given sequence of amino acids. + + Args: + prot: `Protein` whose fields are `numpy.ndarray` objects. + + Returns: + An ideal atom mask. + """ + return residue_constants.STANDARD_ATOM_MASK[prot.aatype] + + +def from_prediction( + features: FeatureDict, + result: ModelOutput, + b_factors: Optional[np.ndarray] = None, + chain_index: Optional[np.ndarray] = None, + remark: Optional[str] = None, + parents: Optional[Sequence[str]] = None, + parents_chain_index: Optional[Sequence[int]] = None, +) -> Protein: + """Assembles a protein from a prediction. + + Args: + features: Dictionary holding model inputs. + result: Dictionary holding model outputs. + b_factors: (Optional) B-factors to use for the protein. + chain_index: (Optional) Chain indices for multi-chain predictions + remark: (Optional) Remark about the prediction + parents: (Optional) List of template names + Returns: + A protein instance. + """ + return Protein( + aatype=features["aatype"], + atom_positions=result["final_atom_positions"], + atom_mask=result["final_atom_mask"], + residue_index=features["residue_index"] + 1, + b_factors=b_factors if b_factors is not None else np.zeros_like(result["final_atom_mask"]), + chain_index=chain_index, + remark=remark, + parents=parents, + parents_chain_index=parents_chain_index, + ) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/residue_constants.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/residue_constants.py new file mode 100644 index 0000000000000000000000000000000000000000..e92e65d29bfb2e7a0ad640e71d983c1c467d3a54 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/residue_constants.py @@ -0,0 +1,979 @@ +# Copyright 2021 AlQuraishi Laboratory +# Copyright 2021 DeepMind Technologies Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Constants used in AlphaFold.""" + +import collections +import copy +import functools +from collections.abc import Mapping, Sequence +from importlib import resources + +import numpy as np + + +# Internal import (35fd). + + +# Distance from one CA to next CA [trans configuration: omega = 180]. 
+ca_ca = 3.80209737096 + +# Format: The list for each AA type contains chi1, chi2, chi3, chi4 in +# this order (or a relevant subset from chi1 onwards). ALA and GLY don't have +# chi angles so their chi angle lists are empty. +chi_angles_atoms: dict[str, list[list[str]]] = { + "ALA": [], + # Chi5 in arginine is always 0 +- 5 degrees, so ignore it. + "ARG": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "CD"], ["CB", "CG", "CD", "NE"], ["CG", "CD", "NE", "CZ"]], + "ASN": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "OD1"]], + "ASP": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "OD1"]], + "CYS": [["N", "CA", "CB", "SG"]], + "GLN": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "CD"], ["CB", "CG", "CD", "OE1"]], + "GLU": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "CD"], ["CB", "CG", "CD", "OE1"]], + "GLY": [], + "HIS": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "ND1"]], + "ILE": [["N", "CA", "CB", "CG1"], ["CA", "CB", "CG1", "CD1"]], + "LEU": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "CD1"]], + "LYS": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "CD"], ["CB", "CG", "CD", "CE"], ["CG", "CD", "CE", "NZ"]], + "MET": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "SD"], ["CB", "CG", "SD", "CE"]], + "PHE": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "CD1"]], + "PRO": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "CD"]], + "SER": [["N", "CA", "CB", "OG"]], + "THR": [["N", "CA", "CB", "OG1"]], + "TRP": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "CD1"]], + "TYR": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "CD1"]], + "VAL": [["N", "CA", "CB", "CG1"]], +} + +# If chi angles given in fixed-length array, this matrix determines how to mask +# them for each AA type. The order is as per restype_order (see below). +chi_angles_mask: list[list[float]] = [ + [0.0, 0.0, 0.0, 0.0], # ALA + [1.0, 1.0, 1.0, 1.0], # ARG + [1.0, 1.0, 0.0, 0.0], # ASN + [1.0, 1.0, 0.0, 0.0], # ASP + [1.0, 0.0, 0.0, 0.0], # CYS + [1.0, 1.0, 1.0, 0.0], # GLN + [1.0, 1.0, 1.0, 0.0], # GLU + [0.0, 0.0, 0.0, 0.0], # GLY + [1.0, 1.0, 0.0, 0.0], # HIS + [1.0, 1.0, 0.0, 0.0], # ILE + [1.0, 1.0, 0.0, 0.0], # LEU + [1.0, 1.0, 1.0, 1.0], # LYS + [1.0, 1.0, 1.0, 0.0], # MET + [1.0, 1.0, 0.0, 0.0], # PHE + [1.0, 1.0, 0.0, 0.0], # PRO + [1.0, 0.0, 0.0, 0.0], # SER + [1.0, 0.0, 0.0, 0.0], # THR + [1.0, 1.0, 0.0, 0.0], # TRP + [1.0, 1.0, 0.0, 0.0], # TYR + [1.0, 0.0, 0.0, 0.0], # VAL +] + +# The following chi angles are pi periodic: they can be rotated by a multiple +# of pi without affecting the structure. +chi_pi_periodic: list[list[float]] = [ + [0.0, 0.0, 0.0, 0.0], # ALA + [0.0, 0.0, 0.0, 0.0], # ARG + [0.0, 0.0, 0.0, 0.0], # ASN + [0.0, 1.0, 0.0, 0.0], # ASP + [0.0, 0.0, 0.0, 0.0], # CYS + [0.0, 0.0, 0.0, 0.0], # GLN + [0.0, 0.0, 1.0, 0.0], # GLU + [0.0, 0.0, 0.0, 0.0], # GLY + [0.0, 0.0, 0.0, 0.0], # HIS + [0.0, 0.0, 0.0, 0.0], # ILE + [0.0, 0.0, 0.0, 0.0], # LEU + [0.0, 0.0, 0.0, 0.0], # LYS + [0.0, 0.0, 0.0, 0.0], # MET + [0.0, 1.0, 0.0, 0.0], # PHE + [0.0, 0.0, 0.0, 0.0], # PRO + [0.0, 0.0, 0.0, 0.0], # SER + [0.0, 0.0, 0.0, 0.0], # THR + [0.0, 0.0, 0.0, 0.0], # TRP + [0.0, 1.0, 0.0, 0.0], # TYR + [0.0, 0.0, 0.0, 0.0], # VAL + [0.0, 0.0, 0.0, 0.0], # UNK +] + +# Atoms positions relative to the 8 rigid groups, defined by the pre-omega, phi, +# psi and chi angles: +# 0: 'backbone group', +# 1: 'pre-omega-group', (empty) +# 2: 'phi-group', (currently empty, because it defines only hydrogens) +# 3: 'psi-group', +# 4,5,6,7: 'chi1,2,3,4-group' +# The atom positions are relative to the axis-end-atom of the corresponding +# rotation axis. 
The x-axis is in direction of the rotation axis, and the y-axis +# is defined such that the dihedral-angle-definiting atom (the last entry in +# chi_angles_atoms above) is in the xy-plane (with a positive y-coordinate). +# format: [atomname, group_idx, rel_position] +rigid_group_atom_positions: dict[str, list[tuple[str, int, tuple[float, float, float]]]] = { + "ALA": [ + ("N", 0, (-0.525, 1.363, 0.000)), + ("CA", 0, (0.000, 0.000, 0.000)), + ("C", 0, (1.526, -0.000, -0.000)), + ("CB", 0, (-0.529, -0.774, -1.205)), + ("O", 3, (0.627, 1.062, 0.000)), + ], + "ARG": [ + ("N", 0, (-0.524, 1.362, -0.000)), + ("CA", 0, (0.000, 0.000, 0.000)), + ("C", 0, (1.525, -0.000, -0.000)), + ("CB", 0, (-0.524, -0.778, -1.209)), + ("O", 3, (0.626, 1.062, 0.000)), + ("CG", 4, (0.616, 1.390, -0.000)), + ("CD", 5, (0.564, 1.414, 0.000)), + ("NE", 6, (0.539, 1.357, -0.000)), + ("NH1", 7, (0.206, 2.301, 0.000)), + ("NH2", 7, (2.078, 0.978, -0.000)), + ("CZ", 7, (0.758, 1.093, -0.000)), + ], + "ASN": [ + ("N", 0, (-0.536, 1.357, 0.000)), + ("CA", 0, (0.000, 0.000, 0.000)), + ("C", 0, (1.526, -0.000, -0.000)), + ("CB", 0, (-0.531, -0.787, -1.200)), + ("O", 3, (0.625, 1.062, 0.000)), + ("CG", 4, (0.584, 1.399, 0.000)), + ("ND2", 5, (0.593, -1.188, 0.001)), + ("OD1", 5, (0.633, 1.059, 0.000)), + ], + "ASP": [ + ("N", 0, (-0.525, 1.362, -0.000)), + ("CA", 0, (0.000, 0.000, 0.000)), + ("C", 0, (1.527, 0.000, -0.000)), + ("CB", 0, (-0.526, -0.778, -1.208)), + ("O", 3, (0.626, 1.062, -0.000)), + ("CG", 4, (0.593, 1.398, -0.000)), + ("OD1", 5, (0.610, 1.091, 0.000)), + ("OD2", 5, (0.592, -1.101, -0.003)), + ], + "CYS": [ + ("N", 0, (-0.522, 1.362, -0.000)), + ("CA", 0, (0.000, 0.000, 0.000)), + ("C", 0, (1.524, 0.000, 0.000)), + ("CB", 0, (-0.519, -0.773, -1.212)), + ("O", 3, (0.625, 1.062, -0.000)), + ("SG", 4, (0.728, 1.653, 0.000)), + ], + "GLN": [ + ("N", 0, (-0.526, 1.361, -0.000)), + ("CA", 0, (0.000, 0.000, 0.000)), + ("C", 0, (1.526, 0.000, 0.000)), + ("CB", 0, (-0.525, -0.779, -1.207)), + ("O", 3, (0.626, 1.062, -0.000)), + ("CG", 4, (0.615, 1.393, 0.000)), + ("CD", 5, (0.587, 1.399, -0.000)), + ("NE2", 6, (0.593, -1.189, -0.001)), + ("OE1", 6, (0.634, 1.060, 0.000)), + ], + "GLU": [ + ("N", 0, (-0.528, 1.361, 0.000)), + ("CA", 0, (0.000, 0.000, 0.000)), + ("C", 0, (1.526, -0.000, -0.000)), + ("CB", 0, (-0.526, -0.781, -1.207)), + ("O", 3, (0.626, 1.062, 0.000)), + ("CG", 4, (0.615, 1.392, 0.000)), + ("CD", 5, (0.600, 1.397, 0.000)), + ("OE1", 6, (0.607, 1.095, -0.000)), + ("OE2", 6, (0.589, -1.104, -0.001)), + ], + "GLY": [ + ("N", 0, (-0.572, 1.337, 0.000)), + ("CA", 0, (0.000, 0.000, 0.000)), + ("C", 0, (1.517, -0.000, -0.000)), + ("O", 3, (0.626, 1.062, -0.000)), + ], + "HIS": [ + ("N", 0, (-0.527, 1.360, 0.000)), + ("CA", 0, (0.000, 0.000, 0.000)), + ("C", 0, (1.525, 0.000, 0.000)), + ("CB", 0, (-0.525, -0.778, -1.208)), + ("O", 3, (0.625, 1.063, 0.000)), + ("CG", 4, (0.600, 1.370, -0.000)), + ("CD2", 5, (0.889, -1.021, 0.003)), + ("ND1", 5, (0.744, 1.160, -0.000)), + ("CE1", 5, (2.030, 0.851, 0.002)), + ("NE2", 5, (2.145, -0.466, 0.004)), + ], + "ILE": [ + ("N", 0, (-0.493, 1.373, -0.000)), + ("CA", 0, (0.000, 0.000, 0.000)), + ("C", 0, (1.527, -0.000, -0.000)), + ("CB", 0, (-0.536, -0.793, -1.213)), + ("O", 3, (0.627, 1.062, -0.000)), + ("CG1", 4, (0.534, 1.437, -0.000)), + ("CG2", 4, (0.540, -0.785, -1.199)), + ("CD1", 5, (0.619, 1.391, 0.000)), + ], + "LEU": [ + ("N", 0, (-0.520, 1.363, 0.000)), + ("CA", 0, (0.000, 0.000, 0.000)), + ("C", 0, (1.525, -0.000, -0.000)), + ("CB", 0, (-0.522, -0.773, 
-1.214)), + ("O", 3, (0.625, 1.063, -0.000)), + ("CG", 4, (0.678, 1.371, 0.000)), + ("CD1", 5, (0.530, 1.430, -0.000)), + ("CD2", 5, (0.535, -0.774, 1.200)), + ], + "LYS": [ + ("N", 0, (-0.526, 1.362, -0.000)), + ("CA", 0, (0.000, 0.000, 0.000)), + ("C", 0, (1.526, 0.000, 0.000)), + ("CB", 0, (-0.524, -0.778, -1.208)), + ("O", 3, (0.626, 1.062, -0.000)), + ("CG", 4, (0.619, 1.390, 0.000)), + ("CD", 5, (0.559, 1.417, 0.000)), + ("CE", 6, (0.560, 1.416, 0.000)), + ("NZ", 7, (0.554, 1.387, 0.000)), + ], + "MET": [ + ("N", 0, (-0.521, 1.364, -0.000)), + ("CA", 0, (0.000, 0.000, 0.000)), + ("C", 0, (1.525, 0.000, 0.000)), + ("CB", 0, (-0.523, -0.776, -1.210)), + ("O", 3, (0.625, 1.062, -0.000)), + ("CG", 4, (0.613, 1.391, -0.000)), + ("SD", 5, (0.703, 1.695, 0.000)), + ("CE", 6, (0.320, 1.786, -0.000)), + ], + "PHE": [ + ("N", 0, (-0.518, 1.363, 0.000)), + ("CA", 0, (0.000, 0.000, 0.000)), + ("C", 0, (1.524, 0.000, -0.000)), + ("CB", 0, (-0.525, -0.776, -1.212)), + ("O", 3, (0.626, 1.062, -0.000)), + ("CG", 4, (0.607, 1.377, 0.000)), + ("CD1", 5, (0.709, 1.195, -0.000)), + ("CD2", 5, (0.706, -1.196, 0.000)), + ("CE1", 5, (2.102, 1.198, -0.000)), + ("CE2", 5, (2.098, -1.201, -0.000)), + ("CZ", 5, (2.794, -0.003, -0.001)), + ], + "PRO": [ + ("N", 0, (-0.566, 1.351, -0.000)), + ("CA", 0, (0.000, 0.000, 0.000)), + ("C", 0, (1.527, -0.000, 0.000)), + ("CB", 0, (-0.546, -0.611, -1.293)), + ("O", 3, (0.621, 1.066, 0.000)), + ("CG", 4, (0.382, 1.445, 0.0)), + # ('CD', 5, (0.427, 1.440, 0.0)), + ("CD", 5, (0.477, 1.424, 0.0)), # manually made angle 2 degrees larger + ], + "SER": [ + ("N", 0, (-0.529, 1.360, -0.000)), + ("CA", 0, (0.000, 0.000, 0.000)), + ("C", 0, (1.525, -0.000, -0.000)), + ("CB", 0, (-0.518, -0.777, -1.211)), + ("O", 3, (0.626, 1.062, -0.000)), + ("OG", 4, (0.503, 1.325, 0.000)), + ], + "THR": [ + ("N", 0, (-0.517, 1.364, 0.000)), + ("CA", 0, (0.000, 0.000, 0.000)), + ("C", 0, (1.526, 0.000, -0.000)), + ("CB", 0, (-0.516, -0.793, -1.215)), + ("O", 3, (0.626, 1.062, 0.000)), + ("CG2", 4, (0.550, -0.718, -1.228)), + ("OG1", 4, (0.472, 1.353, 0.000)), + ], + "TRP": [ + ("N", 0, (-0.521, 1.363, 0.000)), + ("CA", 0, (0.000, 0.000, 0.000)), + ("C", 0, (1.525, -0.000, 0.000)), + ("CB", 0, (-0.523, -0.776, -1.212)), + ("O", 3, (0.627, 1.062, 0.000)), + ("CG", 4, (0.609, 1.370, -0.000)), + ("CD1", 5, (0.824, 1.091, 0.000)), + ("CD2", 5, (0.854, -1.148, -0.005)), + ("CE2", 5, (2.186, -0.678, -0.007)), + ("CE3", 5, (0.622, -2.530, -0.007)), + ("NE1", 5, (2.140, 0.690, -0.004)), + ("CH2", 5, (3.028, -2.890, -0.013)), + ("CZ2", 5, (3.283, -1.543, -0.011)), + ("CZ3", 5, (1.715, -3.389, -0.011)), + ], + "TYR": [ + ("N", 0, (-0.522, 1.362, 0.000)), + ("CA", 0, (0.000, 0.000, 0.000)), + ("C", 0, (1.524, -0.000, -0.000)), + ("CB", 0, (-0.522, -0.776, -1.213)), + ("O", 3, (0.627, 1.062, -0.000)), + ("CG", 4, (0.607, 1.382, -0.000)), + ("CD1", 5, (0.716, 1.195, -0.000)), + ("CD2", 5, (0.713, -1.194, -0.001)), + ("CE1", 5, (2.107, 1.200, -0.002)), + ("CE2", 5, (2.104, -1.201, -0.003)), + ("OH", 5, (4.168, -0.002, -0.005)), + ("CZ", 5, (2.791, -0.001, -0.003)), + ], + "VAL": [ + ("N", 0, (-0.494, 1.373, -0.000)), + ("CA", 0, (0.000, 0.000, 0.000)), + ("C", 0, (1.527, -0.000, -0.000)), + ("CB", 0, (-0.533, -0.795, -1.213)), + ("O", 3, (0.627, 1.062, -0.000)), + ("CG1", 4, (0.540, 1.429, -0.000)), + ("CG2", 4, (0.533, -0.776, 1.203)), + ], +} + +# A list of atoms (excluding hydrogen) for each AA type. PDB naming convention. 
+residue_atoms: dict[str, list[str]] = { + "ALA": ["C", "CA", "CB", "N", "O"], + "ARG": ["C", "CA", "CB", "CG", "CD", "CZ", "N", "NE", "O", "NH1", "NH2"], + "ASP": ["C", "CA", "CB", "CG", "N", "O", "OD1", "OD2"], + "ASN": ["C", "CA", "CB", "CG", "N", "ND2", "O", "OD1"], + "CYS": ["C", "CA", "CB", "N", "O", "SG"], + "GLU": ["C", "CA", "CB", "CG", "CD", "N", "O", "OE1", "OE2"], + "GLN": ["C", "CA", "CB", "CG", "CD", "N", "NE2", "O", "OE1"], + "GLY": ["C", "CA", "N", "O"], + "HIS": ["C", "CA", "CB", "CG", "CD2", "CE1", "N", "ND1", "NE2", "O"], + "ILE": ["C", "CA", "CB", "CG1", "CG2", "CD1", "N", "O"], + "LEU": ["C", "CA", "CB", "CG", "CD1", "CD2", "N", "O"], + "LYS": ["C", "CA", "CB", "CG", "CD", "CE", "N", "NZ", "O"], + "MET": ["C", "CA", "CB", "CG", "CE", "N", "O", "SD"], + "PHE": ["C", "CA", "CB", "CG", "CD1", "CD2", "CE1", "CE2", "CZ", "N", "O"], + "PRO": ["C", "CA", "CB", "CG", "CD", "N", "O"], + "SER": ["C", "CA", "CB", "N", "O", "OG"], + "THR": ["C", "CA", "CB", "CG2", "N", "O", "OG1"], + "TRP": ["C", "CA", "CB", "CG", "CD1", "CD2", "CE2", "CE3", "CZ2", "CZ3", "CH2", "N", "NE1", "O"], + "TYR": ["C", "CA", "CB", "CG", "CD1", "CD2", "CE1", "CE2", "CZ", "N", "O", "OH"], + "VAL": ["C", "CA", "CB", "CG1", "CG2", "N", "O"], +} + +# Naming swaps for ambiguous atom names. +# Due to symmetries in the amino acids the naming of atoms is ambiguous in +# 4 of the 20 amino acids. +# (The LDDT paper lists 7 amino acids as ambiguous, but the naming ambiguities +# in LEU, VAL and ARG can be resolved by using the 3d constellations of +# the 'ambiguous' atoms and their neighbours) +# TODO: ^ interpret this +residue_atom_renaming_swaps: dict[str, dict[str, str]] = { + "ASP": {"OD1": "OD2"}, + "GLU": {"OE1": "OE2"}, + "PHE": {"CD1": "CD2", "CE1": "CE2"}, + "TYR": {"CD1": "CD2", "CE1": "CE2"}, +} + +# Van der Waals radii [Angstroem] of the atoms (from Wikipedia) +van_der_waals_radius: dict[str, float] = { + "C": 1.7, + "N": 1.55, + "O": 1.52, + "S": 1.8, +} + +Bond = collections.namedtuple("Bond", ["atom1_name", "atom2_name", "length", "stddev"]) +BondAngle = collections.namedtuple( + "BondAngle", + ["atom1_name", "atom2_name", "atom3name", "angle_rad", "stddev"], +) + + +def map_structure_with_atom_order(in_list: list, first_call: bool = True) -> list: + # Maps strings in a nested list structure to their corresponding index in atom_order + if first_call: + in_list = copy.deepcopy(in_list) + for i in range(len(in_list)): + if isinstance(in_list[i], list): + in_list[i] = map_structure_with_atom_order(in_list[i], first_call=False) + elif isinstance(in_list[i], str): + in_list[i] = atom_order[in_list[i]] + else: + raise TypeError("Unexpected type when mapping nested lists!") + return in_list + + +@functools.cache +def load_stereo_chemical_props() -> tuple[ + Mapping[str, list[Bond]], + Mapping[str, list[Bond]], + Mapping[str, list[BondAngle]], +]: + """Load stereo_chemical_props.txt into a nice structure. + + Load literature values for bond lengths and bond angles and translate bond angles into the length of the opposite + edge of the triangle ("residue_virtual_bonds"). 
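+
+ The translation uses the law of cosines: for a bond angle gamma between bonds atom1-atom2 (length a) and
+ atom2-atom3 (length b), the virtual bond atom1-atom3 has length c = sqrt(a**2 + b**2 - 2*a*b*cos(gamma)).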
+ + Returns: + residue_bonds: dict that maps resname --> list of Bond tuples residue_virtual_bonds: dict that maps resname --> + list of Bond tuples residue_bond_angles: dict that maps resname --> list of BondAngle tuples + """ + # TODO: this file should be downloaded in a setup script + stereo_chemical_props = resources.read_text("openfold.resources", "stereo_chemical_props.txt") + + lines_iter = iter(stereo_chemical_props.splitlines()) + # Load bond lengths. + residue_bonds: dict[str, list[Bond]] = {} + next(lines_iter) # Skip header line. + for line in lines_iter: + if line.strip() == "-": + break + bond, resname, bond_length, stddev = line.split() + atom1, atom2 = bond.split("-") + if resname not in residue_bonds: + residue_bonds[resname] = [] + residue_bonds[resname].append(Bond(atom1, atom2, float(bond_length), float(stddev))) + residue_bonds["UNK"] = [] + + # Load bond angles. + residue_bond_angles: dict[str, list[BondAngle]] = {} + next(lines_iter) # Skip empty line. + next(lines_iter) # Skip header line. + for line in lines_iter: + if line.strip() == "-": + break + bond, resname, angle_degree, stddev_degree = line.split() + atom1, atom2, atom3 = bond.split("-") + if resname not in residue_bond_angles: + residue_bond_angles[resname] = [] + residue_bond_angles[resname].append( + BondAngle( + atom1, + atom2, + atom3, + float(angle_degree) / 180.0 * np.pi, + float(stddev_degree) / 180.0 * np.pi, + ) + ) + residue_bond_angles["UNK"] = [] + + def make_bond_key(atom1_name: str, atom2_name: str) -> str: + """Unique key to lookup bonds.""" + return "-".join(sorted([atom1_name, atom2_name])) + + # Translate bond angles into distances ("virtual bonds"). + residue_virtual_bonds: dict[str, list[Bond]] = {} + for resname, bond_angles in residue_bond_angles.items(): + # Create a fast lookup dict for bond lengths. + bond_cache: dict[str, Bond] = {} + for b in residue_bonds[resname]: + bond_cache[make_bond_key(b.atom1_name, b.atom2_name)] = b + residue_virtual_bonds[resname] = [] + for ba in bond_angles: + bond1 = bond_cache[make_bond_key(ba.atom1_name, ba.atom2_name)] + bond2 = bond_cache[make_bond_key(ba.atom2_name, ba.atom3name)] + + # Compute distance between atom1 and atom3 using the law of cosines + # c^2 = a^2 + b^2 - 2ab*cos(gamma). + gamma = ba.angle_rad + length = np.sqrt(bond1.length**2 + bond2.length**2 - 2 * bond1.length * bond2.length * np.cos(gamma)) + + # Propagation of uncertainty assuming uncorrelated errors. + dl_outer = 0.5 / length + dl_dgamma = (2 * bond1.length * bond2.length * np.sin(gamma)) * dl_outer + dl_db1 = (2 * bond1.length - 2 * bond2.length * np.cos(gamma)) * dl_outer + dl_db2 = (2 * bond2.length - 2 * bond1.length * np.cos(gamma)) * dl_outer + stddev = np.sqrt( + (dl_dgamma * ba.stddev) ** 2 + (dl_db1 * bond1.stddev) ** 2 + (dl_db2 * bond2.stddev) ** 2 + ) + residue_virtual_bonds[resname].append(Bond(ba.atom1_name, ba.atom3name, length, stddev)) + + return (residue_bonds, residue_virtual_bonds, residue_bond_angles) + + +# Between-residue bond lengths for general bonds (first element) and for Proline +# (second element). +between_res_bond_length_c_n: tuple[float, float] = (1.329, 1.341) +between_res_bond_length_stddev_c_n: tuple[float, float] = (0.014, 0.016) + +# Between-residue cos_angles. 
+between_res_cos_angles_c_n_ca: tuple[float, float] = (-0.5203, 0.0353) # degrees: 121.352 +- 2.315 +between_res_cos_angles_ca_c_n: tuple[float, float] = (-0.4473, 0.0311) # degrees: 116.568 +- 1.995 + +# This mapping is used when we need to store atom data in a format that requires +# fixed atom data size for every residue (e.g. a numpy array). +atom_types: list[str] = [ + "N", + "CA", + "C", + "CB", + "O", + "CG", + "CG1", + "CG2", + "OG", + "OG1", + "SG", + "CD", + "CD1", + "CD2", + "ND1", + "ND2", + "OD1", + "OD2", + "SD", + "CE", + "CE1", + "CE2", + "CE3", + "NE", + "NE1", + "NE2", + "OE1", + "OE2", + "CH2", + "NH1", + "NH2", + "OH", + "CZ", + "CZ2", + "CZ3", + "NZ", + "OXT", +] +atom_order: dict[str, int] = {atom_type: i for i, atom_type in enumerate(atom_types)} +atom_type_num = len(atom_types) # := 37. + +# A compact atom encoding with 14 columns +# pylint: disable=line-too-long +restype_name_to_atom14_names: dict[str, list[str]] = { + "ALA": ["N", "CA", "C", "O", "CB", "", "", "", "", "", "", "", "", ""], + "ARG": ["N", "CA", "C", "O", "CB", "CG", "CD", "NE", "CZ", "NH1", "NH2", "", "", ""], + "ASN": ["N", "CA", "C", "O", "CB", "CG", "OD1", "ND2", "", "", "", "", "", ""], + "ASP": ["N", "CA", "C", "O", "CB", "CG", "OD1", "OD2", "", "", "", "", "", ""], + "CYS": ["N", "CA", "C", "O", "CB", "SG", "", "", "", "", "", "", "", ""], + "GLN": ["N", "CA", "C", "O", "CB", "CG", "CD", "OE1", "NE2", "", "", "", "", ""], + "GLU": ["N", "CA", "C", "O", "CB", "CG", "CD", "OE1", "OE2", "", "", "", "", ""], + "GLY": ["N", "CA", "C", "O", "", "", "", "", "", "", "", "", "", ""], + "HIS": ["N", "CA", "C", "O", "CB", "CG", "ND1", "CD2", "CE1", "NE2", "", "", "", ""], + "ILE": ["N", "CA", "C", "O", "CB", "CG1", "CG2", "CD1", "", "", "", "", "", ""], + "LEU": ["N", "CA", "C", "O", "CB", "CG", "CD1", "CD2", "", "", "", "", "", ""], + "LYS": ["N", "CA", "C", "O", "CB", "CG", "CD", "CE", "NZ", "", "", "", "", ""], + "MET": ["N", "CA", "C", "O", "CB", "CG", "SD", "CE", "", "", "", "", "", ""], + "PHE": ["N", "CA", "C", "O", "CB", "CG", "CD1", "CD2", "CE1", "CE2", "CZ", "", "", ""], + "PRO": ["N", "CA", "C", "O", "CB", "CG", "CD", "", "", "", "", "", "", ""], + "SER": ["N", "CA", "C", "O", "CB", "OG", "", "", "", "", "", "", "", ""], + "THR": ["N", "CA", "C", "O", "CB", "OG1", "CG2", "", "", "", "", "", "", ""], + "TRP": ["N", "CA", "C", "O", "CB", "CG", "CD1", "CD2", "NE1", "CE2", "CE3", "CZ2", "CZ3", "CH2"], + "TYR": ["N", "CA", "C", "O", "CB", "CG", "CD1", "CD2", "CE1", "CE2", "CZ", "OH", "", ""], + "VAL": ["N", "CA", "C", "O", "CB", "CG1", "CG2", "", "", "", "", "", "", ""], + "UNK": ["", "", "", "", "", "", "", "", "", "", "", "", "", ""], +} +# pylint: enable=line-too-long + + +# This is the standard residue order when coding AA type as a number. +# Reproduce it by taking 3-letter AA codes and sorting them alphabetically. +restypes: list[str] = [ + "A", + "R", + "N", + "D", + "C", + "Q", + "E", + "G", + "H", + "I", + "L", + "K", + "M", + "F", + "P", + "S", + "T", + "W", + "Y", + "V", +] +restype_order: dict[str, int] = {restype: i for i, restype in enumerate(restypes)} +restype_num = len(restypes) # := 20. +unk_restype_index = restype_num # Catch-all index for unknown restypes. + +restypes_with_x: list[str] = restypes + ["X"] +restype_order_with_x: dict[str, int] = {restype: i for i, restype in enumerate(restypes_with_x)} + + +def sequence_to_onehot(sequence: str, mapping: Mapping[str, int], map_unknown_to_x: bool = False) -> np.ndarray: + """Maps the given sequence into a one-hot encoded matrix. 
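+
+ E.g. with the restype_order mapping defined above, sequence_to_onehot("AR", restype_order) returns a (2, 20)
+ array with ones at positions (0, 0) and (1, 1) and zeros everywhere else.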
+ + Args: + sequence: An amino acid sequence. + mapping: A dictionary mapping amino acids to integers. + map_unknown_to_x: If True, any amino acid that is not in the mapping will be + mapped to the unknown amino acid 'X'. If the mapping doesn't contain amino acid 'X', an error will be thrown. + If False, any amino acid not in the mapping will throw an error. + + Returns: + A numpy array of shape (seq_len, num_unique_aas) with one-hot encoding of the sequence. + + Raises: + ValueError: If the mapping doesn't contain values from 0 to + num_unique_aas - 1 without any gaps. + """ + num_entries = max(mapping.values()) + 1 + + if sorted(set(mapping.values())) != list(range(num_entries)): + raise ValueError( + "The mapping must have values from 0 to num_unique_aas-1 without any gaps. Got: %s" + % sorted(mapping.values()) + ) + + one_hot_arr = np.zeros((len(sequence), num_entries), dtype=np.int32) + + for aa_index, aa_type in enumerate(sequence): + if map_unknown_to_x: + if aa_type.isalpha() and aa_type.isupper(): + aa_id = mapping.get(aa_type, mapping["X"]) + else: + raise ValueError(f"Invalid character in the sequence: {aa_type}") + else: + aa_id = mapping[aa_type] + one_hot_arr[aa_index, aa_id] = 1 + + return one_hot_arr + + +restype_1to3: dict[str, str] = { + "A": "ALA", + "R": "ARG", + "N": "ASN", + "D": "ASP", + "C": "CYS", + "Q": "GLN", + "E": "GLU", + "G": "GLY", + "H": "HIS", + "I": "ILE", + "L": "LEU", + "K": "LYS", + "M": "MET", + "F": "PHE", + "P": "PRO", + "S": "SER", + "T": "THR", + "W": "TRP", + "Y": "TYR", + "V": "VAL", +} + + +# NB: restype_3to1 differs from Bio.PDB.protein_letters_3to1 by being a simple +# 1-to-1 mapping of 3 letter names to one letter names. The latter contains +# many more, and less common, three letter names as keys and maps many of these +# to the same one letter name (including 'X' and 'U' which we don't use here). +restype_3to1: dict[str, str] = {v: k for k, v in restype_1to3.items()} + +# Define a restype name for all unknown residues. +unk_restype = "UNK" + +resnames: list[str] = [restype_1to3[r] for r in restypes] + [unk_restype] +resname_to_idx: dict[str, int] = {resname: i for i, resname in enumerate(resnames)} + + +# The mapping here uses hhblits convention, so that B is mapped to D, J and O +# are mapped to X, U is mapped to C, and Z is mapped to E. Other than that the +# remaining 20 amino acids are kept in alphabetical order. +# There are 2 non-amino acid codes, X (representing any amino acid) and +# "-" representing a missing amino acid in an alignment. The id for these +# codes is put at the end (20 and 21) so that they can easily be ignored if +# desired. +HHBLITS_AA_TO_ID: dict[str, int] = { + "A": 0, + "B": 2, + "C": 1, + "D": 2, + "E": 3, + "F": 4, + "G": 5, + "H": 6, + "I": 7, + "J": 20, + "K": 8, + "L": 9, + "M": 10, + "N": 11, + "O": 20, + "P": 12, + "Q": 13, + "R": 14, + "S": 15, + "T": 16, + "U": 1, + "V": 17, + "W": 18, + "X": 20, + "Y": 19, + "Z": 3, + "-": 21, +} + +# Partial inversion of HHBLITS_AA_TO_ID. +ID_TO_HHBLITS_AA: dict[int, str] = { + 0: "A", + 1: "C", # Also U. + 2: "D", # Also B. + 3: "E", # Also Z. + 4: "F", + 5: "G", + 6: "H", + 7: "I", + 8: "K", + 9: "L", + 10: "M", + 11: "N", + 12: "P", + 13: "Q", + 14: "R", + 15: "S", + 16: "T", + 17: "V", + 18: "W", + 19: "Y", + 20: "X", # Includes J and O. + 21: "-", +} + +restypes_with_x_and_gap: list[str] = restypes + ["X", "-"] +MAP_HHBLITS_AATYPE_TO_OUR_AATYPE: tuple[int, ...] 
= tuple( + restypes_with_x_and_gap.index(ID_TO_HHBLITS_AA[i]) for i in range(len(restypes_with_x_and_gap)) +) + + +def _make_standard_atom_mask() -> np.ndarray: + """Returns [num_res_types, num_atom_types] mask array.""" + # +1 to account for unknown (all 0s). + mask = np.zeros([restype_num + 1, atom_type_num], dtype=np.int32) + for restype, restype_letter in enumerate(restypes): + restype_name = restype_1to3[restype_letter] + atom_names = residue_atoms[restype_name] + for atom_name in atom_names: + atom_type = atom_order[atom_name] + mask[restype, atom_type] = 1 + return mask + + +STANDARD_ATOM_MASK = _make_standard_atom_mask() + + +# A one hot representation for the first and second atoms defining the axis +# of rotation for each chi-angle in each residue. +def chi_angle_atom(atom_index: int) -> np.ndarray: + """Define chi-angle rigid groups via one-hot representations.""" + chi_angles_index = {} + one_hots = [] + + for k, v in chi_angles_atoms.items(): + indices = [atom_types.index(s[atom_index]) for s in v] + indices.extend([-1] * (4 - len(indices))) + chi_angles_index[k] = indices + + for r in restypes: + res3 = restype_1to3[r] + one_hot = np.eye(atom_type_num)[chi_angles_index[res3]] + one_hots.append(one_hot) + + one_hots.append(np.zeros([4, atom_type_num])) # Add zeros for residue `X`. + one_hot = np.stack(one_hots, axis=0) + one_hot = np.transpose(one_hot, [0, 2, 1]) + + return one_hot + + +chi_atom_1_one_hot = chi_angle_atom(1) +chi_atom_2_one_hot = chi_angle_atom(2) + +# An array like chi_angles_atoms but using indices rather than names. +chi_angles_atom_indices_list: list[list[list[str]]] = [chi_angles_atoms[restype_1to3[r]] for r in restypes] +chi_angles_atom_indices_ours: list = map_structure_with_atom_order(chi_angles_atom_indices_list) +chi_angles_atom_indices = np.array( + [chi_atoms + ([[0, 0, 0, 0]] * (4 - len(chi_atoms))) for chi_atoms in chi_angles_atom_indices_list] +) + +# Mapping from (res_name, atom_name) pairs to the atom's chi group index +# and atom index within that group. +chi_groups_for_atom: dict[tuple[str, str], list[tuple[int, int]]] = collections.defaultdict(list) +for res_name, chi_angle_atoms_for_res in chi_angles_atoms.items(): + for chi_group_i, chi_group in enumerate(chi_angle_atoms_for_res): + for atom_i, atom in enumerate(chi_group): + chi_groups_for_atom[(res_name, atom)].append((chi_group_i, atom_i)) +chi_groups_for_atom = dict(chi_groups_for_atom) + + +def _make_rigid_transformation_4x4(ex: np.ndarray, ey: np.ndarray, translation: np.ndarray) -> np.ndarray: + """Create a rigid 4x4 transformation matrix from two axes and transl.""" + # Normalize ex. 
+ ex_normalized = ex / np.linalg.norm(ex) + + # make ey perpendicular to ex + ey_normalized = ey - np.dot(ey, ex_normalized) * ex_normalized + ey_normalized /= np.linalg.norm(ey_normalized) + + # compute ez as cross product + eznorm = np.cross(ex_normalized, ey_normalized) + m = np.stack([ex_normalized, ey_normalized, eznorm, translation]).transpose() + m = np.concatenate([m, [[0.0, 0.0, 0.0, 1.0]]], axis=0) + return m + + +# create an array with (restype, atomtype) --> rigid_group_idx +# and an array with (restype, atomtype, coord) for the atom positions +# and compute affine transformation matrices (4,4) from one rigid group to the +# previous group +restype_atom37_to_rigid_group = np.zeros([21, 37], dtype=int) +restype_atom37_mask = np.zeros([21, 37], dtype=np.float32) +restype_atom37_rigid_group_positions = np.zeros([21, 37, 3], dtype=np.float32) +restype_atom14_to_rigid_group = np.zeros([21, 14], dtype=int) +restype_atom14_mask = np.zeros([21, 14], dtype=np.float32) +restype_atom14_rigid_group_positions = np.zeros([21, 14, 3], dtype=np.float32) +restype_rigid_group_default_frame = np.zeros([21, 8, 4, 4], dtype=np.float32) + + +def _make_rigid_group_constants() -> None: + """Fill the arrays above.""" + for restype, restype_letter in enumerate(restypes): + resname = restype_1to3[restype_letter] + for atomname, group_idx, atom_position in rigid_group_atom_positions[resname]: + atomtype = atom_order[atomname] + restype_atom37_to_rigid_group[restype, atomtype] = group_idx + restype_atom37_mask[restype, atomtype] = 1 + restype_atom37_rigid_group_positions[restype, atomtype, :] = atom_position + + atom14idx = restype_name_to_atom14_names[resname].index(atomname) + restype_atom14_to_rigid_group[restype, atom14idx] = group_idx + restype_atom14_mask[restype, atom14idx] = 1 + restype_atom14_rigid_group_positions[restype, atom14idx, :] = atom_position + + for restype, restype_letter in enumerate(restypes): + resname = restype_1to3[restype_letter] + atom_positions: dict[str, np.ndarray] = { + name: np.array(pos) for name, _, pos in rigid_group_atom_positions[resname] + } + + # backbone to backbone is the identity transform + restype_rigid_group_default_frame[restype, 0, :, :] = np.eye(4) + + # pre-omega-frame to backbone (currently dummy identity matrix) + restype_rigid_group_default_frame[restype, 1, :, :] = np.eye(4) + + # phi-frame to backbone + mat = _make_rigid_transformation_4x4( + ex=atom_positions["N"] - atom_positions["CA"], + ey=np.array([1.0, 0.0, 0.0]), + translation=atom_positions["N"], + ) + restype_rigid_group_default_frame[restype, 2, :, :] = mat + + # psi-frame to backbone + mat = _make_rigid_transformation_4x4( + ex=atom_positions["C"] - atom_positions["CA"], + ey=atom_positions["CA"] - atom_positions["N"], + translation=atom_positions["C"], + ) + restype_rigid_group_default_frame[restype, 3, :, :] = mat + + # chi1-frame to backbone + if chi_angles_mask[restype][0]: + base_atom_names = chi_angles_atoms[resname][0] + base_atom_positions = [atom_positions[name] for name in base_atom_names] + mat = _make_rigid_transformation_4x4( + ex=base_atom_positions[2] - base_atom_positions[1], + ey=base_atom_positions[0] - base_atom_positions[1], + translation=base_atom_positions[2], + ) + restype_rigid_group_default_frame[restype, 4, :, :] = mat + + # chi2-frame to chi1-frame + # chi3-frame to chi2-frame + # chi4-frame to chi3-frame + # luckily all rotation axes for the next frame start at (0,0,0) of the + # previous frame + for chi_idx in range(1, 4): + if chi_angles_mask[restype][chi_idx]: 
+ axis_end_atom_name = chi_angles_atoms[resname][chi_idx][2] + axis_end_atom_position = atom_positions[axis_end_atom_name] + mat = _make_rigid_transformation_4x4( + ex=axis_end_atom_position, + ey=np.array([-1.0, 0.0, 0.0]), + translation=axis_end_atom_position, + ) + restype_rigid_group_default_frame[restype, 4 + chi_idx, :, :] = mat + + +_make_rigid_group_constants() + + +def make_atom14_dists_bounds( + overlap_tolerance: float = 1.5, + bond_length_tolerance_factor: int = 15, +) -> dict[str, np.ndarray]: + """compute upper and lower bounds for bonds to assess violations.""" + restype_atom14_bond_lower_bound = np.zeros([21, 14, 14], np.float32) + restype_atom14_bond_upper_bound = np.zeros([21, 14, 14], np.float32) + restype_atom14_bond_stddev = np.zeros([21, 14, 14], np.float32) + residue_bonds, residue_virtual_bonds, _ = load_stereo_chemical_props() + for restype, restype_letter in enumerate(restypes): + resname = restype_1to3[restype_letter] + atom_list = restype_name_to_atom14_names[resname] + + # create lower and upper bounds for clashes + for atom1_idx, atom1_name in enumerate(atom_list): + if not atom1_name: + continue + atom1_radius = van_der_waals_radius[atom1_name[0]] + for atom2_idx, atom2_name in enumerate(atom_list): + if (not atom2_name) or atom1_idx == atom2_idx: + continue + atom2_radius = van_der_waals_radius[atom2_name[0]] + lower = atom1_radius + atom2_radius - overlap_tolerance + upper = 1e10 + restype_atom14_bond_lower_bound[restype, atom1_idx, atom2_idx] = lower + restype_atom14_bond_lower_bound[restype, atom2_idx, atom1_idx] = lower + restype_atom14_bond_upper_bound[restype, atom1_idx, atom2_idx] = upper + restype_atom14_bond_upper_bound[restype, atom2_idx, atom1_idx] = upper + + # overwrite lower and upper bounds for bonds and angles + for b in residue_bonds[resname] + residue_virtual_bonds[resname]: + atom1_idx = atom_list.index(b.atom1_name) + atom2_idx = atom_list.index(b.atom2_name) + lower = b.length - bond_length_tolerance_factor * b.stddev + upper = b.length + bond_length_tolerance_factor * b.stddev + restype_atom14_bond_lower_bound[restype, atom1_idx, atom2_idx] = lower + restype_atom14_bond_lower_bound[restype, atom2_idx, atom1_idx] = lower + restype_atom14_bond_upper_bound[restype, atom1_idx, atom2_idx] = upper + restype_atom14_bond_upper_bound[restype, atom2_idx, atom1_idx] = upper + restype_atom14_bond_stddev[restype, atom1_idx, atom2_idx] = b.stddev + restype_atom14_bond_stddev[restype, atom2_idx, atom1_idx] = b.stddev + return { + "lower_bound": restype_atom14_bond_lower_bound, # shape (21,14,14) + "upper_bound": restype_atom14_bond_upper_bound, # shape (21,14,14) + "stddev": restype_atom14_bond_stddev, # shape (21,14,14) + } + + +restype_atom14_ambiguous_atoms = np.zeros((21, 14), dtype=np.float32) +restype_atom14_ambiguous_atoms_swap_idx: np.ndarray = np.tile(np.arange(14, dtype=int), (21, 1)) + + +def _make_atom14_ambiguity_feats() -> None: + for res, pairs in residue_atom_renaming_swaps.items(): + res_idx = restype_order[restype_3to1[res]] + for atom1, atom2 in pairs.items(): + atom1_idx = restype_name_to_atom14_names[res].index(atom1) + atom2_idx = restype_name_to_atom14_names[res].index(atom2) + restype_atom14_ambiguous_atoms[res_idx, atom1_idx] = 1 + restype_atom14_ambiguous_atoms[res_idx, atom2_idx] = 1 + restype_atom14_ambiguous_atoms_swap_idx[res_idx, atom1_idx] = atom2_idx + restype_atom14_ambiguous_atoms_swap_idx[res_idx, atom2_idx] = atom1_idx + + +_make_atom14_ambiguity_feats() + + +def aatype_to_str_sequence(aatype: Sequence[int]) -> 
str: + return "".join([restypes_with_x[aatype[i]] for i in range(len(aatype))]) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/rigid_utils.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/rigid_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..e2bc25cb6d53f0428be3f6c1c25e67d67edab739 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/rigid_utils.py @@ -0,0 +1,1243 @@ +# Copyright 2021 AlQuraishi Laboratory +# Copyright 2021 DeepMind Technologies Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +from collections.abc import Sequence +from functools import cache +from typing import Any, Callable + +import numpy as np +import torch + + +def rot_matmul(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor: + """ + Performs matrix multiplication of two rotation matrix tensors. Written out by hand to avoid AMP downcasting. + + Args: + a: [*, 3, 3] left multiplicand + b: [*, 3, 3] right multiplicand + Returns: + The product ab + """ + + def row_mul(i: int) -> torch.Tensor: + return torch.stack( + [ + a[..., i, 0] * b[..., 0, 0] + a[..., i, 1] * b[..., 1, 0] + a[..., i, 2] * b[..., 2, 0], + a[..., i, 0] * b[..., 0, 1] + a[..., i, 1] * b[..., 1, 1] + a[..., i, 2] * b[..., 2, 1], + a[..., i, 0] * b[..., 0, 2] + a[..., i, 1] * b[..., 1, 2] + a[..., i, 2] * b[..., 2, 2], + ], + dim=-1, + ) + + return torch.stack( + [ + row_mul(0), + row_mul(1), + row_mul(2), + ], + dim=-2, + ) + + +def rot_vec_mul(r: torch.Tensor, t: torch.Tensor) -> torch.Tensor: + """ + Applies a rotation to a vector. Written out by hand to avoid transfer to avoid AMP downcasting. 
+ + Args: + r: [*, 3, 3] rotation matrices + t: [*, 3] coordinate tensors + Returns: + [*, 3] rotated coordinates + """ + x, y, z = torch.unbind(t, dim=-1) + return torch.stack( + [ + r[..., 0, 0] * x + r[..., 0, 1] * y + r[..., 0, 2] * z, + r[..., 1, 0] * x + r[..., 1, 1] * y + r[..., 1, 2] * z, + r[..., 2, 0] * x + r[..., 2, 1] * y + r[..., 2, 2] * z, + ], + dim=-1, + ) + + +@cache +def identity_rot_mats( + batch_dims: tuple[int, ...], + dtype: torch.dtype | None = None, + device: torch.device | None = None, + requires_grad: bool = True, +) -> torch.Tensor: + rots = torch.eye(3, dtype=dtype, device=device, requires_grad=requires_grad) + rots = rots.view(*((1,) * len(batch_dims)), 3, 3) + rots = rots.expand(*batch_dims, -1, -1) + rots = rots.contiguous() + + return rots + + +@cache +def identity_trans( + batch_dims: tuple[int, ...], + dtype: torch.dtype | None = None, + device: torch.device | None = None, + requires_grad: bool = True, +) -> torch.Tensor: + trans = torch.zeros((*batch_dims, 3), dtype=dtype, device=device, requires_grad=requires_grad) + return trans + + +@cache +def identity_quats( + batch_dims: tuple[int, ...], + dtype: torch.dtype | None = None, + device: torch.device | None = None, + requires_grad: bool = True, +) -> torch.Tensor: + quat = torch.zeros((*batch_dims, 4), dtype=dtype, device=device, requires_grad=requires_grad) + + with torch.no_grad(): + quat[..., 0] = 1 + + return quat + + +_quat_elements: list[str] = ["a", "b", "c", "d"] +_qtr_keys: list[str] = [l1 + l2 for l1 in _quat_elements for l2 in _quat_elements] +_qtr_ind_dict: dict[str, int] = {key: ind for ind, key in enumerate(_qtr_keys)} + + +def _to_mat(pairs: list[tuple[str, int]]) -> np.ndarray: + mat = np.zeros((4, 4)) + for key, value in pairs: + ind = _qtr_ind_dict[key] + mat[ind // 4][ind % 4] = value + + return mat + + +_QTR_MAT = np.zeros((4, 4, 3, 3)) +_QTR_MAT[..., 0, 0] = _to_mat([("aa", 1), ("bb", 1), ("cc", -1), ("dd", -1)]) +_QTR_MAT[..., 0, 1] = _to_mat([("bc", 2), ("ad", -2)]) +_QTR_MAT[..., 0, 2] = _to_mat([("bd", 2), ("ac", 2)]) +_QTR_MAT[..., 1, 0] = _to_mat([("bc", 2), ("ad", 2)]) +_QTR_MAT[..., 1, 1] = _to_mat([("aa", 1), ("bb", -1), ("cc", 1), ("dd", -1)]) +_QTR_MAT[..., 1, 2] = _to_mat([("cd", 2), ("ab", -2)]) +_QTR_MAT[..., 2, 0] = _to_mat([("bd", 2), ("ac", -2)]) +_QTR_MAT[..., 2, 1] = _to_mat([("cd", 2), ("ab", 2)]) +_QTR_MAT[..., 2, 2] = _to_mat([("aa", 1), ("bb", -1), ("cc", -1), ("dd", 1)]) + + +def quat_to_rot(quat: torch.Tensor) -> torch.Tensor: + """ + Converts a quaternion to a rotation matrix. 
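+
+ Quaternions in this module are ordered (w, x, y, z), matching identity_quats, so the identity quaternion
+ (1, 0, 0, 0) maps to the 3x3 identity matrix.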
+ + Args: + quat: [*, 4] quaternions + Returns: + [*, 3, 3] rotation matrices + """ + # [*, 4, 4] + quat = quat[..., None] * quat[..., None, :] + + # [4, 4, 3, 3] + mat = _get_quat("_QTR_MAT", dtype=quat.dtype, device=quat.device) + + # [*, 4, 4, 3, 3] + shaped_qtr_mat = mat.view((1,) * len(quat.shape[:-2]) + mat.shape) + quat = quat[..., None, None] * shaped_qtr_mat + + # [*, 3, 3] + return torch.sum(quat, dim=(-3, -4)) + + +def rot_to_quat(rot: torch.Tensor) -> torch.Tensor: + if rot.shape[-2:] != (3, 3): + raise ValueError("Input rotation is incorrectly shaped") + + [[xx, xy, xz], [yx, yy, yz], [zx, zy, zz]] = [[rot[..., i, j] for j in range(3)] for i in range(3)] + + k = [ + [ + xx + yy + zz, + zy - yz, + xz - zx, + yx - xy, + ], + [ + zy - yz, + xx - yy - zz, + xy + yx, + xz + zx, + ], + [ + xz - zx, + xy + yx, + yy - xx - zz, + yz + zy, + ], + [ + yx - xy, + xz + zx, + yz + zy, + zz - xx - yy, + ], + ] + + _, vectors = torch.linalg.eigh((1.0 / 3.0) * torch.stack([torch.stack(t, dim=-1) for t in k], dim=-2)) + return vectors[..., -1] + + +_QUAT_MULTIPLY = np.zeros((4, 4, 4)) +_QUAT_MULTIPLY[:, :, 0] = [[1, 0, 0, 0], [0, -1, 0, 0], [0, 0, -1, 0], [0, 0, 0, -1]] + +_QUAT_MULTIPLY[:, :, 1] = [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, -1, 0]] + +_QUAT_MULTIPLY[:, :, 2] = [[0, 0, 1, 0], [0, 0, 0, -1], [1, 0, 0, 0], [0, 1, 0, 0]] + +_QUAT_MULTIPLY[:, :, 3] = [[0, 0, 0, 1], [0, 0, 1, 0], [0, -1, 0, 0], [1, 0, 0, 0]] + +_QUAT_MULTIPLY_BY_VEC = _QUAT_MULTIPLY[:, 1:, :] + +_CACHED_QUATS: dict[str, np.ndarray] = { + "_QTR_MAT": _QTR_MAT, + "_QUAT_MULTIPLY": _QUAT_MULTIPLY, + "_QUAT_MULTIPLY_BY_VEC": _QUAT_MULTIPLY_BY_VEC, +} + + +@cache +def _get_quat(quat_key: str, dtype: torch.dtype, device: torch.device) -> torch.Tensor: + return torch.tensor(_CACHED_QUATS[quat_key], dtype=dtype, device=device) + + +def quat_multiply(quat1: torch.Tensor, quat2: torch.Tensor) -> torch.Tensor: + """Multiply a quaternion by another quaternion.""" + mat = _get_quat("_QUAT_MULTIPLY", dtype=quat1.dtype, device=quat1.device) + reshaped_mat = mat.view((1,) * len(quat1.shape[:-1]) + mat.shape) + return torch.sum(reshaped_mat * quat1[..., :, None, None] * quat2[..., None, :, None], dim=(-3, -2)) + + +def quat_multiply_by_vec(quat: torch.Tensor, vec: torch.Tensor) -> torch.Tensor: + """Multiply a quaternion by a pure-vector quaternion.""" + mat = _get_quat("_QUAT_MULTIPLY_BY_VEC", dtype=quat.dtype, device=quat.device) + reshaped_mat = mat.view((1,) * len(quat.shape[:-1]) + mat.shape) + return torch.sum(reshaped_mat * quat[..., :, None, None] * vec[..., None, :, None], dim=(-3, -2)) + + +def invert_rot_mat(rot_mat: torch.Tensor) -> torch.Tensor: + return rot_mat.transpose(-1, -2) + + +def invert_quat(quat: torch.Tensor) -> torch.Tensor: + quat_prime = quat.clone() + quat_prime[..., 1:] *= -1 + inv = quat_prime / torch.sum(quat**2, dim=-1, keepdim=True) + return inv + + +class Rotation: + """ + A 3D rotation. Depending on how the object is initialized, the rotation is represented by either a rotation matrix + or a quaternion, though both formats are made available by helper functions. To simplify gradient computation, the + underlying format of the rotation cannot be changed in-place. Like Rigid, the class is designed to mimic the + behavior of a torch Tensor, almost as if each Rotation object were a tensor of rotations, in one format or another. 
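+
+ E.g.::
+
+ # Minimal shape-only sketch: a batch of 5 identity rotations stored as quaternions.
+ rots = Rotation.identity((5,), fmt="quat")
+ assert rots.shape == (5,)
+ assert rots.get_rot_mats().shape == (5, 3, 3)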
+ """ + + def __init__( + self, + rot_mats: torch.Tensor | None = None, + quats: torch.Tensor | None = None, + normalize_quats: bool = True, + ): + """ + Args: + rot_mats: + A [*, 3, 3] rotation matrix tensor. Mutually exclusive with quats + quats: + A [*, 4] quaternion. Mutually exclusive with rot_mats. If normalize_quats is not True, must be a unit + quaternion + normalize_quats: + If quats is specified, whether to normalize quats + """ + if (rot_mats is None and quats is None) or (rot_mats is not None and quats is not None): + raise ValueError("Exactly one input argument must be specified") + + if (rot_mats is not None and rot_mats.shape[-2:] != (3, 3)) or (quats is not None and quats.shape[-1] != 4): + raise ValueError("Incorrectly shaped rotation matrix or quaternion") + + # Force full-precision + if quats is not None: + quats = quats.to(dtype=torch.float32) + if rot_mats is not None: + rot_mats = rot_mats.to(dtype=torch.float32) + + if quats is not None and normalize_quats: + quats = quats / torch.linalg.norm(quats, dim=-1, keepdim=True) + + self._rot_mats = rot_mats + self._quats = quats + + @staticmethod + def identity( + shape, + dtype: torch.dtype | None = None, + device: torch.device | None = None, + requires_grad: bool = True, + fmt: str = "quat", + ) -> Rotation: + """ + Returns an identity Rotation. + + Args: + shape: + The "shape" of the resulting Rotation object. See documentation for the shape property + dtype: + The torch dtype for the rotation + device: + The torch device for the new rotation + requires_grad: + Whether the underlying tensors in the new rotation object should require gradient computation + fmt: + One of "quat" or "rot_mat". Determines the underlying format of the new object's rotation + Returns: + A new identity rotation + """ + if fmt == "rot_mat": + rot_mats = identity_rot_mats( + shape, + dtype, + device, + requires_grad, + ) + return Rotation(rot_mats=rot_mats, quats=None) + elif fmt == "quat": + quats = identity_quats(shape, dtype, device, requires_grad) + return Rotation(rot_mats=None, quats=quats, normalize_quats=False) + else: + raise ValueError(f"Invalid format: f{fmt}") + + # Magic methods + + def __getitem__(self, index: Any) -> Rotation: + """ + Allows torch-style indexing over the virtual shape of the rotation object. See documentation for the shape + property. + + Args: + index: + A torch index. E.g. (1, 3, 2), or (slice(None,)) + Returns: + The indexed rotation + """ + if type(index) is not tuple: + index = (index,) + + if self._rot_mats is not None: + rot_mats = self._rot_mats[index + (slice(None), slice(None))] + return Rotation(rot_mats=rot_mats) + elif self._quats is not None: + quats = self._quats[index + (slice(None),)] + return Rotation(quats=quats, normalize_quats=False) + else: + raise ValueError("Both rotations are None") + + def __mul__(self, right: torch.Tensor) -> Rotation: + """ + Pointwise left multiplication of the rotation with a tensor. Can be used to e.g. mask the Rotation. 
+ + Args: + right: + The tensor multiplicand + Returns: + The product + """ + if not (isinstance(right, torch.Tensor)): + raise TypeError("The other multiplicand must be a Tensor") + + if self._rot_mats is not None: + rot_mats = self._rot_mats * right[..., None, None] + return Rotation(rot_mats=rot_mats, quats=None) + elif self._quats is not None: + quats = self._quats * right[..., None] + return Rotation(rot_mats=None, quats=quats, normalize_quats=False) + else: + raise ValueError("Both rotations are None") + + def __rmul__(self, left: torch.Tensor) -> Rotation: + """ + Reverse pointwise multiplication of the rotation with a tensor. + + Args: + left: + The left multiplicand + Returns: + The product + """ + return self.__mul__(left) + + # Properties + + @property + def shape(self) -> torch.Size: + """ + Returns the virtual shape of the rotation object. This shape is defined as the batch dimensions of the + underlying rotation matrix or quaternion. If the Rotation was initialized with a [10, 3, 3] rotation matrix + tensor, for example, the resulting shape would be [10]. + + Returns: + The virtual shape of the rotation object + """ + if self._rot_mats is not None: + return self._rot_mats.shape[:-2] + elif self._quats is not None: + return self._quats.shape[:-1] + else: + raise ValueError("Both rotations are None") + + @property + def dtype(self) -> torch.dtype: + """ + Returns the dtype of the underlying rotation. + + Returns: + The dtype of the underlying rotation + """ + if self._rot_mats is not None: + return self._rot_mats.dtype + elif self._quats is not None: + return self._quats.dtype + else: + raise ValueError("Both rotations are None") + + @property + def device(self) -> torch.device: + """ + The device of the underlying rotation + + Returns: + The device of the underlying rotation + """ + if self._rot_mats is not None: + return self._rot_mats.device + elif self._quats is not None: + return self._quats.device + else: + raise ValueError("Both rotations are None") + + @property + def requires_grad(self) -> bool: + """ + Returns the requires_grad property of the underlying rotation + + Returns: + The requires_grad property of the underlying tensor + """ + if self._rot_mats is not None: + return self._rot_mats.requires_grad + elif self._quats is not None: + return self._quats.requires_grad + else: + raise ValueError("Both rotations are None") + + def get_rot_mats(self) -> torch.Tensor: + """ + Returns the underlying rotation as a rotation matrix tensor. + + Returns: + The rotation as a rotation matrix tensor + """ + if self._rot_mats is not None: + return self._rot_mats + elif self._quats is not None: + return quat_to_rot(self._quats) + else: + raise ValueError("Both rotations are None") + + def get_quats(self) -> torch.Tensor: + """ + Returns the underlying rotation as a quaternion tensor. + + Depending on whether the Rotation was initialized with a quaternion, this function may call torch.linalg.eigh. + + Returns: + The rotation as a quaternion tensor. 
+ """ + if self._rot_mats is not None: + return rot_to_quat(self._rot_mats) + elif self._quats is not None: + return self._quats + else: + raise ValueError("Both rotations are None") + + def get_cur_rot(self) -> torch.Tensor: + """ + Return the underlying rotation in its current form + + Returns: + The stored rotation + """ + if self._rot_mats is not None: + return self._rot_mats + elif self._quats is not None: + return self._quats + else: + raise ValueError("Both rotations are None") + + # Rotation functions + + def compose_q_update_vec(self, q_update_vec: torch.Tensor, normalize_quats: bool = True) -> Rotation: + """ + Returns a new quaternion Rotation after updating the current object's underlying rotation with a quaternion + update, formatted as a [*, 3] tensor whose final three columns represent x, y, z such that (1, x, y, z) is the + desired (not necessarily unit) quaternion update. + + Args: + q_update_vec: + A [*, 3] quaternion update tensor + normalize_quats: + Whether to normalize the output quaternion + Returns: + An updated Rotation + """ + quats = self.get_quats() + new_quats = quats + quat_multiply_by_vec(quats, q_update_vec) + return Rotation( + rot_mats=None, + quats=new_quats, + normalize_quats=normalize_quats, + ) + + def compose_r(self, r: Rotation) -> Rotation: + """ + Compose the rotation matrices of the current Rotation object with those of another. + + Args: + r: + An update rotation object + Returns: + An updated rotation object + """ + r1 = self.get_rot_mats() + r2 = r.get_rot_mats() + new_rot_mats = rot_matmul(r1, r2) + return Rotation(rot_mats=new_rot_mats, quats=None) + + def compose_q(self, r: Rotation, normalize_quats: bool = True) -> Rotation: + """ + Compose the quaternions of the current Rotation object with those of another. + + Depending on whether either Rotation was initialized with quaternions, this function may call + torch.linalg.eigh. + + Args: + r: + An update rotation object + Returns: + An updated rotation object + """ + q1 = self.get_quats() + q2 = r.get_quats() + new_quats = quat_multiply(q1, q2) + return Rotation(rot_mats=None, quats=new_quats, normalize_quats=normalize_quats) + + def apply(self, pts: torch.Tensor) -> torch.Tensor: + """ + Apply the current Rotation as a rotation matrix to a set of 3D coordinates. + + Args: + pts: + A [*, 3] set of points + Returns: + [*, 3] rotated points + """ + rot_mats = self.get_rot_mats() + return rot_vec_mul(rot_mats, pts) + + def invert_apply(self, pts: torch.Tensor) -> torch.Tensor: + """ + The inverse of the apply() method. + + Args: + pts: + A [*, 3] set of points + Returns: + [*, 3] inverse-rotated points + """ + rot_mats = self.get_rot_mats() + inv_rot_mats = invert_rot_mat(rot_mats) + return rot_vec_mul(inv_rot_mats, pts) + + def invert(self) -> Rotation: + """ + Returns the inverse of the current Rotation. + + Returns: + The inverse of the current Rotation + """ + if self._rot_mats is not None: + return Rotation(rot_mats=invert_rot_mat(self._rot_mats), quats=None) + elif self._quats is not None: + return Rotation( + rot_mats=None, + quats=invert_quat(self._quats), + normalize_quats=False, + ) + else: + raise ValueError("Both rotations are None") + + # "Tensor" stuff + + def unsqueeze(self, dim: int) -> Rotation: + """ + Analogous to torch.unsqueeze. The dimension is relative to the shape of the Rotation object. + + Args: + dim: A positive or negative dimension index. + Returns: + The unsqueezed Rotation. 
+ """ + if dim >= len(self.shape): + raise ValueError("Invalid dimension") + + if self._rot_mats is not None: + rot_mats = self._rot_mats.unsqueeze(dim if dim >= 0 else dim - 2) + return Rotation(rot_mats=rot_mats, quats=None) + elif self._quats is not None: + quats = self._quats.unsqueeze(dim if dim >= 0 else dim - 1) + return Rotation(rot_mats=None, quats=quats, normalize_quats=False) + else: + raise ValueError("Both rotations are None") + + @staticmethod + def cat(rs: Sequence[Rotation], dim: int) -> Rotation: + """ + Concatenates rotations along one of the batch dimensions. Analogous to torch.cat(). + + Note that the output of this operation is always a rotation matrix, regardless of the format of input + rotations. + + Args: + rs: + A list of rotation objects + dim: + The dimension along which the rotations should be concatenated + Returns: + A concatenated Rotation object in rotation matrix format + """ + rot_mats = torch.cat( + [r.get_rot_mats() for r in rs], + dim=dim if dim >= 0 else dim - 2, + ) + + return Rotation(rot_mats=rot_mats, quats=None) + + def map_tensor_fn(self, fn: Callable[[torch.Tensor], torch.Tensor]) -> Rotation: + """ + Apply a Tensor -> Tensor function to underlying rotation tensors, mapping over the rotation dimension(s). Can + be used e.g. to sum out a one-hot batch dimension. + + Args: + fn: + A Tensor -> Tensor function to be mapped over the Rotation + Returns: + The transformed Rotation object + """ + if self._rot_mats is not None: + rot_mats = self._rot_mats.view(self._rot_mats.shape[:-2] + (9,)) + rot_mats = torch.stack(list(map(fn, torch.unbind(rot_mats, dim=-1))), dim=-1) + rot_mats = rot_mats.view(rot_mats.shape[:-1] + (3, 3)) + return Rotation(rot_mats=rot_mats, quats=None) + elif self._quats is not None: + quats = torch.stack(list(map(fn, torch.unbind(self._quats, dim=-1))), dim=-1) + return Rotation(rot_mats=None, quats=quats, normalize_quats=False) + else: + raise ValueError("Both rotations are None") + + def cuda(self) -> Rotation: + """ + Analogous to the cuda() method of torch Tensors + + Returns: + A copy of the Rotation in CUDA memory + """ + if self._rot_mats is not None: + return Rotation(rot_mats=self._rot_mats.cuda(), quats=None) + elif self._quats is not None: + return Rotation(rot_mats=None, quats=self._quats.cuda(), normalize_quats=False) + else: + raise ValueError("Both rotations are None") + + def to(self, device: torch.device | None, dtype: torch.dtype | None) -> Rotation: + """ + Analogous to the to() method of torch Tensors + + Args: + device: + A torch device + dtype: + A torch dtype + Returns: + A copy of the Rotation using the new device and dtype + """ + if self._rot_mats is not None: + return Rotation( + rot_mats=self._rot_mats.to(device=device, dtype=dtype), + quats=None, + ) + elif self._quats is not None: + return Rotation( + rot_mats=None, + quats=self._quats.to(device=device, dtype=dtype), + normalize_quats=False, + ) + else: + raise ValueError("Both rotations are None") + + def detach(self) -> Rotation: + """ + Returns a copy of the Rotation whose underlying Tensor has been detached from its torch graph. 
+ + Returns: + A copy of the Rotation whose underlying Tensor has been detached from its torch graph + """ + if self._rot_mats is not None: + return Rotation(rot_mats=self._rot_mats.detach(), quats=None) + elif self._quats is not None: + return Rotation( + rot_mats=None, + quats=self._quats.detach(), + normalize_quats=False, + ) + else: + raise ValueError("Both rotations are None") + + +class Rigid: + """ + A class representing a rigid transformation. Little more than a wrapper around two objects: a Rotation object and a + [*, 3] translation Designed to behave approximately like a single torch tensor with the shape of the shared batch + dimensions of its component parts. + """ + + def __init__(self, rots: Rotation | None, trans: torch.Tensor | None): + """ + Args: + rots: A [*, 3, 3] rotation tensor + trans: A corresponding [*, 3] translation tensor + """ + # (we need device, dtype, etc. from at least one input) + + batch_dims, dtype, device, requires_grad = None, None, None, None + if trans is not None: + batch_dims = trans.shape[:-1] + dtype = trans.dtype + device = trans.device + requires_grad = trans.requires_grad + elif rots is not None: + batch_dims = rots.shape + dtype = rots.dtype + device = rots.device + requires_grad = rots.requires_grad + else: + raise ValueError("At least one input argument must be specified") + + if rots is None: + rots = Rotation.identity( + batch_dims, + dtype, + device, + requires_grad, + ) + elif trans is None: + trans = identity_trans( + batch_dims, + dtype, + device, + requires_grad, + ) + + assert rots is not None + assert trans is not None + + if (rots.shape != trans.shape[:-1]) or (rots.device != trans.device): + raise ValueError("Rots and trans incompatible") + + # Force full precision. Happens to the rotations automatically. + trans = trans.to(dtype=torch.float32) + + self._rots = rots + self._trans = trans + + @staticmethod + def identity( + shape: tuple[int, ...], + dtype: torch.dtype | None = None, + device: torch.device | None = None, + requires_grad: bool = True, + fmt: str = "quat", + ) -> Rigid: + """ + Constructs an identity transformation. + + Args: + shape: + The desired shape + dtype: + The dtype of both internal tensors + device: + The device of both internal tensors + requires_grad: + Whether grad should be enabled for the internal tensors + Returns: + The identity transformation + """ + return Rigid( + Rotation.identity(shape, dtype, device, requires_grad, fmt=fmt), + identity_trans(shape, dtype, device, requires_grad), + ) + + def __getitem__(self, index: Any) -> Rigid: + """ + Indexes the affine transformation with PyTorch-style indices. The index is applied to the shared dimensions of + both the rotation and the translation. + + E.g.:: + + r = Rotation(rot_mats=torch.rand(10, 10, 3, 3), quats=None) t = Rigid(r, torch.rand(10, 10, 3)) indexed = + t[3, 4:6] assert(indexed.shape == (2,)) assert(indexed.get_rots().shape == (2,)) + assert(indexed.get_trans().shape == (2, 3)) + + Args: + index: A standard torch tensor index. E.g. 8, (10, None, 3), + or (3, slice(0, 1, None)) + Returns: + The indexed tensor + """ + if type(index) is not tuple: + index = (index,) + + return Rigid( + self._rots[index], + self._trans[index + (slice(None),)], + ) + + def __mul__(self, right: torch.Tensor) -> Rigid: + """ + Pointwise left multiplication of the transformation with a tensor. Can be used to e.g. mask the Rigid. 
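+
+ E.g.::
+
+ # Shape-only sketch: mask out the second of two identity transformations.
+ masked = Rigid.identity((2,)) * torch.tensor([1.0, 0.0])
+ assert masked.shape == (2,)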
+ + Args: + right: + The tensor multiplicand + Returns: + The product + """ + if not (isinstance(right, torch.Tensor)): + raise TypeError("The other multiplicand must be a Tensor") + + new_rots = self._rots * right + new_trans = self._trans * right[..., None] + + return Rigid(new_rots, new_trans) + + def __rmul__(self, left: torch.Tensor) -> Rigid: + """ + Reverse pointwise multiplication of the transformation with a tensor. + + Args: + left: + The left multiplicand + Returns: + The product + """ + return self.__mul__(left) + + @property + def shape(self) -> torch.Size: + """ + Returns the shape of the shared dimensions of the rotation and the translation. + + Returns: + The shape of the transformation + """ + return self._trans.shape[:-1] + + @property + def device(self) -> torch.device: + """ + Returns the device on which the Rigid's tensors are located. + + Returns: + The device on which the Rigid's tensors are located + """ + return self._trans.device + + def get_rots(self) -> Rotation: + """ + Getter for the rotation. + + Returns: + The rotation object + """ + return self._rots + + def get_trans(self) -> torch.Tensor: + """ + Getter for the translation. + + Returns: + The stored translation + """ + return self._trans + + def compose_q_update_vec(self, q_update_vec: torch.Tensor) -> Rigid: + """ + Composes the transformation with a quaternion update vector of shape [*, 6], where the final 6 columns + represent the x, y, and z values of a quaternion of form (1, x, y, z) followed by a 3D translation. + + Args: + q_vec: The quaternion update vector. + Returns: + The composed transformation. + """ + q_vec, t_vec = q_update_vec[..., :3], q_update_vec[..., 3:] + new_rots = self._rots.compose_q_update_vec(q_vec) + + trans_update = self._rots.apply(t_vec) + new_translation = self._trans + trans_update + + return Rigid(new_rots, new_translation) + + def compose(self, r: Rigid) -> Rigid: + """ + Composes the current rigid object with another. + + Args: + r: + Another Rigid object + Returns: + The composition of the two transformations + """ + new_rot = self._rots.compose_r(r._rots) + new_trans = self._rots.apply(r._trans) + self._trans + return Rigid(new_rot, new_trans) + + def apply(self, pts: torch.Tensor) -> torch.Tensor: + """ + Applies the transformation to a coordinate tensor. + + Args: + pts: A [*, 3] coordinate tensor. + Returns: + The transformed points. + """ + rotated = self._rots.apply(pts) + return rotated + self._trans + + def invert_apply(self, pts: torch.Tensor) -> torch.Tensor: + """ + Applies the inverse of the transformation to a coordinate tensor. + + Args: + pts: A [*, 3] coordinate tensor + Returns: + The transformed points. + """ + pts = pts - self._trans + return self._rots.invert_apply(pts) + + def invert(self) -> Rigid: + """ + Inverts the transformation. + + Returns: + The inverse transformation. + """ + rot_inv = self._rots.invert() + trn_inv = rot_inv.apply(self._trans) + + return Rigid(rot_inv, -1 * trn_inv) + + def map_tensor_fn(self, fn: Callable[[torch.Tensor], torch.Tensor]) -> Rigid: + """ + Apply a Tensor -> Tensor function to underlying translation and rotation tensors, mapping over the + translation/rotation dimensions respectively. 
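+
+ E.g.::
+
+ # Shape-only sketch: sum out a leading one-hot dimension of size 4 from a [4, 10]-shaped Rigid.
+ t = Rigid.identity((4, 10))
+ summed = t.map_tensor_fn(lambda x: torch.sum(x, dim=0))
+ assert summed.shape == (10,)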
+ + Args: + fn: + A Tensor -> Tensor function to be mapped over the Rigid + Returns: + The transformed Rigid object + """ + new_rots = self._rots.map_tensor_fn(fn) + new_trans = torch.stack(list(map(fn, torch.unbind(self._trans, dim=-1))), dim=-1) + + return Rigid(new_rots, new_trans) + + def to_tensor_4x4(self) -> torch.Tensor: + """ + Converts a transformation to a homogeneous transformation tensor. + + Returns: + A [*, 4, 4] homogeneous transformation tensor + """ + tensor = self._trans.new_zeros((*self.shape, 4, 4)) + tensor[..., :3, :3] = self._rots.get_rot_mats() + tensor[..., :3, 3] = self._trans + tensor[..., 3, 3] = 1 + return tensor + + @staticmethod + def from_tensor_4x4(t: torch.Tensor) -> Rigid: + """ + Constructs a transformation from a homogeneous transformation tensor. + + Args: + t: [*, 4, 4] homogeneous transformation tensor + Returns: + T object with shape [*] + """ + if t.shape[-2:] != (4, 4): + raise ValueError("Incorrectly shaped input tensor") + + rots = Rotation(rot_mats=t[..., :3, :3], quats=None) + trans = t[..., :3, 3] + + return Rigid(rots, trans) + + def to_tensor_7(self) -> torch.Tensor: + """ + Converts a transformation to a tensor with 7 final columns, four for the quaternion followed by three for the + translation. + + Returns: + A [*, 7] tensor representation of the transformation + """ + tensor = self._trans.new_zeros((*self.shape, 7)) + tensor[..., :4] = self._rots.get_quats() + tensor[..., 4:] = self._trans + + return tensor + + @staticmethod + def from_tensor_7(t: torch.Tensor, normalize_quats: bool = False) -> Rigid: + if t.shape[-1] != 7: + raise ValueError("Incorrectly shaped input tensor") + + quats, trans = t[..., :4], t[..., 4:] + + rots = Rotation(rot_mats=None, quats=quats, normalize_quats=normalize_quats) + + return Rigid(rots, trans) + + @staticmethod + def from_3_points( + p_neg_x_axis: torch.Tensor, origin: torch.Tensor, p_xy_plane: torch.Tensor, eps: float = 1e-8 + ) -> Rigid: + """ + Implements algorithm 21. Constructs transformations from sets of 3 points using the Gram-Schmidt algorithm. + + Args: + p_neg_x_axis: [*, 3] coordinates + origin: [*, 3] coordinates used as frame origins + p_xy_plane: [*, 3] coordinates + eps: Small epsilon value + Returns: + A transformation object of shape [*] + """ + p_neg_x_axis_unbound = torch.unbind(p_neg_x_axis, dim=-1) + origin_unbound = torch.unbind(origin, dim=-1) + p_xy_plane_unbound = torch.unbind(p_xy_plane, dim=-1) + + e0 = [c1 - c2 for c1, c2 in zip(origin_unbound, p_neg_x_axis_unbound)] + e1 = [c1 - c2 for c1, c2 in zip(p_xy_plane_unbound, origin_unbound)] + + denom = torch.sqrt(sum(c * c for c in e0) + eps * torch.ones_like(e0[0])) + e0 = [c / denom for c in e0] + dot = sum((c1 * c2 for c1, c2 in zip(e0, e1))) + e1 = [c2 - c1 * dot for c1, c2 in zip(e0, e1)] + denom = torch.sqrt(sum(c * c for c in e1) + eps * torch.ones_like(e1[0])) + e1 = [c / denom for c in e1] + e2 = [ + e0[1] * e1[2] - e0[2] * e1[1], + e0[2] * e1[0] - e0[0] * e1[2], + e0[0] * e1[1] - e0[1] * e1[0], + ] + + rots = torch.stack([c for tup in zip(e0, e1, e2) for c in tup], dim=-1) + rots = rots.reshape(rots.shape[:-1] + (3, 3)) + + rot_obj = Rotation(rot_mats=rots, quats=None) + + return Rigid(rot_obj, torch.stack(origin_unbound, dim=-1)) + + def unsqueeze(self, dim: int) -> Rigid: + """ + Analogous to torch.unsqueeze. The dimension is relative to the shared dimensions of the rotation/translation. + + Args: + dim: A positive or negative dimension index. + Returns: + The unsqueezed transformation. 
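The frame constructors and tensor round-trips above can be exercised with a short sketch (same vendored import path; the random points are illustrative):

```python
import torch

from transformers.models.esm.openfold_utils.rigid_utils import Rigid

# Frames from three points via Gram-Schmidt (Algorithm 21); the origin becomes the translation
p_neg_x = torch.randn(4, 3)
origin = torch.randn(4, 3)
p_xy = torch.randn(4, 3)
frames = Rigid.from_3_points(p_neg_x, origin, p_xy)
assert frames.shape == (4,)
assert torch.allclose(frames.get_trans(), origin)

# Round trip through the homogeneous [*, 4, 4] representation
mats = frames.to_tensor_4x4()
same = Rigid.from_tensor_4x4(mats)
pts = torch.randn(4, 3)
assert torch.allclose(frames.apply(pts), same.apply(pts), atol=1e-5)
```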
+ """ + if dim >= len(self.shape): + raise ValueError("Invalid dimension") + rots = self._rots.unsqueeze(dim) + trans = self._trans.unsqueeze(dim if dim >= 0 else dim - 1) + + return Rigid(rots, trans) + + @staticmethod + def cat(ts: Sequence[Rigid], dim: int) -> Rigid: + """ + Concatenates transformations along a new dimension. + + Args: + ts: + A list of T objects + dim: + The dimension along which the transformations should be concatenated + Returns: + A concatenated transformation object + """ + rots = Rotation.cat([t._rots for t in ts], dim) + trans = torch.cat([t._trans for t in ts], dim=dim if dim >= 0 else dim - 1) + + return Rigid(rots, trans) + + def apply_rot_fn(self, fn: Callable[[Rotation], Rotation]) -> Rigid: + """ + Applies a Rotation -> Rotation function to the stored rotation object. + + Args: + fn: A function of type Rotation -> Rotation + Returns: + A transformation object with a transformed rotation. + """ + return Rigid(fn(self._rots), self._trans) + + def apply_trans_fn(self, fn: Callable[[torch.Tensor], torch.Tensor]) -> Rigid: + """ + Applies a Tensor -> Tensor function to the stored translation. + + Args: + fn: + A function of type Tensor -> Tensor to be applied to the translation + Returns: + A transformation object with a transformed translation. + """ + return Rigid(self._rots, fn(self._trans)) + + def scale_translation(self, trans_scale_factor: float) -> Rigid: + """ + Scales the translation by a constant factor. + + Args: + trans_scale_factor: + The constant factor + Returns: + A transformation object with a scaled translation. + """ + return self.apply_trans_fn(lambda t: t * trans_scale_factor) + + def stop_rot_gradient(self) -> Rigid: + """ + Detaches the underlying rotation object + + Returns: + A transformation object with detached rotations + """ + return self.apply_rot_fn(lambda r: r.detach()) + + @staticmethod + def make_transform_from_reference( + n_xyz: torch.Tensor, ca_xyz: torch.Tensor, c_xyz: torch.Tensor, eps: float = 1e-20 + ) -> Rigid: + """ + Returns a transformation object from reference coordinates. + + Note that this method does not take care of symmetries. If you provide the atom positions in the non-standard + way, the N atom will end up not at [-0.527250, 1.359329, 0.0] but instead at [-0.527250, -1.359329, 0.0]. You + need to take care of such cases in your code. + + Args: + n_xyz: A [*, 3] tensor of nitrogen xyz coordinates. + ca_xyz: A [*, 3] tensor of carbon alpha xyz coordinates. + c_xyz: A [*, 3] tensor of carbon xyz coordinates. + Returns: + A transformation object. After applying the translation and rotation to the reference backbone, the + coordinates will approximately equal to the input coordinates. 
+ """ + translation = -1 * ca_xyz + n_xyz = n_xyz + translation + c_xyz = c_xyz + translation + + c_x, c_y, c_z = [c_xyz[..., i] for i in range(3)] + norm = torch.sqrt(eps + c_x**2 + c_y**2) + sin_c1 = -c_y / norm + cos_c1 = c_x / norm + + c1_rots = sin_c1.new_zeros((*sin_c1.shape, 3, 3)) + c1_rots[..., 0, 0] = cos_c1 + c1_rots[..., 0, 1] = -1 * sin_c1 + c1_rots[..., 1, 0] = sin_c1 + c1_rots[..., 1, 1] = cos_c1 + c1_rots[..., 2, 2] = 1 + + norm = torch.sqrt(eps + c_x**2 + c_y**2 + c_z**2) + sin_c2 = c_z / norm + cos_c2 = torch.sqrt(c_x**2 + c_y**2) / norm + + c2_rots = sin_c2.new_zeros((*sin_c2.shape, 3, 3)) + c2_rots[..., 0, 0] = cos_c2 + c2_rots[..., 0, 2] = sin_c2 + c2_rots[..., 1, 1] = 1 + c2_rots[..., 2, 0] = -1 * sin_c2 + c2_rots[..., 2, 2] = cos_c2 + + c_rots = rot_matmul(c2_rots, c1_rots) + n_xyz = rot_vec_mul(c_rots, n_xyz) + + _, n_y, n_z = [n_xyz[..., i] for i in range(3)] + norm = torch.sqrt(eps + n_y**2 + n_z**2) + sin_n = -n_z / norm + cos_n = n_y / norm + + n_rots = sin_c2.new_zeros((*sin_c2.shape, 3, 3)) + n_rots[..., 0, 0] = 1 + n_rots[..., 1, 1] = cos_n + n_rots[..., 1, 2] = -1 * sin_n + n_rots[..., 2, 1] = sin_n + n_rots[..., 2, 2] = cos_n + + rots = rot_matmul(n_rots, c_rots) + + rots = rots.transpose(-1, -2) + translation = -1 * translation + + rot_obj = Rotation(rot_mats=rots, quats=None) + + return Rigid(rot_obj, translation) + + def cuda(self) -> Rigid: + """ + Moves the transformation object to GPU memory + + Returns: + A version of the transformation on GPU + """ + return Rigid(self._rots.cuda(), self._trans.cuda()) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/tensor_utils.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/tensor_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..449c810aed3f51b34a96674f29d5dc8bdfc35103 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/tensor_utils.py @@ -0,0 +1,140 @@ +# Copyright 2021 AlQuraishi Laboratory +# Copyright 2021 DeepMind Technologies Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import partial +from typing import Any, Callable, TypeVar, Union, overload + +import torch +import torch.nn as nn +import torch.types + + +def add(m1: torch.Tensor, m2: torch.Tensor, inplace: bool) -> torch.Tensor: + # The first operation in a checkpoint can't be in-place, but it's + # nice to have in-place addition during inference. Thus... 
+ if not inplace: + m1 = m1 + m2 + else: + m1 += m2 + + return m1 + + +def permute_final_dims(tensor: torch.Tensor, inds: list[int]) -> torch.Tensor: + zero_index = -1 * len(inds) + first_inds = list(range(len(tensor.shape[:zero_index]))) + return tensor.permute(first_inds + [zero_index + i for i in inds]) + + +def flatten_final_dims(t: torch.Tensor, no_dims: int) -> torch.Tensor: + return t.reshape(t.shape[:-no_dims] + (-1,)) + + +def masked_mean(mask: torch.Tensor, value: torch.Tensor, dim: int, eps: float = 1e-4) -> torch.Tensor: + mask = mask.expand(*value.shape) + return torch.sum(mask * value, dim=dim) / (eps + torch.sum(mask, dim=dim)) + + +def pts_to_distogram( + pts: torch.Tensor, min_bin: torch.types.Number = 2.3125, max_bin: torch.types.Number = 21.6875, no_bins: int = 64 +) -> torch.Tensor: + boundaries = torch.linspace(min_bin, max_bin, no_bins - 1, device=pts.device) + dists = torch.sqrt(torch.sum((pts.unsqueeze(-2) - pts.unsqueeze(-3)) ** 2, dim=-1)) + return torch.bucketize(dists, boundaries) + + +def dict_multimap(fn: Callable[[list], Any], dicts: list[dict]) -> dict: + first = dicts[0] + new_dict = {} + for k, v in first.items(): + all_v = [d[k] for d in dicts] + if isinstance(v, dict): + new_dict[k] = dict_multimap(fn, all_v) + else: + new_dict[k] = fn(all_v) + + return new_dict + + +def one_hot(x: torch.Tensor, v_bins: torch.Tensor) -> torch.Tensor: + reshaped_bins = v_bins.view(((1,) * len(x.shape)) + (len(v_bins),)) + diffs = x[..., None] - reshaped_bins + am = torch.argmin(torch.abs(diffs), dim=-1) + return nn.functional.one_hot(am, num_classes=len(v_bins)).float() + + +def batched_gather(data: torch.Tensor, inds: torch.Tensor, dim: int = 0, no_batch_dims: int = 0) -> torch.Tensor: + ranges: list[Union[slice, torch.Tensor]] = [] + for i, s in enumerate(data.shape[:no_batch_dims]): + r = torch.arange(s) + r = r.view(*(*((1,) * i), -1, *((1,) * (len(inds.shape) - i - 1)))) + ranges.append(r) + + remaining_dims: list[Union[slice, torch.Tensor]] = [slice(None) for _ in range(len(data.shape) - no_batch_dims)] + remaining_dims[dim - no_batch_dims if dim >= 0 else dim] = inds + ranges.extend(remaining_dims) + # Matt note: Editing this to get around the behaviour of using a list as an array index changing + # in recent Numpy versions + return data[tuple(ranges)] + + +T = TypeVar("T") + + +# With tree_map, a poor man's JAX tree_map +def dict_map( + fn: Callable[[T], Any], dic: dict[Any, Union[dict, list, tuple, T]], leaf_type: type[T] +) -> dict[Any, Union[dict, list, tuple, Any]]: + new_dict: dict[Any, Union[dict, list, tuple, Any]] = {} + for k, v in dic.items(): + if isinstance(v, dict): + new_dict[k] = dict_map(fn, v, leaf_type) + else: + new_dict[k] = tree_map(fn, v, leaf_type) + + return new_dict + + +@overload +def tree_map(fn: Callable[[T], Any], tree: T, leaf_type: type[T]) -> Any: ... + + +@overload +def tree_map(fn: Callable[[T], Any], tree: dict, leaf_type: type[T]) -> dict: ... + + +@overload +def tree_map(fn: Callable[[T], Any], tree: list, leaf_type: type[T]) -> list: ... + + +@overload +def tree_map(fn: Callable[[T], Any], tree: tuple, leaf_type: type[T]) -> tuple: ... 
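The shape helpers and the "poor man's tree_map" declared here (implementation follows) can be illustrated briefly; the import path is the vendored file location and the values are illustrative.

```python
import torch

from transformers.models.esm.openfold_utils.tensor_utils import (
    flatten_final_dims,
    permute_final_dims,
    tensor_tree_map,
)

x = torch.randn(2, 4, 8, 16)
assert permute_final_dims(x, [2, 0, 1]).shape == (2, 16, 4, 8)  # permutes only the last 3 dims
assert flatten_final_dims(x, 2).shape == (2, 4, 128)            # merges the last 2 dims

# tensor_tree_map applies a fn to every torch.Tensor leaf of a nested dict/list/tuple
batch = {"coords": torch.randn(3, 5, 3), "mask": [torch.ones(3, 5), torch.zeros(3, 5)]}
halved = tensor_tree_map(lambda t: t * 0.5, batch)
assert halved["mask"][1].shape == (3, 5)
```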
+ + +def tree_map(fn, tree, leaf_type): + if isinstance(tree, dict): + return dict_map(fn, tree, leaf_type) + elif isinstance(tree, list): + return [tree_map(fn, x, leaf_type) for x in tree] + elif isinstance(tree, tuple): + return tuple(tree_map(fn, x, leaf_type) for x in tree) + elif isinstance(tree, leaf_type): + return fn(tree) + else: + print(type(tree)) + raise TypeError("Not supported") + + +tensor_tree_map = partial(tree_map, leaf_type=torch.Tensor) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/falcon_h1/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/falcon_h1/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9749c5e1e982cd1ac49db2d560238e4d45907c62 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/falcon_h1/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2025 TII and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_falcon_h1 import * + from .modeling_falcon_h1 import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/falcon_h1/configuration_falcon_h1.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/falcon_h1/configuration_falcon_h1.py new file mode 100644 index 0000000000000000000000000000000000000000..8e9aaaf3405f066d90731a3e84fd41cb3b352876 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/falcon_h1/configuration_falcon_h1.py @@ -0,0 +1,282 @@ +# coding=utf-8 +# Copyright 2025 TII and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""FalconH1 model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class FalconH1Config(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`FalconH1Model`]. It is used to instantiate a + FalconH1Model model according to the specified arguments, defining the model architecture. 
Instantiating a configuration + with defaults taken from [ibm-fms/FalconH1-9.8b-2.2T-hf](https://huggingface.co/ibm-fms/FalconH1-9.8b-2.2T-hf). + The FalconH1Model is a hybrid [mamba2](https://github.com/state-spaces/mamba) architecture with SwiGLU. + The checkpoints are jointly trained by IBM, Princeton, and UIUC. + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + Args: + vocab_size (`int`, *optional*, defaults to 128000): + Vocabulary size of the FalconH1 model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`FalconH1Model`] + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether the model's input and output word embeddings should be tied. Note that this is only relevant if the + model has a output word embedding layer. + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 14336): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer encoder. + num_key_value_heads (`int`, *optional*, defaults to 8): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details, check out [this + paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `8`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + num_logits_to_keep (`int` or `None`, *optional*, defaults to 1): + Number of prompt logits to calculate during generation. If `None`, all logits will be calculated. If an + integer value, only last `num_logits_to_keep` logits will be calculated. Default is 1 because only the + logits of the last prompt token are needed for generation. For long sequences, the logits for the entire + sequence may use a lot of memory so, setting `num_logits_to_keep=1` will reduce memory footprint + significantly. + pad_token_id (`int`, *optional*, defaults to 0): + The id of the padding token. + bos_token_id (`int`, *optional*, defaults to 1): + The id of the "beginning-of-sequence" token. + eos_token_id (`int`, *optional*, defaults to 2): + The id of the "end-of-sequence" token. 
+ max_position_embeddings (`int`, *optional*, defaults to 8192): + Max cached sequence length for the model + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + mamba_d_ssm (`int`, *optional*, defaults to 1024): + The dimension of the SSM state space latents. + mamba_n_heads (`int`, *optional*, defaults to 128): + The number of mamba heads used in the v2 implementation. + mamba_d_head (`int`, *optional*, defaults to `"auto"`): + Head embeddding dimension size + mamba_n_groups (`int`, *optional*, defaults to 1): + The number of the mamba groups used in the v2 implementation. + mamba_d_state (`int`, *optional*, defaults to 256): + The dimension the mamba state space latents + mamba_d_conv (`int`, *optional*, defaults to 4): + The size of the mamba convolution kernel + mamba_expand (`int`, *optional*, defaults to 2): + Expanding factor (relative to hidden_size) used to determine the mamba intermediate size + mamba_chunk_size (`int`, *optional*, defaults to 256): + The chunks in which to break the sequence when doing prefill/training + mamba_conv_bias (`bool`, *optional*, defaults to `True`): + Flag indicating whether or not to use bias in the convolution layer of the mamba mixer block. + mamba_proj_bias (`bool`, *optional*, defaults to `False`): + Flag indicating whether or not to use bias in the input and output projections (["in_proj", "out_proj"]) of the mamba mixer block + mamba_norm_before_gate (`bool`, *optional*, defaults to `True`): + Whether to use RMSNorm before the gate in the Mamba block + mamba_rms_norm (`bool`, *optional*, defaults to `False`): + Whether to use RMSNorm instead of LayerNorm in the Mamba block + projectors_bias (`bool`, *optional*, defaults to `False`): + Flag indicating whether or not to use bias in the input and output projections (["in_proj", "out_proj"]) of the attention block + rope_theta (`float`, *optional*, defaults to 100000.0): + The theta value used for the RoPE embeddings. + rope_scaling (`float`, *optional*): + The scaling value used for the RoPE embeddings. If `None`, no scaling is applied. + lm_head_multiplier (`float`, *optional*, defaults to 1.0): + The multiplier for the LM head. This is used to scale the output of the LM head. + embedding_multiplier (`float`, *optional*, defaults to 1.0): + The multiplier for the embedding layer. This is used to scale the output of the embedding layer. + mlp_multipliers (`list[float]`, *optional*): + The multipliers for the MLP layers. This is used to scale the output of the MLP layers. The first value is + the multiplier of gate layer, the second value is the multiplier of the down_proj layer. + key_multiplier (`float`, *optional*): + The multiplier for the key layer. This is used to scale the output of the key layer. + attention_out_multiplier (`float`, *optional*): + The multiplier for the attention output layer. This is used to scale the output of the attention output + attention_in_multiplier (`float`, *optional*): + The multiplier for the attention input layer. This is used to scale the output of the attention input layer. + ssm_multipliers (`list[float]`, *optional*): + The multipliers for the SSM layers. This is used to scale the output of the SSM layers. + ssm_in_multiplier (`float`, *optional*): + The multiplier for the SSM input layer. This is used to scale the output of the SSM input layer. + ssm_out_multiplier (`float`, *optional*): + The multiplier for the SSM output layer. This is used to scale the output of the SSM output layer. 
+ """ + + model_type = "falcon_h1" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=128000, + tie_word_embeddings=False, + hidden_size=4096, + intermediate_size=14336, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=8, + hidden_act="silu", + initializer_range=0.02, + rms_norm_eps=1e-5, + use_cache=True, + num_logits_to_keep=1, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + max_position_embeddings=8192, + attention_dropout=0.0, + mamba_d_ssm=1024, + mamba_n_heads=128, + mamba_d_head="auto", + mamba_n_groups=1, + mamba_d_state=256, + mamba_d_conv=4, + mamba_expand=2, + mamba_chunk_size=256, + mamba_conv_bias=True, + mamba_proj_bias=False, + mamba_norm_before_gate=True, + mamba_rms_norm=False, + projectors_bias=False, + rope_theta=100000.0, + rope_scaling=None, + lm_head_multiplier=1.0, + embedding_multiplier=1.0, + mlp_multipliers=None, + key_multiplier=None, + attention_out_multiplier=None, + attention_in_multiplier=None, + ssm_multipliers=None, + ssm_in_multiplier=None, + ssm_out_multiplier=None, + **kwargs, + ): + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.max_position_embeddings = max_position_embeddings + self.attention_dropout = attention_dropout + self.attention_bias = False + self.mlp_bias = False + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + + self.use_cache = use_cache + self.num_logits_to_keep = num_logits_to_keep + + self.rope_theta = rope_theta + self.rope_scaling = None + self.rope_scaling = rope_scaling + self.projectors_bias = projectors_bias + mamba_intermediate = mamba_expand * hidden_size if mamba_d_ssm is None else mamba_d_ssm + + if mamba_intermediate % mamba_n_heads != 0: + raise ValueError("mamba_n_heads must divide mamba_expand * hidden_size") + + # for the mamba_v2, must satisfy the following + if mamba_d_head == "auto": + mamba_d_head = mamba_intermediate // mamba_n_heads + + if mamba_d_head * mamba_n_heads != mamba_intermediate: + raise ValueError("The dimensions for the Mamba head state do not match the model intermediate_size") + + self.mamba_d_ssm = mamba_d_ssm + self.mamba_n_heads = mamba_n_heads + self.mamba_d_head = mamba_d_head + self.mamba_n_groups = mamba_n_groups + self.mamba_d_state = mamba_d_state + self.mamba_d_conv = mamba_d_conv + self.mamba_expand = mamba_expand + self.mamba_chunk_size = mamba_chunk_size + self.mamba_conv_bias = mamba_conv_bias + self.mamba_proj_bias = mamba_proj_bias + + self.mamba_norm_before_gate = mamba_norm_before_gate + self.mamba_rms_norm = mamba_rms_norm + + self.lm_head_multiplier = lm_head_multiplier + self.embedding_multiplier = embedding_multiplier + + if mlp_multipliers is not None: + self.mlp_multipliers = mlp_multipliers + else: + self.mlp_multipliers = [1.0, 1.0] + + if attention_out_multiplier is not None: + self.attention_out_multiplier = attention_out_multiplier + else: + self.attention_out_multiplier = 1.0 + + if attention_in_multiplier is not None: + self.attention_in_multiplier = attention_in_multiplier + else: + self.attention_in_multiplier = 1.0 + + if key_multiplier is not None: + self.key_multiplier = key_multiplier + else: + self.key_multiplier = 
1.0 + + if ssm_multipliers is not None: + self.ssm_multipliers = ssm_multipliers + else: + self.ssm_multipliers = [1.0, 1.0, 1.0, 1.0, 1.0] + + if ssm_in_multiplier is not None: + self.ssm_in_multiplier = ssm_in_multiplier + else: + self.ssm_in_multiplier = 1.0 + + if ssm_out_multiplier is not None: + self.ssm_out_multiplier = ssm_out_multiplier + else: + self.ssm_out_multiplier = 1.0 + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + @property + def layers_block_type(self): + return ["attention" for i in range(self.num_hidden_layers)] + + +__all__ = ["FalconH1Config"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/falcon_h1/modeling_falcon_h1.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/falcon_h1/modeling_falcon_h1.py new file mode 100644 index 0000000000000000000000000000000000000000..3a8b13ef21d00e400fa19d4e7cea7ea0ab89e4a3 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/falcon_h1/modeling_falcon_h1.py @@ -0,0 +1,1619 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/falcon_h1/modular_falcon_h1.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_falcon_h1.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2025 Technology Innovation Institute and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
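Before the modeling code, a short sketch of instantiating the configuration class defined above. The values describe a deliberately small, illustrative config rather than any released checkpoint.

```python
from transformers.models.falcon_h1.configuration_falcon_h1 import FalconH1Config

config = FalconH1Config(
    hidden_size=512,
    intermediate_size=2048,
    num_hidden_layers=4,
    num_attention_heads=8,
    num_key_value_heads=2,
    mamba_d_ssm=256,
    mamba_n_heads=16,
)

# "auto" head dim resolves to mamba_d_ssm // mamba_n_heads
assert config.mamba_d_head == 16
# layers_block_type marks every layer as an attention (hybrid) block
assert config.layers_block_type == ["attention"] * 4
```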
+ +from typing import Any, Callable, Optional, Union + +import torch +import torch.nn.functional as F +from torch import nn + +from transformers.activations import ACT2FN + +from ...cache_utils import Cache +from ...generation import GenerationMixin +from ...integrations import use_kernel_forward_from_hub +from ...modeling_attn_mask_utils import AttentionMaskConverter +from ...modeling_flash_attention_utils import FlashAttentionKwargs +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast +from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...processing_utils import Unpack +from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torchdynamo_compiling, logging +from ...utils.deprecation import deprecate_kwarg +from ...utils.import_utils import is_causal_conv1d_available, is_mamba_2_ssm_available +from .configuration_falcon_h1 import FalconH1Config + + +if is_mamba_2_ssm_available(): + from mamba_ssm.ops.triton.selective_state_update import selective_state_update + from mamba_ssm.ops.triton.ssd_combined import mamba_chunk_scan_combined, mamba_split_conv1d_scan_combined +else: + selective_state_update = None + +if is_causal_conv1d_available(): + from causal_conv1d import causal_conv1d_fn, causal_conv1d_update +else: + causal_conv1d_update, causal_conv1d_fn = None, None + + +logger = logging.get_logger(__name__) + + +class FalconHybridMambaAttentionDynamicCache: + """ + A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache + (which has a constant shape regardless of seq_len). + + This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states` + and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors. The expected shape for each tensor + For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`, + while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors). + For mamba layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors), + while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`, + and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`. 
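A minimal sketch of allocating the hybrid cache described above. Note that the constructor indexes `devices` per layer, so one device string per layer is required; the small config is illustrative.

```python
import torch

from transformers.models.falcon_h1.configuration_falcon_h1 import FalconH1Config
from transformers.models.falcon_h1.modeling_falcon_h1 import FalconHybridMambaAttentionDynamicCache

config = FalconH1Config(hidden_size=512, num_hidden_layers=2, mamba_d_ssm=256, mamba_n_heads=16)
cache = FalconHybridMambaAttentionDynamicCache(
    config, batch_size=1, dtype=torch.float32, devices=["cpu"] * config.num_hidden_layers
)

# conv state: (batch, d_ssm + 2 * n_groups * d_state, d_conv); ssm state: (batch, n_heads, d_head, d_state)
assert cache.conv_states[0].shape == (1, 256 + 2 * 1 * 256, 4)
assert cache.ssm_states[0].shape == (1, 16, 16, 256)
assert cache.get_seq_length() == 0  # no attention keys/values cached yet
```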
+ """ + + is_compileable = False + + def __init__( + self, + config: FalconH1Config, + batch_size: int, + dtype: torch.dtype = torch.float16, + devices: Optional[list[str]] = None, + ): + self.seqlen_offset = 0 + self.dtype = dtype + self.has_previous_state = False + self.conv_kernel_size = config.mamba_d_conv + + self.intermediate_size = ( + config.mamba_d_ssm if config.mamba_d_ssm is not None else int(config.mamba_expand * config.hidden_size) + ) + + self.conv_states = { + i: torch.zeros( + batch_size, + self.intermediate_size + 2 * config.mamba_n_groups * config.mamba_d_state, + self.conv_kernel_size, + device=devices[i], + dtype=dtype, + ) + for i in range(config.num_hidden_layers) + } + self.ssm_states = { + i: torch.zeros( + batch_size, + config.mamba_n_heads, + config.mamba_d_head, + config.mamba_d_state, + device=devices[i], + dtype=dtype, + ) + for i in range(config.num_hidden_layers) + } + + self.transformer_layers = [] + for i in range(config.num_hidden_layers): + self.transformer_layers.append(i) + + self.key_cache: list[torch.Tensor] = [] + self.value_cache: list[torch.Tensor] = [] + + def update( + self, + key_states: torch.Tensor, + value_states: torch.Tensor, + layer_idx: int, + cache_kwargs: Optional[dict[str, Any]] = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + """ + Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`. + + Parameters: + key_states (`torch.Tensor`): + The new key states to cache. + value_states (`torch.Tensor`): + The new value states to cache. + layer_idx (`int`): + The index of the layer to cache the states for. + cache_kwargs (`dict[str, Any]`, `optional`): + Additional arguments for the cache subclass. No additional arguments are used in `DynamicCache`. + + Return: + A tuple containing the updated key and value states. + """ + # Update the cache + if len(self.key_cache) <= layer_idx: + # There may be skipped layers, fill them with empty lists + for _ in range(len(self.key_cache), layer_idx): + self.key_cache.append([]) + self.value_cache.append([]) + self.key_cache.append(key_states) + self.value_cache.append(value_states) + elif len(self.key_cache[layer_idx]) == 0: # fills previously skipped layers; checking for tensor causes errors + self.key_cache[layer_idx] = key_states + self.value_cache[layer_idx] = value_states + else: + self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=-2) + self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=-2) + + return self.key_cache[layer_idx], self.value_cache[layer_idx] + + def reorder_cache(self, beam_idx: torch.LongTensor): + """Reorders the cache for beam search, given the selected beam indices.""" + for layer_idx in range(len(self.key_cache)): + device = self.key_cache[layer_idx].device + self.key_cache[layer_idx] = self.key_cache[layer_idx].index_select(0, beam_idx.to(device)) + device = self.value_cache[layer_idx].device + self.value_cache[layer_idx] = self.value_cache[layer_idx].index_select(0, beam_idx.to(device)) + + device = self.conv_states[layer_idx].device + self.conv_states[layer_idx] = self.conv_states[layer_idx].index_select(0, beam_idx.to(device)) + device = self.ssm_states[layer_idx].device + self.ssm_states[layer_idx] = self.ssm_states[layer_idx].index_select(0, beam_idx.to(device)) + + def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: + """Returns the sequence length of the cached states. 
A layer index can be optionally passed.""" + # take any layer that contains cache and not empty tensor + layer_idx = self.transformer_layers[0] if layer_idx not in self.transformer_layers else layer_idx + if len(self.key_cache) <= layer_idx: + return 0 + return self.key_cache[layer_idx].shape[-2] + + def update_conv_state( + self, + layer_idx: int, + new_conv_state: torch.Tensor, + cache_position: torch.LongTensor, + ) -> torch.Tensor: + conv_state = self.conv_states[layer_idx] + cache_position = cache_position.clamp(0, self.conv_kernel_size - 1) + + conv_state = conv_state.roll(shifts=-1, dims=-1) + if len(cache_position) > 1: + conv_state[:, :, :] = new_conv_state.to(conv_state.device) + else: + conv_state[:, :, -1] = new_conv_state[:, :, -1].to(conv_state.device) + self.conv_states[layer_idx].zero_() + self.conv_states[layer_idx] += conv_state + return self.conv_states[layer_idx] + + def reset(self): + self.conv_states.zero_() + self.ssm_states.zero_() + + +class FalconH1RotaryEmbedding(nn.Module): + inv_freq: torch.Tensor # fix linting for `register_buffer` + + def __init__(self, config: FalconH1Config, device=None): + super().__init__() + # BC: "rope_type" was originally "type" + if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): + self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) + else: + self.rope_type = "default" + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + + inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = self.inv_freq + + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) + def forward(self, x, position_ids): + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) + position_ids_expanded = position_ids[:, None, :].float() + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() * self.attention_scaling + sin = emb.sin() * self.attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. 
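A brief sketch of producing and applying the rotary embeddings defined above (`apply_rotary_pos_emb` is the helper documented just below); the config values and tensor shapes are illustrative.

```python
import torch

from transformers.models.falcon_h1.configuration_falcon_h1 import FalconH1Config
from transformers.models.falcon_h1.modeling_falcon_h1 import FalconH1RotaryEmbedding, apply_rotary_pos_emb

config = FalconH1Config(hidden_size=512, num_attention_heads=8, num_key_value_heads=4)
rope = FalconH1RotaryEmbedding(config)

q = torch.randn(1, 8, 10, 64)   # (batch, num_heads, seq_len, head_dim)
k = torch.randn(1, 4, 10, 64)   # fewer key/value heads (GQA)
position_ids = torch.arange(10)[None, :]

cos, sin = rope(q, position_ids)          # each (1, 10, 64)
q_rot, k_rot = apply_rotary_pos_emb(q, k, cos, sin)
assert q_rot.shape == q.shape and k_rot.shape == k.shape
```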
Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + dropout: float = 0.0, + **kwargs: Unpack[TransformersKwargs], +): + key_states = repeat_kv(key, module.num_key_value_groups) + value_states = repeat_kv(value, module.num_key_value_groups) + + attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling + if attention_mask is not None: + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + attn_output = torch.matmul(attn_weights, value_states) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +class FalconH1Attention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: FalconH1Config, layer_idx: int): + super().__init__() + self.config = config + self.layer_idx = layer_idx + self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) + self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads + self.scaling = self.head_dim**-0.5 + self.attention_dropout = config.attention_dropout + self.is_causal = True + + self.q_proj = nn.Linear( + config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias + ) + self.k_proj = nn.Linear( + config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias + ) + self.v_proj = nn.Linear( + config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias + ) + self.o_proj = nn.Linear( + config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias + ) + self.key_multiplier = config.key_multiplier + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + attention_mask: Optional[torch.Tensor], + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = 
None, + **kwargs: Unpack[FlashAttentionKwargs], + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, self.head_dim) + + query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2) + key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2) * self.key_multiplier + value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_values is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=0.0 if not self.training else self.attention_dropout, + scaling=self.scaling, + **kwargs, + ) + + attn_output = attn_output.reshape(*input_shape, -1).contiguous() + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights + + +class FalconH1RMSNormGated(torch.nn.Module): + def __init__(self, hidden_size, eps=1e-6, n_groups=1, norm_before_gate=True): + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + self.n_groups = n_groups + self.norm_before_gate = norm_before_gate + + def forward(self, hidden_states, gate=None): + input_dtype = hidden_states.dtype + + if not self.norm_before_gate and gate is not None: + hidden_states = hidden_states * F.silu(gate.to(torch.float32)) + + if len(hidden_states.shape) == 3: + batch_size, seq_len, dim = hidden_states.shape + else: + batch_size, dim = hidden_states.shape + seq_len = 1 + hidden_states = hidden_states.to(torch.float32) + + hidden_states = hidden_states.view(batch_size, seq_len, self.n_groups, int(dim // self.n_groups)) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + + hidden_states = self.weight.view(self.n_groups, int(dim // self.n_groups)) * hidden_states + hidden_states = hidden_states.view(batch_size, seq_len, dim) + + if seq_len == 1: + hidden_states = hidden_states.squeeze(1) + + if self.norm_before_gate and gate is not None: + hidden_states = hidden_states * F.silu(gate.to(torch.float32)) + return hidden_states.to(input_dtype) + + +# Helper methods for segment sum computation + + +def pad_tensor_by_size(input_tensor: torch.Tensor, pad_size: int): + """ + Padding x tensor with `pad_size` on the seq_len dim (dim=1) + + Assumes that we only have tensors of either size 4 or 3 + """ + pad_shape = (0, 0, 0, 0, 0, pad_size, 0, 0) if len(input_tensor.shape) == 4 else (0, 0, 0, pad_size, 0, 0) + + return torch.nn.functional.pad(input_tensor, pad_shape, mode="constant", value=0) + + +def reshape_into_chunks(input_tensor, pad_size, chunk_size): + """ + Padding input_tensor with `pad_size` on the seq_len dim (dim=1) and + simultaneously splitting it into chunk sequences. + + Assumes that we only have tensors of either size 4 or 3 + """ + # [bsz, seq_len, ...] -> [bsz, seq_len multiple of chunk_size, ...] 
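The `repeat_kv` helper shown earlier implements the expand-and-reshape trick used for grouped-query attention; a quick check against `torch.repeat_interleave` with illustrative shapes:

```python
import torch

from transformers.models.falcon_h1.modeling_falcon_h1 import repeat_kv

# 2 key/value heads serving 8 query heads -> each KV head is repeated n_rep = 4 times
kv = torch.randn(1, 2, 16, 64)  # (batch, num_key_value_heads, seq_len, head_dim)
expanded = repeat_kv(kv, n_rep=4)
assert expanded.shape == (1, 8, 16, 64)
assert torch.equal(expanded, torch.repeat_interleave(kv, repeats=4, dim=1))
```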
+ input_tensor = pad_tensor_by_size(input_tensor, pad_size) + + if len(input_tensor.shape) == 3: + # [bsz, seq_len multiple of chunk_size, num_heads] -> [bsz, -1, chunk_size, num_heads] + return input_tensor.reshape(input_tensor.shape[0], -1, chunk_size, input_tensor.shape[2]) + else: + # [bsz, seq_len multiple of chunk_size, num_heads, head_dim or state_size] -> [bsz, -1, chunk_size, num_heads, head_dim or state_size] + return input_tensor.reshape( + input_tensor.shape[0], -1, chunk_size, input_tensor.shape[2], input_tensor.shape[3] + ) + + +def segment_sum(input_tensor): + """ + More stable segment sum calculation. Uses cumulative sums and masking instead of direct subtractions. + """ + chunk_size = input_tensor.size(-1) + # 1. expand input tensor to have an additional dimension and repeat along that dimension + # [..., chunk_size] -> [..., chunk_size, chunk_size] + input_tensor = input_tensor[..., None].expand(*input_tensor.size(), chunk_size) + # 2. create a lower triangular mask with the diagonal set to 0 to 0 out elements above diag + mask = torch.tril(torch.ones(chunk_size, chunk_size, device=input_tensor.device, dtype=torch.bool), diagonal=-1) + input_tensor = input_tensor.masked_fill(~mask, 0) + # 3. compute actual cumsum + tensor_segsum = torch.cumsum(input_tensor, dim=-2) + + # 4. apply mask to keep only the lower triangular part of the cumulative sum result (incl diagonal this time) + mask = torch.tril(torch.ones(chunk_size, chunk_size, device=input_tensor.device, dtype=torch.bool), diagonal=0) + tensor_segsum = tensor_segsum.masked_fill(~mask, -torch.inf) + return tensor_segsum + + +is_fast_path_available = all((selective_state_update, causal_conv1d_fn, causal_conv1d_update)) + + +def apply_mask_to_padding_states(hidden_states, attention_mask): + """ + Tunes out the hidden states for padding tokens, see https://github.com/state-spaces/mamba/issues/66 + """ + if attention_mask is not None and attention_mask.shape[1] > 1 and attention_mask.shape[0] > 1: + dtype = hidden_states.dtype + hidden_states = (hidden_states * attention_mask[:, :, None]).to(dtype) + + return hidden_states + + +# Adapted from transformers.models.mamba2.modeling_mamba2.Mamba2Mixer +class FalconH1Mixer(nn.Module): + """ + FalconH1Mixer is identical to classic Mamba2 mixer classes but differs on two different things + - Users can pass custom intermediate_size through `config.mamba_d_ssm` + - The use of gated RMS normalization layer is optional + """ + + def __init__(self, config: FalconH1Config, layer_idx: int): + super().__init__() + self.num_heads = config.mamba_n_heads + self.hidden_size = config.hidden_size + self.ssm_state_size = config.mamba_d_state + self.conv_kernel_size = config.mamba_d_conv + self.intermediate_size = ( + int(config.mamba_expand * self.hidden_size) if config.mamba_d_ssm is None else config.mamba_d_ssm + ) + self.layer_idx = layer_idx + self.use_conv_bias = config.mamba_conv_bias + self.activation = config.hidden_act + self.act = ACT2FN[config.hidden_act] + self.use_bias = config.mamba_proj_bias + + self.layer_norm_epsilon = config.rms_norm_eps + self.groups_time_state_size = config.mamba_n_groups * self.ssm_state_size + + self.n_groups = config.mamba_n_groups + self.head_dim = config.mamba_d_head + self.chunk_size = config.mamba_chunk_size + + # FIXME: + self.time_step_limit = (0.0, float("inf")) + self.time_step_min = 0.001 + self.time_step_max = 0.1 + + self.conv_dim = self.intermediate_size + 2 * self.n_groups * self.ssm_state_size + self.conv1d = nn.Conv1d( + 
in_channels=self.conv_dim, + out_channels=self.conv_dim, + bias=config.mamba_conv_bias, + kernel_size=self.conv_kernel_size, + groups=self.conv_dim, + padding=self.conv_kernel_size - 1, + ) + + # projection of the input hidden states + projection_size = self.intermediate_size + self.conv_dim + self.num_heads + self.in_proj = nn.Linear( + self.hidden_size, + projection_size, + bias=self.use_bias, + ) + # selective projection used to make dt, B and C input dependant + + # time step projection (discretization) + # instantiate once and copy inv_dt in init_weights of PretrainedModel + self.dt_bias = nn.Parameter(torch.ones(self.num_heads)) + + # S4D real initialization. These are not discretized! + # The core is to load them, compute the discrete states, then write the updated state. Keeps the memory bounded + A = torch.arange(1, self.num_heads + 1) + self.A_log = nn.Parameter(torch.log(A)) + self.mamba_rms_norm = config.mamba_rms_norm + + if self.mamba_rms_norm: + self.norm = FalconH1RMSNormGated( + self.intermediate_size, + eps=self.layer_norm_epsilon, + n_groups=self.n_groups, + norm_before_gate=config.mamba_norm_before_gate, + ) + self.D = nn.Parameter(torch.ones(self.num_heads)) + + self.out_proj = nn.Linear(self.intermediate_size, config.hidden_size, bias=config.projectors_bias) + + if not is_fast_path_available: + logger.warning_once( + "The fast path is not available because one of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)`" + " is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and" + " https://github.com/Dao-AILab/causal-conv1d" + ) + else: + logger.warning_once("The fast path for FalconH1 will be used when running the model on a GPU") + + self.zxbcdt_multipliers = config.ssm_multipliers + self.ssm_in_multiplier = config.ssm_in_multiplier + + def cuda_kernels_forward( + self, + hidden_states: torch.Tensor, + cache_params: Optional[FalconHybridMambaAttentionDynamicCache] = None, + cache_position: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + ): + # 1. Gated MLP's linear projection + hidden_states = apply_mask_to_padding_states(hidden_states, attention_mask) + # Add Multipliers + hidden_states = hidden_states * self.ssm_in_multiplier + projected_states = self.in_proj(hidden_states) + projected_states = projected_states * self.mup_vector # ADD Mup Multipliers + d_to_remove = 2 * self.intermediate_size + 2 * self.n_groups * self.ssm_state_size + self.num_heads + + # Set up dimensions for reshapes later + batch_size, seq_len, _ = hidden_states.shape + groups_time_state_size = self.n_groups * self.ssm_state_size + + use_precomputed_states = ( + cache_params is not None + and cache_params.has_previous_state + and seq_len == 1 + and cache_params.conv_states[self.layer_idx].shape[0] + == cache_params.ssm_states[self.layer_idx].shape[0] + == batch_size + and cache_position is not None + and cache_position[0] > 0 + ) + + # getting projected states from cache if it exists + if use_precomputed_states: + d_mlp = (projected_states.squeeze(1).shape[-1] - d_to_remove) // 2 + + z0, x0, gate, hidden_states_B_C, dt = projected_states.squeeze(1).split( + [d_mlp, d_mlp, self.intermediate_size, self.conv_dim, self.num_heads], dim=-1 + ) + + # 2. 
Convolution sequence transformation + hidden_states_B_C = causal_conv1d_update( + hidden_states_B_C, + cache_params.conv_states[self.layer_idx], + self.conv1d.weight.squeeze(1), + self.conv1d.bias, + self.activation, + ) + + hidden_states, B, C = torch.split( + hidden_states_B_C, + [self.intermediate_size, groups_time_state_size, groups_time_state_size], + dim=-1, + ) + + # 3. SSM transformation + A = -torch.exp(self.A_log.float()) # (nheads,) + A = A[:, None, ...][:, :, None].expand(-1, self.head_dim, self.ssm_state_size).to(dtype=torch.float32) + dt = dt[:, :, None].expand(-1, -1, self.head_dim) + dt_bias = self.dt_bias[:, None, ...].expand(-1, self.head_dim) + D = self.D[:, None, ...].expand(-1, self.head_dim) + B = B.view(batch_size, self.n_groups, B.shape[1] // self.n_groups) + C = C.view(batch_size, self.n_groups, C.shape[1] // self.n_groups) + hidden_states_reshaped = hidden_states.view(batch_size, self.num_heads, self.head_dim) + hidden_states = selective_state_update( + cache_params.ssm_states[self.layer_idx], + hidden_states_reshaped, + dt, + A, + B, + C, + D, + z=gate.view(batch_size, self.num_heads, self.head_dim) if not self.mamba_rms_norm else None, + dt_bias=dt_bias, + dt_softplus=True, + ) + hidden_states = hidden_states.view(batch_size, self.num_heads * self.head_dim) + + if self.mamba_rms_norm: + hidden_states = self.norm(hidden_states, gate) + + if d_mlp > 0: + hidden_states = torch.cat([F.silu(z0) * x0, hidden_states], dim=-1) + + # 4. Final linear projection + out = self.out_proj(hidden_states[:, None, ...]) + # Fused calculations or step by step if no initialized cache is found + else: + A = -torch.exp(self.A_log.float()) # (num_heads) or (intermediate_size, state_size) + dt_limit_kwargs = {} if self.time_step_limit == (0.0, float("inf")) else {"dt_limit": self.time_step_limit} + + # 2-4. 
Fused kernel for conv1d, SSM, and the final projection + if self.training and cache_params is None: + out = mamba_split_conv1d_scan_combined( + projected_states, + self.conv1d.weight.squeeze(1), + self.conv1d.bias, + self.dt_bias, + A, + D=self.D, + chunk_size=self.chunk_size, + seq_idx=None, # was seq_idx + activation=self.activation, + rmsnorm_weight=self.norm.weight if self.mamba_rms_norm else None, + rmsnorm_eps=self.norm.variance_epsilon if self.mamba_rms_norm else None, + outproj_weight=self.out_proj.weight, + outproj_bias=self.out_proj.bias, + headdim=self.head_dim, + ngroups=self.n_groups, + norm_before_gate=False, + return_final_states=False, + **dt_limit_kwargs, + ) + + else: + d_mlp = ( + projected_states.shape[-1] + - 2 * self.intermediate_size + - 2 * self.n_groups * self.ssm_state_size + - self.num_heads + ) // 2 + if attention_mask is not None: + projected_states = projected_states * attention_mask[..., None] + _, gate, hidden_states_B_C, dt = projected_states.split( + [ + 2 * d_mlp, + self.intermediate_size, + self.conv_dim, + self.num_heads, + ], + dim=-1, + ) + + if cache_params is not None: + conv_states = F.pad( + hidden_states_B_C.permute(0, 2, 1), + (self.conv_kernel_size - hidden_states_B_C.shape[-2], 0), + ) + cache_params.update_conv_state(self.layer_idx, conv_states, cache_position) + + time_step = nn.functional.softplus(dt + self.dt_bias) + # 1D Convolution + if causal_conv1d_fn is None or self.activation not in ["silu", "swish"]: + hidden_states_B_C = self.act( + self.conv1d(hidden_states_B_C.transpose(1, 2)).transpose(1, 2)[:, :seq_len] + ) # (B, L, self.d_inner + 2 * ngroups * d_state) + else: + hidden_states_B_C = causal_conv1d_fn( + x=hidden_states_B_C.transpose(1, 2), + weight=self.conv1d.weight.squeeze(1), + bias=self.conv1d.bias, + activation=self.activation, + ).transpose(1, 2)[:, :seq_len] + + hidden_states, B, C = torch.split( + hidden_states_B_C, + [ + self.intermediate_size, + groups_time_state_size, + groups_time_state_size, + ], + dim=-1, + ) + + if attention_mask is not None and attention_mask.shape[1] > 1 and attention_mask.shape[0] > 1: + # tune out hidden states for pad tokens, see https://github.com/state-spaces/mamba/issues/66 + dtype = hidden_states.dtype + hidden_states = (hidden_states * attention_mask[:, :, None]).to(dtype) + # This is a hack to make sure multi-GPU inference works with HF accelerate + # see: https://github.com/Dao-AILab/flash-attention/issues/523 for more details + with torch.cuda.device(hidden_states.device): + scan_output, ssm_state = mamba_chunk_scan_combined( + hidden_states.view(batch_size, seq_len, -1, self.head_dim), + time_step, + A, + B.view(batch_size, seq_len, self.n_groups, -1), + C.view(batch_size, seq_len, self.n_groups, -1), + chunk_size=self.chunk_size, + D=self.D, + z=None, + seq_idx=None, + return_final_states=True, + **dt_limit_kwargs, + ) + if ssm_state is not None and cache_params is not None: + cache_params.ssm_states[self.layer_idx].copy_(ssm_state) + scan_output = scan_output.view(batch_size, seq_len, -1) + # Multiply "gate" branch and apply extra normalization layer + if self.mamba_rms_norm: + out = self.norm(scan_output, gate) + else: + out = scan_output * torch.nn.functional.silu(gate) + out = self.out_proj(out) + return out + + # fmt: off + def torch_forward( + self, + input_states, + cache_params: Optional[FalconHybridMambaAttentionDynamicCache] = None, + cache_position: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + ): + batch_size, seq_len, _ = 
input_states.shape + dtype = input_states.dtype + + # 1. Gated MLP's linear projection + input_states = apply_mask_to_padding_states(input_states, attention_mask) + # Add Multipliers + input_states = input_states * self.ssm_in_multiplier + projected_states = self.in_proj(input_states) + projected_states = projected_states * self.mup_vector # ADD Mup Multipliers + gate, hidden_states_B_C, dt = projected_states.split([ + self.intermediate_size, self.conv_dim, self.num_heads + ], dim=-1) + + use_precomputed_states = ( + cache_params is not None + and cache_params.has_previous_state + and seq_len == 1 + and cache_params.conv_states[self.layer_idx].shape[0] + == cache_params.ssm_states[self.layer_idx].shape[0] + == batch_size + and cache_position is not None + and cache_position[0] > 0 + ) + + # 2. Convolution sequence transformation + if use_precomputed_states: + cache_params.conv_states[self.layer_idx] = cache_params.conv_states[self.layer_idx].roll(shifts=-1, dims=-1) + cache_params.conv_states[self.layer_idx][:, :, -1] = hidden_states_B_C[:, 0, :].to(cache_params.conv_states[self.layer_idx].device) + + # We need to guarantee that anything regarding the cache is on the same device + conv_states = cache_params.conv_states[self.layer_idx].to(device=self.conv1d.weight.device) + + hidden_states_B_C = torch.sum( + conv_states * self.conv1d.weight.squeeze(1), dim=-1 + ) + if self.use_conv_bias: + hidden_states_B_C = hidden_states_B_C + self.conv1d.bias + hidden_states_B_C = self.act(hidden_states_B_C) + else: + # Init cache + if cache_params is not None: + hidden_states_B_C_transposed = hidden_states_B_C.transpose(1, 2) + conv_states = nn.functional.pad( + hidden_states_B_C_transposed, (self.conv_kernel_size - hidden_states_B_C_transposed.shape[-1], 0) + ) + cache_params.conv_states[self.layer_idx].copy_(conv_states) + + hidden_states_B_C = self.act(self.conv1d(hidden_states_B_C.transpose(1, 2))[..., :seq_len].transpose(1, 2)) + + hidden_states_B_C = apply_mask_to_padding_states(hidden_states_B_C, attention_mask) + hidden_states, B, C = torch.split( + hidden_states_B_C, + [self.intermediate_size, self.n_groups * self.ssm_state_size, self.n_groups * self.ssm_state_size], + dim=-1 + ) + + # 3. SSM transformation + A = -torch.exp(self.A_log.float()) # [num_heads] + if use_precomputed_states: + # We need to guarantee that anything regarding the cache is on the same device + cache_device = cache_params.ssm_states[self.layer_idx].device + + # Note: there is no need to pad parameter matrices here, as there is just one new token + # for batched generation + dt = dt[:, 0, :][:, None, ...] 
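+            # [bsz, 1, num_heads] -> [bsz, num_heads, head_dim]: broadcast the single new token's dt across the head dimension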
+ dt = dt.transpose(1, 2).expand(batch_size, dt.shape[-1], self.head_dim) + # [num_heads] -> [num_heads, head_dim] + dt_bias = self.dt_bias[..., None].expand(self.dt_bias.shape[0], self.head_dim) + + dt = torch.nn.functional.softplus(dt + dt_bias.to(dt.dtype)) + dt = torch.clamp(dt, self.time_step_limit[0], self.time_step_limit[1]) + A = A[..., None, None].expand(self.num_heads, self.head_dim, self.ssm_state_size).to(dtype=torch.float32) + # [bsz, num_heads, head_dim, state_size] + dA = (torch.exp(dt[..., None] * A)).to(device=cache_device) + + # Discretize B + # [bsz, n_groups * state_size] -> [bsz, n_groups, 1, state_size] -> + # -> [bsz, n_groups, group to head repetition factor, state_size] -> [bsz, num_heads, state_size] + B = B.reshape(batch_size, self.n_groups, -1)[..., None, :] + B = B.expand(batch_size, self.n_groups, self.num_heads // self.n_groups, B.shape[-1]).contiguous() + B = B.reshape(batch_size, -1, B.shape[-1]) + # [bsz, num_heads, head_dim, state_size] + dB = dt[..., None] * B[..., None, :] + + # Discretize x into dB + # [bsz, intermediate_size] -> [bsz, num_heads, head_dim] + hidden_states = hidden_states.reshape(batch_size, -1, self.head_dim) + dBx = (dB * hidden_states[..., None]).to(device=cache_device) + + # State calculation + cache_params.ssm_states[self.layer_idx].copy_( + cache_params.ssm_states[self.layer_idx] * dA + dBx + ) + + # Subsequent output + # [bsz, n_groups * state_size] -> [bsz, num_heads, state_size] + C = C.reshape(batch_size, self.n_groups, -1)[..., None, :] + C = C.expand(batch_size, self.n_groups, self.num_heads // self.n_groups, C.shape[-1]).contiguous() + C = C.reshape(batch_size, -1, C.shape[-1]) + # [bsz, num_heads, head_dim] + + ssm_states = cache_params.ssm_states[self.layer_idx].to(device=C.device, dtype=C.dtype) # Shape: [b, h, d, n] + # Reshape ssm_states to merge the first two dimensions + ssm_states_reshaped = ssm_states.view(batch_size * self.num_heads, self.head_dim, self.ssm_state_size) # Shape: [b*h, d, n] + C_reshaped = C.view(batch_size * self.num_heads, self.ssm_state_size, 1) # Shape: [b*h, n, 1] + y = torch.bmm(ssm_states_reshaped, C_reshaped) + y = y.view(batch_size, self.num_heads, self.head_dim) + + # D skip connection + # [num_heads] -> [num_heads, head_dim] + D = self.D[..., None].expand(self.D.shape[0], self.head_dim) + y = (y + hidden_states * D).to(y.dtype) + + # [bsz, num_heads, head_dim] -> [bsz, 1, intermediate_size] + y = y.reshape(batch_size, -1)[:, None, ...] 
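+        # Multi-token / no-cache path: the recurrence is evaluated chunk-wise with the naive SSD formulation below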
+ else: + # begin ssd naive implementation without einsums + dt = nn.functional.softplus(dt + self.dt_bias) + dt = torch.clamp(dt, self.time_step_limit[0], self.time_step_limit[1]) + hidden_states = hidden_states.reshape(batch_size, seq_len, -1, self.head_dim).float() + B = B.reshape(batch_size, seq_len, -1, self.ssm_state_size).float() + C = C.reshape(batch_size, seq_len, -1, self.ssm_state_size).float() + B = B.repeat_interleave(self.num_heads // self.n_groups, dim=2, output_size=self.num_heads) + C = C.repeat_interleave(self.num_heads // self.n_groups, dim=2, output_size=self.num_heads) + pad_size = (self.chunk_size - seq_len % self.chunk_size) % self.chunk_size + + D_residual = self.D[..., None] * pad_tensor_by_size(hidden_states, pad_size) + + # Discretize x and A + hidden_states = hidden_states * dt[..., None] + A = A.to(hidden_states.dtype) * dt + + # Rearrange into blocks/chunks + hidden_states, A, B, C = [reshape_into_chunks(t, pad_size, self.chunk_size) for t in (hidden_states, A, B, C)] + + # [bsz, -1, chunk_size, num_heads] -> [bsz, num_heads, -1, chunk_size] + A = A.permute(0, 3, 1, 2) + A_cumsum = torch.cumsum(A, dim=-1) + + # 1. Compute the output for each intra-chunk (diagonal blocks) + # This is the analog of a causal mask + L = torch.exp(segment_sum(A)) + + # Contraction of C and B to get G (attention-weights like) + G_intermediate = C[:, :, :, None, :, :] * B[:, :, None, :, :, :] # shape: (b, c, l, s, h, n) + G = G_intermediate.sum(dim=-1) # shape: (b, c, l, s, h) + + # Compute M, equivalent to applying attention mask to weights + M_intermediate = G[..., None] * L.permute(0, 2, 3, 4, 1)[..., None] + M = M_intermediate.sum(dim=-1) + + # Compute Y_diag (apply to values) + Y_diag = (M[..., None] * hidden_states[:, :, None]).sum(dim=3) + + # 2. Compute the state for each intra-chunk + # (right term of low-rank factorization of off-diagonal blocks; B terms) + decay_states = torch.exp(A_cumsum[:, :, :, -1:] - A_cumsum) + B_decay = B * decay_states.permute(0, -2, -1, 1)[..., None] + states = (B_decay[..., None, :] * hidden_states[..., None]).sum(dim=2) + + # 3. Compute the inter-chunk SSM recurrence; produces correct SSM states at chunk boundaries + # (middle term of factorization of off-diag blocks; A terms) + if use_precomputed_states: + previous_states = cache_params.ssm_states[self.layer_idx][:, None, ...].to(device=states.device) + else: + previous_states = torch.zeros_like(states[:, :1]) + states = torch.cat([previous_states, states], dim=1) + decay_chunk = torch.exp(segment_sum(nn.functional.pad(A_cumsum[:, :, :, -1], (1, 0)))) + decay_chunk = decay_chunk.transpose(1, 3) + new_states = (decay_chunk[..., None, None] * states[:, :, None, ...]).sum(dim=1) + states, ssm_state = new_states[:, :-1], new_states[:, -1] + + # 4. 
Compute state -> output conversion per chunk + # (left term of low-rank factorization of off-diagonal blocks; C terms) + state_decay_out = torch.exp(A_cumsum) + C_times_states = (C[..., None, :] * states[:, :, None, ...]) + state_decay_out_permuted = state_decay_out.permute(0, 2, 3, 1) + Y_off = (C_times_states.sum(-1) * state_decay_out_permuted[..., None]) + + # Add output of intra-chunk and inter-chunk terms (diagonal and off-diagonal blocks) + y = Y_diag + Y_off + # [bsz, -1, self.chunk_size, num_heads, head_dim] -> [bsz, (padded) seq_len, num_heads, head_dim] + y = y.reshape(batch_size, -1, self.num_heads, self.head_dim) + + y = y + D_residual + # Cutting off padded chunks + if pad_size > 0: + y = y[:, :seq_len, :, :] + y = y.reshape(batch_size, seq_len, -1) + + # Init cache + if ssm_state is not None and cache_params is not None: + cache_params.ssm_states[self.layer_idx].copy_(ssm_state) + + if self.mamba_rms_norm: + scan_output = self.norm(y, gate) + else: + scan_output = y * torch.nn.functional.silu(gate) + + # end ssd naive + + # 4. Final linear projection + contextualized_states = self.out_proj(scan_output.to(dtype)) # [batch, seq_len, hidden_size] + return contextualized_states + # fmt: on + + def forward( + self, + hidden_states, + cache_params: Optional[FalconHybridMambaAttentionDynamicCache] = None, + cache_position: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + ): + if is_fast_path_available and "cuda" in self.in_proj.weight.device.type: + return self.cuda_kernels_forward(hidden_states, cache_params, cache_position, attention_mask) + dtype = hidden_states.dtype + if attention_mask is not None and attention_mask.shape[1] > 1 and attention_mask.shape[0] > 1: + # tune out hidden states for pad tokens, see https://github.com/state-spaces/mamba/issues/66 + hidden_states = (hidden_states * attention_mask[:, :, None]).to(dtype) + + return self.torch_forward(hidden_states, cache_params, cache_position, attention_mask) + + +class FalconH1MLP(nn.Module): + def __init__(self, config: FalconH1Config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias) + self.act_fn = ACT2FN[config.hidden_act] + self.gate_multiplier, self.down_multiplier = config.mlp_multipliers + + def forward(self, x): + y = self.up_proj(x) * self.act_fn(self.gate_proj(x) * self.gate_multiplier) + y = self.down_proj(y) * self.down_multiplier + return y + + +@use_kernel_forward_from_hub("RMSNorm") +class FalconH1RMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + FalconH1RMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + def extra_repr(self): + return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" + + +class FalconH1DecoderLayer(GradientCheckpointingLayer): + def __init__(self, config: 
FalconH1Config, layer_idx: int): + super().__init__() + self.feed_forward = FalconH1MLP(config) + + head_dim = config.hidden_size // config.num_attention_heads + self.channels_attn = config.num_attention_heads * head_dim + 2 * config.num_key_value_heads * head_dim + + self.mamba = FalconH1Mixer(config=config, layer_idx=layer_idx) + + self.self_attn = FalconH1Attention(config, layer_idx) + + self.attention_in_multiplier = config.attention_in_multiplier + self.ssm_out_multiplier = config.ssm_out_multiplier + self.attn_out_multiplier = config.attention_out_multiplier + + self.input_layernorm = FalconH1RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.pre_ff_layernorm = FalconH1RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + mamba_attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[FalconHybridMambaAttentionDynamicCache] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + **kwargs, + ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, sequence_length)` where padding elements are indicated by 0. + past_key_values (`FalconHybridMambaAttentionDynamicCache`, *optional*): cached past key and value projection states + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. + position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*): + Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`, + with `head_dim` being the embedding dimension of each attention head. 
+ kwargs (`dict`, *optional*): + Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code + into the model + """ + + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + + mamba_hidden_states = self.mamba( + hidden_states=hidden_states, + cache_params=past_key_values, + cache_position=cache_position, + attention_mask=mamba_attention_mask, + ) + mamba_hidden_states = mamba_hidden_states * self.ssm_out_multiplier + + attention_hidden_states, self_attn_weights = self.self_attn( + hidden_states=hidden_states * self.attention_in_multiplier, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + **kwargs, + ) + attention_hidden_states = attention_hidden_states * self.attn_out_multiplier + + hidden_states = mamba_hidden_states + attention_hidden_states + + # residual connection after attention + hidden_states = residual + hidden_states + + # feed-forward + residual = hidden_states + hidden_states = self.pre_ff_layernorm(hidden_states) + hidden_states = self.feed_forward(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + return outputs + + +@auto_docstring +class FalconH1PreTrainedModel(PreTrainedModel): + config: FalconH1Config + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["FalconH1DecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn = True + _supports_sdpa = True + _is_stateful = True + + def _init_weights(self, module): + std = self.config.initializer_range + for name, param in module.named_parameters(recurse=True): + if not param.requires_grad: + continue + if "layernorm" in name.lower() and "weight" in name: + # LayerNorm weights usually initialized to 1 + param.data.fill_(1.0) + elif "bias" in name: + param.data.zero_() + else: + try: + param.data.normal_(mean=0.0, std=std) + except Exception as e: + print(f"Skipping init for {name} due to error: {e}") + + +def compute_mup_vector(config): + """ + Computes the MuP vector based on model configuration. + + FalconH1 applies different MuP multiplier for each dimension of the hidden states. + The MuP vector is partitioned into chunks, and each chunk is multiplied with its + corresponding projected dimension. 
+ + Args: + config: FalconH1Config object + + Returns: + torch.Tensor: The computed MuP vector + """ + # We'll need some values from the config to compute the vector dimensions + intermediate_size = ( + config.mamba_d_ssm if config.mamba_d_ssm is not None else int(config.mamba_expand * config.hidden_size) + ) + groups_time_state_size = config.mamba_n_groups * config.mamba_d_state + num_heads = config.mamba_n_heads + zxbcdt_multipliers = config.ssm_multipliers + + vector_shape = 2 * intermediate_size + 2 * groups_time_state_size + num_heads + mup_vector = torch.ones(1, 1, vector_shape) + + # Apply multipliers to different sections of the vector + mup_vector[:, :, :intermediate_size] *= zxbcdt_multipliers[0] + mup_vector[:, :, intermediate_size : 2 * intermediate_size] *= zxbcdt_multipliers[1] + mup_vector[:, :, 2 * intermediate_size : 2 * intermediate_size + groups_time_state_size] *= zxbcdt_multipliers[2] + mup_vector[ + :, :, 2 * intermediate_size + groups_time_state_size : 2 * intermediate_size + 2 * groups_time_state_size + ] *= zxbcdt_multipliers[3] + mup_vector[:, :, 2 * intermediate_size + 2 * groups_time_state_size :] *= zxbcdt_multipliers[4] + + return mup_vector + + +@auto_docstring +# Adapted from transformers.models.jamba.modeling_jamba.JambaModel +class FalconH1Model(FalconH1PreTrainedModel): + def __init__(self, config: FalconH1Config): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + decoder_layers = [] + for i in range(config.num_hidden_layers): + decoder_layers.append(FalconH1DecoderLayer(config, layer_idx=i)) + self.layers = nn.ModuleList(decoder_layers) + + self._attn_implementation = config._attn_implementation + self.final_layernorm = FalconH1RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.rotary_emb = FalconH1RotaryEmbedding(config=config) + + self.embedding_multiplier = config.embedding_multiplier + self.lm_head_multiplier = config.lm_head_multiplier + + self.gradient_checkpointing = False + # Compute the MuP vector once and register it for all layers + mup_vector = compute_mup_vector(config) + for layer in self.layers: + layer.mamba.register_buffer("mup_vector", mup_vector, persistent=False) + + # Initialize weights and apply final processing + self.post_init() + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[FalconHybridMambaAttentionDynamicCache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, # NOOP kwargs, for now + ) -> Union[tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if self.gradient_checkpointing and self.training and use_cache: + logger.warning_once( + "`use_cache=True` is 
incompatible with gradient checkpointing. Setting `use_cache=False`." + ) + use_cache = False + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) * self.embedding_multiplier + hidden_states = inputs_embeds + + if use_cache and past_key_values is None: + logger.warning_once( + "FalconH1 requires an initialized `FalconHybridMambaAttentionDynamicCache` to return a cache. None was " + "provided, so no cache will be returned." + ) + + if cache_position is None: + cache_position = torch.arange(hidden_states.shape[1], device=hidden_states.device) + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + causal_mask = self._update_causal_mask( + attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions + ) + mamba_mask = self._update_mamba_mask(attention_mask, cache_position) + + # create position embeddings to be shared across the decoder layers + position_embeddings = self.rotary_emb(hidden_states, position_ids) + + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + + for decoder_layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + layer_outputs = decoder_layer( + hidden_states, + attention_mask=causal_mask, + mamba_attention_mask=mamba_mask, + position_ids=position_ids, + past_key_values=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + if layer_outputs[1] is not None: + # append attentions only of attention layers. Mamba layers return `None` as the attention weights + all_self_attns += (layer_outputs[1],) + + hidden_states = self.final_layernorm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if past_key_values and not past_key_values.has_previous_state: + past_key_values.has_previous_state = True + + next_cache = None if not use_cache else past_key_values + + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + def _update_mamba_mask(self, attention_mask, cache_position): + """ + No need for zeroing states when + 1. Cached forward + 2. Attending to all inputs + """ + mamba_mask = attention_mask + if cache_position[0] > 0 or (attention_mask is not None and torch.all(attention_mask == 1)): + mamba_mask = None + return mamba_mask + + def _update_causal_mask( + self, + attention_mask: torch.Tensor, + input_tensor: torch.Tensor, + cache_position: torch.Tensor, + past_key_values: FalconHybridMambaAttentionDynamicCache, + output_attentions: bool, + ): + if self.config._attn_implementation == "flash_attention_2": + if attention_mask is not None and 0.0 in attention_mask: + return attention_mask + return None + + # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in + # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail + # to infer the attention mask. 
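+        # Number of tokens already present in the attention cache (0 when nothing has been cached yet)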
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + + # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward + if self.config._attn_implementation == "sdpa" and not output_attentions: + if AttentionMaskConverter._ignore_causal_mask_sdpa( + attention_mask, + inputs_embeds=input_tensor, + past_key_values_length=past_seen_tokens, + is_training=self.training, + ): + return None + + dtype = input_tensor.dtype + sequence_length = input_tensor.shape[1] + target_length = ( + attention_mask.shape[-1] + if isinstance(attention_mask, torch.Tensor) + else past_seen_tokens + sequence_length + 1 + ) + + # In case the provided `attention` mask is 2D, we generate a causal mask here (4D). + causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position( + attention_mask, + sequence_length=sequence_length, + target_length=target_length, + dtype=dtype, + cache_position=cache_position, + batch_size=input_tensor.shape[0], + ) + + if ( + self.config._attn_implementation == "sdpa" + and attention_mask is not None + and attention_mask.device.type in ["cuda", "xpu", "npu"] + and not output_attentions + ): + # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when + # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. + # Details: https://github.com/pytorch/pytorch/issues/110213 + min_dtype = torch.finfo(dtype).min + causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) + + return causal_mask + + @staticmethod + def _prepare_4d_causal_attention_mask_with_cache_position( + attention_mask: torch.Tensor, + sequence_length: int, + target_length: int, + dtype: torch.dtype, + cache_position: torch.Tensor, + batch_size: int, + **kwargs, + ): + """ + Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape + `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. + + Args: + attention_mask (`torch.Tensor`): + A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape + `(batch_size, 1, query_length, key_value_length)`. + sequence_length (`int`): + The sequence length being processed. + target_length (`int`): + The target length: when generating with static cache, the mask should be as long as the static cache, + to account for the 0 padding, the part of the cache that is not filled yet. + dtype (`torch.dtype`): + The dtype to use for the 4D attention mask. + cache_position (`torch.Tensor`): + Indices depicting the position of the input sequence tokens in the sequence. + batch_size (`torch.Tensor`): + Batch size. + """ + if attention_mask is not None and attention_mask.dim() == 4: + # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. 
+ causal_mask = attention_mask + else: + min_dtype = torch.finfo(dtype).min + causal_mask = torch.full( + (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device + ) + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + mask_length = attention_mask.shape[-1] + padding_attention_mask = (attention_mask[:, None, None, :] == attention_mask[:, None, :, None])[ + :, :, -sequence_length:, : + ].to(dtype) + padding_mask = causal_mask[:, :, :, :mask_length] + padding_attention_mask + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + + return causal_mask + + +@auto_docstring +class FalconH1ForCausalLM(FalconH1PreTrainedModel, GenerationMixin): + _tied_weights_keys = ["lm_head.weight"] + _tp_plan = {"lm_head": "colwise_rep"} + _pp_plan = {"lm_head": (["hidden_states"], ["logits"])} + + def __init__(self, config): + super().__init__(config) + self.model = FalconH1Model(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[FalconHybridMambaAttentionDynamicCache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **kwargs, + ) -> Union[tuple, CausalLMOutputWithPast]: + r""" + Example: + + ```python + >>> from transformers import AutoTokenizer, FalconH1ForCausalLM + + >>> model = FalconH1ForCausalLM.from_pretrained("...") + >>> tokenizer = AutoTokenizer.from_pretrained("...") + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." 
+ ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs[0] + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) * self.model.lm_head_multiplier + + loss = None + if labels is not None: + loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs) + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + attention_mask=None, + inputs_embeds=None, + cache_position=None, + position_ids=None, + use_cache=True, + **kwargs, + ): + # Overwritten -- has a unique cache type, `FalconHybridMambaAttentionDynamicCache` + + empty_past_kv = past_key_values is None + + # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens + # Exception 1: when passing input_embeds, input_ids may be missing entries + # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here + # Exception 3: with synced GPUs cache_position may go out of bounds, but we only want dummy token in that case. 
+ # (we can't check exception 3 while compiling) + if not empty_past_kv: + if ( + inputs_embeds is not None # Exception 1 + or (is_torchdynamo_compiling() or cache_position[-1] >= input_ids.shape[1]) # Exception 3 + ): + input_ids = input_ids[:, -cache_position.shape[0] :] + elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2) + input_ids = input_ids[:, cache_position] + else: + past_key_values = FalconHybridMambaAttentionDynamicCache( + self.config, + input_ids.shape[0], + self.dtype, + devices=[ + self.model.layers[i].mamba.conv1d.weight.device for i in range(self.config.num_hidden_layers) + ], + ) + + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if not empty_past_kv: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and empty_past_kv: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids.contiguous()} # `contiguous()` needed for compilation use cases + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": use_cache, + "attention_mask": attention_mask, + "logits_to_keep": self.config.num_logits_to_keep, + "cache_position": cache_position, + } + ) + + # Forward ALL kwargs that are uninitialized (e.g. `use_cache`). + for key, value in kwargs.items(): + if key not in model_inputs: + model_inputs[key] = value + + return model_inputs + + +__all__ = ["FalconH1Model", "FalconH1ForCausalLM", "FalconH1PreTrainedModel"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/falcon_h1/modular_falcon_h1.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/falcon_h1/modular_falcon_h1.py new file mode 100644 index 0000000000000000000000000000000000000000..fe716dded4b35dad154084d3ca7afc632bf8e996 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/falcon_h1/modular_falcon_h1.py @@ -0,0 +1,1383 @@ +# coding=utf-8 +# Copyright 2025 Technology Innovation Institute and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
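+# NOTE: following the transformers modular convention, this file is the reference implementation from which
+# modeling_falcon_h1.py is generated, which is why the two files largely mirror each other.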
+"""PyTorch FalconH1 model.""" + +from typing import Any, Callable, Optional, Union + +import torch +import torch.nn.functional as F +from torch import nn + +from transformers.activations import ACT2FN +from transformers.models.jamba.modeling_jamba import HybridMambaAttentionDynamicCache +from transformers.models.llama.modeling_llama import ( + LlamaAttention, + LlamaForCausalLM, + LlamaMLP, + LlamaRMSNorm, + LlamaRotaryEmbedding, + apply_rotary_pos_emb, + eager_attention_forward, +) +from transformers.models.mamba2.modeling_mamba2 import ( + MambaRMSNormGated, + pad_tensor_by_size, + reshape_into_chunks, + segment_sum, +) + +from ...cache_utils import Cache +from ...modeling_attn_mask_utils import AttentionMaskConverter +from ...modeling_flash_attention_utils import FlashAttentionKwargs +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...processing_utils import Unpack +from ...utils import auto_docstring, can_return_tuple, is_torchdynamo_compiling, logging +from ...utils.deprecation import deprecate_kwarg +from ...utils.import_utils import is_causal_conv1d_available, is_mamba_2_ssm_available +from .configuration_falcon_h1 import FalconH1Config + + +if is_mamba_2_ssm_available(): + from mamba_ssm.ops.triton.selective_state_update import selective_state_update + from mamba_ssm.ops.triton.ssd_combined import mamba_chunk_scan_combined, mamba_split_conv1d_scan_combined +else: + selective_state_update = None + +if is_causal_conv1d_available(): + from causal_conv1d import causal_conv1d_fn, causal_conv1d_update +else: + causal_conv1d_update, causal_conv1d_fn = None, None + +is_fast_path_available = all((selective_state_update, causal_conv1d_fn, causal_conv1d_update)) + + +logger = logging.get_logger(__name__) + + +class FalconHybridMambaAttentionDynamicCache(HybridMambaAttentionDynamicCache): + """ + A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache + (which has a constant shape regardless of seq_len). + + This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states` + and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors. The expected shape for each tensor + For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`, + while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors). + For mamba layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors), + while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`, + and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`. 
+ """ + + def __init__( + self, + config: FalconH1Config, + batch_size: int, + dtype: torch.dtype = torch.float16, + devices: Optional[list[str]] = None, + ): + self.seqlen_offset = 0 + self.dtype = dtype + self.has_previous_state = False + self.conv_kernel_size = config.mamba_d_conv + + self.intermediate_size = ( + config.mamba_d_ssm if config.mamba_d_ssm is not None else int(config.mamba_expand * config.hidden_size) + ) + + self.conv_states = { + i: torch.zeros( + batch_size, + self.intermediate_size + 2 * config.mamba_n_groups * config.mamba_d_state, + self.conv_kernel_size, + device=devices[i], + dtype=dtype, + ) + for i in range(config.num_hidden_layers) + } + self.ssm_states = { + i: torch.zeros( + batch_size, + config.mamba_n_heads, + config.mamba_d_head, + config.mamba_d_state, + device=devices[i], + dtype=dtype, + ) + for i in range(config.num_hidden_layers) + } + + self.transformer_layers = [] + for i in range(config.num_hidden_layers): + self.transformer_layers.append(i) + + self.key_cache: list[torch.Tensor] = [] + self.value_cache: list[torch.Tensor] = [] + + def update( + self, + key_states: torch.Tensor, + value_states: torch.Tensor, + layer_idx: int, + cache_kwargs: Optional[dict[str, Any]] = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + """ + Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`. + + Parameters: + key_states (`torch.Tensor`): + The new key states to cache. + value_states (`torch.Tensor`): + The new value states to cache. + layer_idx (`int`): + The index of the layer to cache the states for. + cache_kwargs (`dict[str, Any]`, `optional`): + Additional arguments for the cache subclass. No additional arguments are used in `DynamicCache`. + + Return: + A tuple containing the updated key and value states. 
+ """ + # Update the cache + if len(self.key_cache) <= layer_idx: + # There may be skipped layers, fill them with empty lists + for _ in range(len(self.key_cache), layer_idx): + self.key_cache.append([]) + self.value_cache.append([]) + self.key_cache.append(key_states) + self.value_cache.append(value_states) + elif len(self.key_cache[layer_idx]) == 0: # fills previously skipped layers; checking for tensor causes errors + self.key_cache[layer_idx] = key_states + self.value_cache[layer_idx] = value_states + else: + self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=-2) + self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=-2) + + return self.key_cache[layer_idx], self.value_cache[layer_idx] + + def update_conv_state( + self, + layer_idx: int, + new_conv_state: torch.Tensor, + cache_position: torch.LongTensor, + ) -> torch.Tensor: + conv_state = self.conv_states[layer_idx] + cache_position = cache_position.clamp(0, self.conv_kernel_size - 1) + + conv_state = conv_state.roll(shifts=-1, dims=-1) + if len(cache_position) > 1: + conv_state[:, :, :] = new_conv_state.to(conv_state.device) + else: + conv_state[:, :, -1] = new_conv_state[:, :, -1].to(conv_state.device) + self.conv_states[layer_idx].zero_() + self.conv_states[layer_idx] += conv_state + return self.conv_states[layer_idx] + + def reset(self): + self.conv_states.zero_() + self.ssm_states.zero_() + + +class FalconH1RotaryEmbedding(LlamaRotaryEmbedding): + pass + + +class FalconH1Attention(LlamaAttention): + def __init__(self, config: FalconH1Config, layer_idx: int): + super().__init__(config, layer_idx) + self.key_multiplier = config.key_multiplier + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + attention_mask: Optional[torch.Tensor], + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[FlashAttentionKwargs], + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, self.head_dim) + + query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2) + key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2) * self.key_multiplier + value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_values is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=0.0 if not self.training else self.attention_dropout, + scaling=self.scaling, + **kwargs, + ) + + attn_output = attn_output.reshape(*input_shape, -1).contiguous() + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights + + +class FalconH1RMSNormGated(MambaRMSNormGated): + def __init__(self, hidden_size, 
eps=1e-6, n_groups=1, norm_before_gate=True): + super().__init__(hidden_size=hidden_size, eps=eps) + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + self.n_groups = n_groups + self.norm_before_gate = norm_before_gate + + def forward(self, hidden_states, gate=None): + input_dtype = hidden_states.dtype + + if not self.norm_before_gate and gate is not None: + hidden_states = hidden_states * F.silu(gate.to(torch.float32)) + + if len(hidden_states.shape) == 3: + batch_size, seq_len, dim = hidden_states.shape + else: + batch_size, dim = hidden_states.shape + seq_len = 1 + hidden_states = hidden_states.to(torch.float32) + + hidden_states = hidden_states.view(batch_size, seq_len, self.n_groups, int(dim // self.n_groups)) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + + hidden_states = self.weight.view(self.n_groups, int(dim // self.n_groups)) * hidden_states + hidden_states = hidden_states.view(batch_size, seq_len, dim) + + if seq_len == 1: + hidden_states = hidden_states.squeeze(1) + + if self.norm_before_gate and gate is not None: + hidden_states = hidden_states * F.silu(gate.to(torch.float32)) + return hidden_states.to(input_dtype) + + +def apply_mask_to_padding_states(hidden_states, attention_mask): + """ + Tunes out the hidden states for padding tokens, see https://github.com/state-spaces/mamba/issues/66 + """ + if attention_mask is not None and attention_mask.shape[1] > 1 and attention_mask.shape[0] > 1: + dtype = hidden_states.dtype + hidden_states = (hidden_states * attention_mask[:, :, None]).to(dtype) + + return hidden_states + + +# Adapted from transformers.models.mamba2.modeling_mamba2.Mamba2Mixer +class FalconH1Mixer(nn.Module): + """ + FalconH1Mixer is identical to classic Mamba2 mixer classes but differs on two different things + - Users can pass custom intermediate_size through `config.mamba_d_ssm` + - The use of gated RMS normalization layer is optional + """ + + def __init__(self, config: FalconH1Config, layer_idx: int): + super().__init__() + self.num_heads = config.mamba_n_heads + self.hidden_size = config.hidden_size + self.ssm_state_size = config.mamba_d_state + self.conv_kernel_size = config.mamba_d_conv + self.intermediate_size = ( + int(config.mamba_expand * self.hidden_size) if config.mamba_d_ssm is None else config.mamba_d_ssm + ) + self.layer_idx = layer_idx + self.use_conv_bias = config.mamba_conv_bias + self.activation = config.hidden_act + self.act = ACT2FN[config.hidden_act] + self.use_bias = config.mamba_proj_bias + + self.layer_norm_epsilon = config.rms_norm_eps + self.groups_time_state_size = config.mamba_n_groups * self.ssm_state_size + + self.n_groups = config.mamba_n_groups + self.head_dim = config.mamba_d_head + self.chunk_size = config.mamba_chunk_size + + # FIXME: + self.time_step_limit = (0.0, float("inf")) + self.time_step_min = 0.001 + self.time_step_max = 0.1 + + self.conv_dim = self.intermediate_size + 2 * self.n_groups * self.ssm_state_size + self.conv1d = nn.Conv1d( + in_channels=self.conv_dim, + out_channels=self.conv_dim, + bias=config.mamba_conv_bias, + kernel_size=self.conv_kernel_size, + groups=self.conv_dim, + padding=self.conv_kernel_size - 1, + ) + + # projection of the input hidden states + projection_size = self.intermediate_size + self.conv_dim + self.num_heads + self.in_proj = nn.Linear( + self.hidden_size, + projection_size, + bias=self.use_bias, + ) + # selective projection used to make dt, B and C input 
dependant + + # time step projection (discretization) + # instantiate once and copy inv_dt in init_weights of PretrainedModel + self.dt_bias = nn.Parameter(torch.ones(self.num_heads)) + + # S4D real initialization. These are not discretized! + # The core is to load them, compute the discrete states, then write the updated state. Keeps the memory bounded + A = torch.arange(1, self.num_heads + 1) + self.A_log = nn.Parameter(torch.log(A)) + self.mamba_rms_norm = config.mamba_rms_norm + + if self.mamba_rms_norm: + self.norm = FalconH1RMSNormGated( + self.intermediate_size, + eps=self.layer_norm_epsilon, + n_groups=self.n_groups, + norm_before_gate=config.mamba_norm_before_gate, + ) + self.D = nn.Parameter(torch.ones(self.num_heads)) + + self.out_proj = nn.Linear(self.intermediate_size, config.hidden_size, bias=config.projectors_bias) + + if not is_fast_path_available: + logger.warning_once( + "The fast path is not available because one of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)`" + " is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and" + " https://github.com/Dao-AILab/causal-conv1d" + ) + else: + logger.warning_once("The fast path for FalconH1 will be used when running the model on a GPU") + + self.zxbcdt_multipliers = config.ssm_multipliers + self.ssm_in_multiplier = config.ssm_in_multiplier + + def cuda_kernels_forward( + self, + hidden_states: torch.Tensor, + cache_params: Optional[FalconHybridMambaAttentionDynamicCache] = None, + cache_position: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + ): + # 1. Gated MLP's linear projection + hidden_states = apply_mask_to_padding_states(hidden_states, attention_mask) + # Add Multipliers + hidden_states = hidden_states * self.ssm_in_multiplier + projected_states = self.in_proj(hidden_states) + projected_states = projected_states * self.mup_vector # ADD Mup Multipliers + d_to_remove = 2 * self.intermediate_size + 2 * self.n_groups * self.ssm_state_size + self.num_heads + + # Set up dimensions for reshapes later + batch_size, seq_len, _ = hidden_states.shape + groups_time_state_size = self.n_groups * self.ssm_state_size + + use_precomputed_states = ( + cache_params is not None + and cache_params.has_previous_state + and seq_len == 1 + and cache_params.conv_states[self.layer_idx].shape[0] + == cache_params.ssm_states[self.layer_idx].shape[0] + == batch_size + and cache_position is not None + and cache_position[0] > 0 + ) + + # getting projected states from cache if it exists + if use_precomputed_states: + d_mlp = (projected_states.squeeze(1).shape[-1] - d_to_remove) // 2 + + z0, x0, gate, hidden_states_B_C, dt = projected_states.squeeze(1).split( + [d_mlp, d_mlp, self.intermediate_size, self.conv_dim, self.num_heads], dim=-1 + ) + + # 2. Convolution sequence transformation + hidden_states_B_C = causal_conv1d_update( + hidden_states_B_C, + cache_params.conv_states[self.layer_idx], + self.conv1d.weight.squeeze(1), + self.conv1d.bias, + self.activation, + ) + + hidden_states, B, C = torch.split( + hidden_states_B_C, + [self.intermediate_size, groups_time_state_size, groups_time_state_size], + dim=-1, + ) + + # 3. 
SSM transformation + A = -torch.exp(self.A_log.float()) # (nheads,) + A = A[:, None, ...][:, :, None].expand(-1, self.head_dim, self.ssm_state_size).to(dtype=torch.float32) + dt = dt[:, :, None].expand(-1, -1, self.head_dim) + dt_bias = self.dt_bias[:, None, ...].expand(-1, self.head_dim) + D = self.D[:, None, ...].expand(-1, self.head_dim) + B = B.view(batch_size, self.n_groups, B.shape[1] // self.n_groups) + C = C.view(batch_size, self.n_groups, C.shape[1] // self.n_groups) + hidden_states_reshaped = hidden_states.view(batch_size, self.num_heads, self.head_dim) + hidden_states = selective_state_update( + cache_params.ssm_states[self.layer_idx], + hidden_states_reshaped, + dt, + A, + B, + C, + D, + z=gate.view(batch_size, self.num_heads, self.head_dim) if not self.mamba_rms_norm else None, + dt_bias=dt_bias, + dt_softplus=True, + ) + hidden_states = hidden_states.view(batch_size, self.num_heads * self.head_dim) + + if self.mamba_rms_norm: + hidden_states = self.norm(hidden_states, gate) + + if d_mlp > 0: + hidden_states = torch.cat([F.silu(z0) * x0, hidden_states], dim=-1) + + # 4. Final linear projection + out = self.out_proj(hidden_states[:, None, ...]) + # Fused calculations or step by step if no initialized cache is found + else: + A = -torch.exp(self.A_log.float()) # (num_heads) or (intermediate_size, state_size) + dt_limit_kwargs = {} if self.time_step_limit == (0.0, float("inf")) else {"dt_limit": self.time_step_limit} + + # 2-4. Fused kernel for conv1d, SSM, and the final projection + if self.training and cache_params is None: + out = mamba_split_conv1d_scan_combined( + projected_states, + self.conv1d.weight.squeeze(1), + self.conv1d.bias, + self.dt_bias, + A, + D=self.D, + chunk_size=self.chunk_size, + seq_idx=None, # was seq_idx + activation=self.activation, + rmsnorm_weight=self.norm.weight if self.mamba_rms_norm else None, + rmsnorm_eps=self.norm.variance_epsilon if self.mamba_rms_norm else None, + outproj_weight=self.out_proj.weight, + outproj_bias=self.out_proj.bias, + headdim=self.head_dim, + ngroups=self.n_groups, + norm_before_gate=False, + return_final_states=False, + **dt_limit_kwargs, + ) + + else: + d_mlp = ( + projected_states.shape[-1] + - 2 * self.intermediate_size + - 2 * self.n_groups * self.ssm_state_size + - self.num_heads + ) // 2 + if attention_mask is not None: + projected_states = projected_states * attention_mask[..., None] + _, gate, hidden_states_B_C, dt = projected_states.split( + [ + 2 * d_mlp, + self.intermediate_size, + self.conv_dim, + self.num_heads, + ], + dim=-1, + ) + + if cache_params is not None: + conv_states = F.pad( + hidden_states_B_C.permute(0, 2, 1), + (self.conv_kernel_size - hidden_states_B_C.shape[-2], 0), + ) + cache_params.update_conv_state(self.layer_idx, conv_states, cache_position) + + time_step = nn.functional.softplus(dt + self.dt_bias) + # 1D Convolution + if causal_conv1d_fn is None or self.activation not in ["silu", "swish"]: + hidden_states_B_C = self.act( + self.conv1d(hidden_states_B_C.transpose(1, 2)).transpose(1, 2)[:, :seq_len] + ) # (B, L, self.d_inner + 2 * ngroups * d_state) + else: + hidden_states_B_C = causal_conv1d_fn( + x=hidden_states_B_C.transpose(1, 2), + weight=self.conv1d.weight.squeeze(1), + bias=self.conv1d.bias, + activation=self.activation, + ).transpose(1, 2)[:, :seq_len] + + hidden_states, B, C = torch.split( + hidden_states_B_C, + [ + self.intermediate_size, + groups_time_state_size, + groups_time_state_size, + ], + dim=-1, + ) + + if attention_mask is not None and attention_mask.shape[1] > 1 and 
attention_mask.shape[0] > 1: + # tune out hidden states for pad tokens, see https://github.com/state-spaces/mamba/issues/66 + dtype = hidden_states.dtype + hidden_states = (hidden_states * attention_mask[:, :, None]).to(dtype) + # This is a hack to make sure multi-GPU inference works with HF accelerate + # see: https://github.com/Dao-AILab/flash-attention/issues/523 for more details + with torch.cuda.device(hidden_states.device): + scan_output, ssm_state = mamba_chunk_scan_combined( + hidden_states.view(batch_size, seq_len, -1, self.head_dim), + time_step, + A, + B.view(batch_size, seq_len, self.n_groups, -1), + C.view(batch_size, seq_len, self.n_groups, -1), + chunk_size=self.chunk_size, + D=self.D, + z=None, + seq_idx=None, + return_final_states=True, + **dt_limit_kwargs, + ) + if ssm_state is not None and cache_params is not None: + cache_params.ssm_states[self.layer_idx].copy_(ssm_state) + scan_output = scan_output.view(batch_size, seq_len, -1) + # Multiply "gate" branch and apply extra normalization layer + if self.mamba_rms_norm: + out = self.norm(scan_output, gate) + else: + out = scan_output * torch.nn.functional.silu(gate) + out = self.out_proj(out) + return out + + # fmt: off + def torch_forward( + self, + input_states, + cache_params: Optional[FalconHybridMambaAttentionDynamicCache] = None, + cache_position: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + ): + batch_size, seq_len, _ = input_states.shape + dtype = input_states.dtype + + # 1. Gated MLP's linear projection + input_states = apply_mask_to_padding_states(input_states, attention_mask) + # Add Multipliers + input_states = input_states * self.ssm_in_multiplier + projected_states = self.in_proj(input_states) + projected_states = projected_states * self.mup_vector # ADD Mup Multipliers + gate, hidden_states_B_C, dt = projected_states.split([ + self.intermediate_size, self.conv_dim, self.num_heads + ], dim=-1) + + use_precomputed_states = ( + cache_params is not None + and cache_params.has_previous_state + and seq_len == 1 + and cache_params.conv_states[self.layer_idx].shape[0] + == cache_params.ssm_states[self.layer_idx].shape[0] + == batch_size + and cache_position is not None + and cache_position[0] > 0 + ) + + # 2. 
Convolution sequence transformation + if use_precomputed_states: + cache_params.conv_states[self.layer_idx] = cache_params.conv_states[self.layer_idx].roll(shifts=-1, dims=-1) + cache_params.conv_states[self.layer_idx][:, :, -1] = hidden_states_B_C[:, 0, :].to(cache_params.conv_states[self.layer_idx].device) + + # We need to guarantee that anything regarding the cache is on the same device + conv_states = cache_params.conv_states[self.layer_idx].to(device=self.conv1d.weight.device) + + hidden_states_B_C = torch.sum( + conv_states * self.conv1d.weight.squeeze(1), dim=-1 + ) + if self.use_conv_bias: + hidden_states_B_C = hidden_states_B_C + self.conv1d.bias + hidden_states_B_C = self.act(hidden_states_B_C) + else: + # Init cache + if cache_params is not None: + hidden_states_B_C_transposed = hidden_states_B_C.transpose(1, 2) + conv_states = nn.functional.pad( + hidden_states_B_C_transposed, (self.conv_kernel_size - hidden_states_B_C_transposed.shape[-1], 0) + ) + cache_params.conv_states[self.layer_idx].copy_(conv_states) + + hidden_states_B_C = self.act(self.conv1d(hidden_states_B_C.transpose(1, 2))[..., :seq_len].transpose(1, 2)) + + hidden_states_B_C = apply_mask_to_padding_states(hidden_states_B_C, attention_mask) + hidden_states, B, C = torch.split( + hidden_states_B_C, + [self.intermediate_size, self.n_groups * self.ssm_state_size, self.n_groups * self.ssm_state_size], + dim=-1 + ) + + # 3. SSM transformation + A = -torch.exp(self.A_log.float()) # [num_heads] + if use_precomputed_states: + # We need to guarantee that anything regarding the cache is on the same device + cache_device = cache_params.ssm_states[self.layer_idx].device + + # Note: there is no need to pad parameter matrices here, as there is just one new token + # for batched generation + dt = dt[:, 0, :][:, None, ...] 
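+            # [bsz, 1, num_heads] -> [bsz, num_heads, head_dim]: broadcast the single new token's dt across the head dimension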
+ dt = dt.transpose(1, 2).expand(batch_size, dt.shape[-1], self.head_dim) + # [num_heads] -> [num_heads, head_dim] + dt_bias = self.dt_bias[..., None].expand(self.dt_bias.shape[0], self.head_dim) + + dt = torch.nn.functional.softplus(dt + dt_bias.to(dt.dtype)) + dt = torch.clamp(dt, self.time_step_limit[0], self.time_step_limit[1]) + A = A[..., None, None].expand(self.num_heads, self.head_dim, self.ssm_state_size).to(dtype=torch.float32) + # [bsz, num_heads, head_dim, state_size] + dA = (torch.exp(dt[..., None] * A)).to(device=cache_device) + + # Discretize B + # [bsz, n_groups * state_size] -> [bsz, n_groups, 1, state_size] -> + # -> [bsz, n_groups, group to head repetition factor, state_size] -> [bsz, num_heads, state_size] + B = B.reshape(batch_size, self.n_groups, -1)[..., None, :] + B = B.expand(batch_size, self.n_groups, self.num_heads // self.n_groups, B.shape[-1]).contiguous() + B = B.reshape(batch_size, -1, B.shape[-1]) + # [bsz, num_heads, head_dim, state_size] + dB = dt[..., None] * B[..., None, :] + + # Discretize x into dB + # [bsz, intermediate_size] -> [bsz, num_heads, head_dim] + hidden_states = hidden_states.reshape(batch_size, -1, self.head_dim) + dBx = (dB * hidden_states[..., None]).to(device=cache_device) + + # State calculation + cache_params.ssm_states[self.layer_idx].copy_( + cache_params.ssm_states[self.layer_idx] * dA + dBx + ) + + # Subsequent output + # [bsz, n_groups * state_size] -> [bsz, num_heads, state_size] + C = C.reshape(batch_size, self.n_groups, -1)[..., None, :] + C = C.expand(batch_size, self.n_groups, self.num_heads // self.n_groups, C.shape[-1]).contiguous() + C = C.reshape(batch_size, -1, C.shape[-1]) + # [bsz, num_heads, head_dim] + + ssm_states = cache_params.ssm_states[self.layer_idx].to(device=C.device, dtype=C.dtype) # Shape: [b, h, d, n] + # Reshape ssm_states to merge the first two dimensions + ssm_states_reshaped = ssm_states.view(batch_size * self.num_heads, self.head_dim, self.ssm_state_size) # Shape: [b*h, d, n] + C_reshaped = C.view(batch_size * self.num_heads, self.ssm_state_size, 1) # Shape: [b*h, n, 1] + y = torch.bmm(ssm_states_reshaped, C_reshaped) + y = y.view(batch_size, self.num_heads, self.head_dim) + + # D skip connection + # [num_heads] -> [num_heads, head_dim] + D = self.D[..., None].expand(self.D.shape[0], self.head_dim) + y = (y + hidden_states * D).to(y.dtype) + + # [bsz, num_heads, head_dim] -> [bsz, 1, intermediate_size] + y = y.reshape(batch_size, -1)[:, None, ...] 
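+ # End of the single-token (cached) update path; the chunked SSD implementation for full sequences follows in the else branch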
+ else: + # begin ssd naive implementation without einsums + dt = nn.functional.softplus(dt + self.dt_bias) + dt = torch.clamp(dt, self.time_step_limit[0], self.time_step_limit[1]) + hidden_states = hidden_states.reshape(batch_size, seq_len, -1, self.head_dim).float() + B = B.reshape(batch_size, seq_len, -1, self.ssm_state_size).float() + C = C.reshape(batch_size, seq_len, -1, self.ssm_state_size).float() + B = B.repeat_interleave(self.num_heads // self.n_groups, dim=2, output_size=self.num_heads) + C = C.repeat_interleave(self.num_heads // self.n_groups, dim=2, output_size=self.num_heads) + pad_size = (self.chunk_size - seq_len % self.chunk_size) % self.chunk_size + + D_residual = self.D[..., None] * pad_tensor_by_size(hidden_states, pad_size) + + # Discretize x and A + hidden_states = hidden_states * dt[..., None] + A = A.to(hidden_states.dtype) * dt + + # Rearrange into blocks/chunks + hidden_states, A, B, C = [reshape_into_chunks(t, pad_size, self.chunk_size) for t in (hidden_states, A, B, C)] + + # [bsz, -1, chunk_size, num_heads] -> [bsz, num_heads, -1, chunk_size] + A = A.permute(0, 3, 1, 2) + A_cumsum = torch.cumsum(A, dim=-1) + + # 1. Compute the output for each intra-chunk (diagonal blocks) + # This is the analog of a causal mask + L = torch.exp(segment_sum(A)) + + # Contraction of C and B to get G (attention-weights like) + G_intermediate = C[:, :, :, None, :, :] * B[:, :, None, :, :, :] # shape: (b, c, l, s, h, n) + G = G_intermediate.sum(dim=-1) # shape: (b, c, l, s, h) + + # Compute M, equivalent to applying attention mask to weights + M_intermediate = G[..., None] * L.permute(0, 2, 3, 4, 1)[..., None] + M = M_intermediate.sum(dim=-1) + + # Compute Y_diag (apply to values) + Y_diag = (M[..., None] * hidden_states[:, :, None]).sum(dim=3) + + # 2. Compute the state for each intra-chunk + # (right term of low-rank factorization of off-diagonal blocks; B terms) + decay_states = torch.exp(A_cumsum[:, :, :, -1:] - A_cumsum) + B_decay = B * decay_states.permute(0, -2, -1, 1)[..., None] + states = (B_decay[..., None, :] * hidden_states[..., None]).sum(dim=2) + + # 3. Compute the inter-chunk SSM recurrence; produces correct SSM states at chunk boundaries + # (middle term of factorization of off-diag blocks; A terms) + if use_precomputed_states: + previous_states = cache_params.ssm_states[self.layer_idx][:, None, ...].to(device=states.device) + else: + previous_states = torch.zeros_like(states[:, :1]) + states = torch.cat([previous_states, states], dim=1) + decay_chunk = torch.exp(segment_sum(nn.functional.pad(A_cumsum[:, :, :, -1], (1, 0)))) + decay_chunk = decay_chunk.transpose(1, 3) + new_states = (decay_chunk[..., None, None] * states[:, :, None, ...]).sum(dim=1) + states, ssm_state = new_states[:, :-1], new_states[:, -1] + + # 4. 
Compute state -> output conversion per chunk + # (left term of low-rank factorization of off-diagonal blocks; C terms) + state_decay_out = torch.exp(A_cumsum) + C_times_states = (C[..., None, :] * states[:, :, None, ...]) + state_decay_out_permuted = state_decay_out.permute(0, 2, 3, 1) + Y_off = (C_times_states.sum(-1) * state_decay_out_permuted[..., None]) + + # Add output of intra-chunk and inter-chunk terms (diagonal and off-diagonal blocks) + y = Y_diag + Y_off + # [bsz, -1, self.chunk_size, num_heads, head_dim] -> [bsz, (padded) seq_len, num_heads, head_dim] + y = y.reshape(batch_size, -1, self.num_heads, self.head_dim) + + y = y + D_residual + # Cutting off padded chunks + if pad_size > 0: + y = y[:, :seq_len, :, :] + y = y.reshape(batch_size, seq_len, -1) + + # Init cache + if ssm_state is not None and cache_params is not None: + cache_params.ssm_states[self.layer_idx].copy_(ssm_state) + + if self.mamba_rms_norm: + scan_output = self.norm(y, gate) + else: + scan_output = y * torch.nn.functional.silu(gate) + + # end ssd naive + + # 4. Final linear projection + contextualized_states = self.out_proj(scan_output.to(dtype)) # [batch, seq_len, hidden_size] + return contextualized_states + # fmt: on + + def forward( + self, + hidden_states, + cache_params: Optional[FalconHybridMambaAttentionDynamicCache] = None, + cache_position: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + ): + if is_fast_path_available and "cuda" in self.in_proj.weight.device.type: + return self.cuda_kernels_forward(hidden_states, cache_params, cache_position, attention_mask) + dtype = hidden_states.dtype + if attention_mask is not None and attention_mask.shape[1] > 1 and attention_mask.shape[0] > 1: + # tune out hidden states for pad tokens, see https://github.com/state-spaces/mamba/issues/66 + hidden_states = (hidden_states * attention_mask[:, :, None]).to(dtype) + + return self.torch_forward(hidden_states, cache_params, cache_position, attention_mask) + + +class FalconH1MLP(LlamaMLP): + def __init__(self, config: FalconH1Config): + super().__init__(config) + self.gate_multiplier, self.down_multiplier = config.mlp_multipliers + + def forward(self, x): + y = self.up_proj(x) * self.act_fn(self.gate_proj(x) * self.gate_multiplier) + y = self.down_proj(y) * self.down_multiplier + return y + + +class FalconH1RMSNorm(LlamaRMSNorm): + pass + + +class FalconH1DecoderLayer(GradientCheckpointingLayer): + def __init__(self, config: FalconH1Config, layer_idx: int): + super().__init__() + self.feed_forward = FalconH1MLP(config) + + head_dim = config.hidden_size // config.num_attention_heads + self.channels_attn = config.num_attention_heads * head_dim + 2 * config.num_key_value_heads * head_dim + + self.mamba = FalconH1Mixer(config=config, layer_idx=layer_idx) + + self.self_attn = FalconH1Attention(config, layer_idx) + + self.attention_in_multiplier = config.attention_in_multiplier + self.ssm_out_multiplier = config.ssm_out_multiplier + self.attn_out_multiplier = config.attention_out_multiplier + + self.input_layernorm = FalconH1RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.pre_ff_layernorm = FalconH1RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + mamba_attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: 
Optional[FalconHybridMambaAttentionDynamicCache] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + **kwargs, + ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, sequence_length)` where padding elements are indicated by 0. + past_key_values (`FalconHybridMambaAttentionDynamicCache`, *optional*): cached past key and value projection states + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. + position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*): + Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`, + with `head_dim` being the embedding dimension of each attention head. + kwargs (`dict`, *optional*): + Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code + into the model + """ + + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + + mamba_hidden_states = self.mamba( + hidden_states=hidden_states, + cache_params=past_key_values, + cache_position=cache_position, + attention_mask=mamba_attention_mask, + ) + mamba_hidden_states = mamba_hidden_states * self.ssm_out_multiplier + + attention_hidden_states, self_attn_weights = self.self_attn( + hidden_states=hidden_states * self.attention_in_multiplier, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + **kwargs, + ) + attention_hidden_states = attention_hidden_states * self.attn_out_multiplier + + hidden_states = mamba_hidden_states + attention_hidden_states + + # residual connection after attention + hidden_states = residual + hidden_states + + # feed-forward + residual = hidden_states + hidden_states = self.pre_ff_layernorm(hidden_states) + hidden_states = self.feed_forward(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + return outputs + + +@auto_docstring +class FalconH1PreTrainedModel(PreTrainedModel): + config: FalconH1Config + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["FalconH1DecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn = True + _supports_sdpa = True + _is_stateful = True + + def _init_weights(self, module): + std = self.config.initializer_range + for name, param in module.named_parameters(recurse=True): + if not param.requires_grad: + continue + if "layernorm" in name.lower() and "weight" in name: + # LayerNorm weights usually initialized to 1 + 
param.data.fill_(1.0) + elif "bias" in name: + param.data.zero_() + else: + try: + param.data.normal_(mean=0.0, std=std) + except Exception as e: + print(f"Skipping init for {name} due to error: {e}") + + +def compute_mup_vector(config): + """ + Computes the MuP vector based on model configuration. + + FalconH1 applies different MuP multiplier for each dimension of the hidden states. + The MuP vector is partitioned into chunks, and each chunk is multiplied with its + corresponding projected dimension. + + Args: + config: FalconH1Config object + + Returns: + torch.Tensor: The computed MuP vector + """ + # We'll need some values from the config to compute the vector dimensions + intermediate_size = ( + config.mamba_d_ssm if config.mamba_d_ssm is not None else int(config.mamba_expand * config.hidden_size) + ) + groups_time_state_size = config.mamba_n_groups * config.mamba_d_state + num_heads = config.mamba_n_heads + zxbcdt_multipliers = config.ssm_multipliers + + vector_shape = 2 * intermediate_size + 2 * groups_time_state_size + num_heads + mup_vector = torch.ones(1, 1, vector_shape) + + # Apply multipliers to different sections of the vector + mup_vector[:, :, :intermediate_size] *= zxbcdt_multipliers[0] + mup_vector[:, :, intermediate_size : 2 * intermediate_size] *= zxbcdt_multipliers[1] + mup_vector[:, :, 2 * intermediate_size : 2 * intermediate_size + groups_time_state_size] *= zxbcdt_multipliers[2] + mup_vector[ + :, :, 2 * intermediate_size + groups_time_state_size : 2 * intermediate_size + 2 * groups_time_state_size + ] *= zxbcdt_multipliers[3] + mup_vector[:, :, 2 * intermediate_size + 2 * groups_time_state_size :] *= zxbcdt_multipliers[4] + + return mup_vector + + +@auto_docstring +# Adapted from transformers.models.jamba.modeling_jamba.JambaModel +class FalconH1Model(FalconH1PreTrainedModel): + def __init__(self, config: FalconH1Config): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + decoder_layers = [] + for i in range(config.num_hidden_layers): + decoder_layers.append(FalconH1DecoderLayer(config, layer_idx=i)) + self.layers = nn.ModuleList(decoder_layers) + + self._attn_implementation = config._attn_implementation + self.final_layernorm = FalconH1RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.rotary_emb = FalconH1RotaryEmbedding(config=config) + + self.embedding_multiplier = config.embedding_multiplier + self.lm_head_multiplier = config.lm_head_multiplier + + self.gradient_checkpointing = False + # Compute the MuP vector once and register it for all layers + mup_vector = compute_mup_vector(config) + for layer in self.layers: + layer.mamba.register_buffer("mup_vector", mup_vector, persistent=False) + + # Initialize weights and apply final processing + self.post_init() + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[FalconHybridMambaAttentionDynamicCache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, # NOOP kwargs, for now + ) -> Union[tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if 
output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if self.gradient_checkpointing and self.training and use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`." + ) + use_cache = False + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) * self.embedding_multiplier + hidden_states = inputs_embeds + + if use_cache and past_key_values is None: + logger.warning_once( + "FalconH1 requires an initialized `FalconHybridMambaAttentionDynamicCache` to return a cache. None was " + "provided, so no cache will be returned." + ) + + if cache_position is None: + cache_position = torch.arange(hidden_states.shape[1], device=hidden_states.device) + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + causal_mask = self._update_causal_mask( + attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions + ) + mamba_mask = self._update_mamba_mask(attention_mask, cache_position) + + # create position embeddings to be shared across the decoder layers + position_embeddings = self.rotary_emb(hidden_states, position_ids) + + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + + for decoder_layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + layer_outputs = decoder_layer( + hidden_states, + attention_mask=causal_mask, + mamba_attention_mask=mamba_mask, + position_ids=position_ids, + past_key_values=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + if layer_outputs[1] is not None: + # append attentions only of attention layers. Mamba layers return `None` as the attention weights + all_self_attns += (layer_outputs[1],) + + hidden_states = self.final_layernorm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if past_key_values and not past_key_values.has_previous_state: + past_key_values.has_previous_state = True + + next_cache = None if not use_cache else past_key_values + + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + def _update_mamba_mask(self, attention_mask, cache_position): + """ + No need for zeroing states when + 1. Cached forward + 2. 
Attending to all inputs + """ + mamba_mask = attention_mask + if cache_position[0] > 0 or (attention_mask is not None and torch.all(attention_mask == 1)): + mamba_mask = None + return mamba_mask + + def _update_causal_mask( + self, + attention_mask: torch.Tensor, + input_tensor: torch.Tensor, + cache_position: torch.Tensor, + past_key_values: FalconHybridMambaAttentionDynamicCache, + output_attentions: bool, + ): + if self.config._attn_implementation == "flash_attention_2": + if attention_mask is not None and 0.0 in attention_mask: + return attention_mask + return None + + # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in + # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail + # to infer the attention mask. + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + + # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward + if self.config._attn_implementation == "sdpa" and not output_attentions: + if AttentionMaskConverter._ignore_causal_mask_sdpa( + attention_mask, + inputs_embeds=input_tensor, + past_key_values_length=past_seen_tokens, + is_training=self.training, + ): + return None + + dtype = input_tensor.dtype + sequence_length = input_tensor.shape[1] + target_length = ( + attention_mask.shape[-1] + if isinstance(attention_mask, torch.Tensor) + else past_seen_tokens + sequence_length + 1 + ) + + # In case the provided `attention` mask is 2D, we generate a causal mask here (4D). + causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position( + attention_mask, + sequence_length=sequence_length, + target_length=target_length, + dtype=dtype, + cache_position=cache_position, + batch_size=input_tensor.shape[0], + ) + + if ( + self.config._attn_implementation == "sdpa" + and attention_mask is not None + and attention_mask.device.type in ["cuda", "xpu", "npu"] + and not output_attentions + ): + # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when + # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. + # Details: https://github.com/pytorch/pytorch/issues/110213 + min_dtype = torch.finfo(dtype).min + causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) + + return causal_mask + + @staticmethod + def _prepare_4d_causal_attention_mask_with_cache_position( + attention_mask: torch.Tensor, + sequence_length: int, + target_length: int, + dtype: torch.dtype, + cache_position: torch.Tensor, + batch_size: int, + **kwargs, + ): + """ + Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape + `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. + + Args: + attention_mask (`torch.Tensor`): + A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape + `(batch_size, 1, query_length, key_value_length)`. + sequence_length (`int`): + The sequence length being processed. + target_length (`int`): + The target length: when generating with static cache, the mask should be as long as the static cache, + to account for the 0 padding, the part of the cache that is not filled yet. + dtype (`torch.dtype`): + The dtype to use for the 4D attention mask. 
+ cache_position (`torch.Tensor`): + Indices depicting the position of the input sequence tokens in the sequence. + batch_size (`torch.Tensor`): + Batch size. + """ + if attention_mask is not None and attention_mask.dim() == 4: + # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. + causal_mask = attention_mask + else: + min_dtype = torch.finfo(dtype).min + causal_mask = torch.full( + (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device + ) + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + mask_length = attention_mask.shape[-1] + padding_attention_mask = (attention_mask[:, None, None, :] == attention_mask[:, None, :, None])[ + :, :, -sequence_length:, : + ].to(dtype) + padding_mask = causal_mask[:, :, :, :mask_length] + padding_attention_mask + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + + return causal_mask + + +class FalconH1ForCausalLM(LlamaForCausalLM): + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[FalconHybridMambaAttentionDynamicCache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **kwargs, + ) -> Union[tuple, CausalLMOutputWithPast]: + r""" + Example: + + ```python + >>> from transformers import AutoTokenizer, FalconH1ForCausalLM + + >>> model = FalconH1ForCausalLM.from_pretrained("...") + >>> tokenizer = AutoTokenizer.from_pretrained("...") + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." 
+ ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs[0] + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) * self.model.lm_head_multiplier + + loss = None + if labels is not None: + loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs) + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + attention_mask=None, + inputs_embeds=None, + cache_position=None, + position_ids=None, + use_cache=True, + **kwargs, + ): + # Overwritten -- has a unique cache type, `FalconHybridMambaAttentionDynamicCache` + + empty_past_kv = past_key_values is None + + # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens + # Exception 1: when passing input_embeds, input_ids may be missing entries + # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here + # Exception 3: with synced GPUs cache_position may go out of bounds, but we only want dummy token in that case. 
+ # (we can't check exception 3 while compiling) + if not empty_past_kv: + if ( + inputs_embeds is not None # Exception 1 + or (is_torchdynamo_compiling() or cache_position[-1] >= input_ids.shape[1]) # Exception 3 + ): + input_ids = input_ids[:, -cache_position.shape[0] :] + elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2) + input_ids = input_ids[:, cache_position] + else: + past_key_values = FalconHybridMambaAttentionDynamicCache( + self.config, + input_ids.shape[0], + self.dtype, + devices=[ + self.model.layers[i].mamba.conv1d.weight.device for i in range(self.config.num_hidden_layers) + ], + ) + + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if not empty_past_kv: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and empty_past_kv: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids.contiguous()} # `contiguous()` needed for compilation use cases + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": use_cache, + "attention_mask": attention_mask, + "logits_to_keep": self.config.num_logits_to_keep, + "cache_position": cache_position, + } + ) + + # Forward ALL kwargs that are uninitialized (e.g. `use_cache`). + for key, value in kwargs.items(): + if key not in model_inputs: + model_inputs[key] = value + + return model_inputs + + +__all__ = ["FalconH1Model", "FalconH1ForCausalLM", "FalconH1PreTrainedModel"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/flaubert/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/flaubert/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e981d9cbcb1e456c206b3bec252df1598e23575a --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/flaubert/__init__.py @@ -0,0 +1,29 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
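+# Note: the submodule imports below are resolved lazily at runtime through `_LazyModule`; the TYPE_CHECKING branch only exists for static type checkers.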
+from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_flaubert import * + from .modeling_flaubert import * + from .modeling_tf_flaubert import * + from .tokenization_flaubert import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/flaubert/configuration_flaubert.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/flaubert/configuration_flaubert.py new file mode 100644 index 0000000000000000000000000000000000000000..071a74fe69b420954ff6ce7154b227c0e0d7e4dd --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/flaubert/configuration_flaubert.py @@ -0,0 +1,235 @@ +# coding=utf-8 +# Copyright 2019-present CNRS, Facebook Inc. and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Flaubert configuration""" + +from collections import OrderedDict +from collections.abc import Mapping + +from ...configuration_utils import PretrainedConfig +from ...onnx import OnnxConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class FlaubertConfig(PretrainedConfig): + """ + This is the configuration class to store the configuration of a [`FlaubertModel`] or a [`TFFlaubertModel`]. It is + used to instantiate a FlauBERT model according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar configuration to that of the FlauBERT + [flaubert/flaubert_base_uncased](https://huggingface.co/flaubert/flaubert_base_uncased) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + pre_norm (`bool`, *optional*, defaults to `False`): + Whether to apply the layer normalization before or after the feed forward layer following the attention in + each layer (Vaswani et al., Tensor2Tensor for Neural Machine Translation. 2018) + layerdrop (`float`, *optional*, defaults to 0.0): + Probability to drop layers during training (Fan et al., Reducing Transformer Depth on Demand with + Structured Dropout. ICLR 2020) + vocab_size (`int`, *optional*, defaults to 30145): + Vocabulary size of the FlauBERT model. Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling [`FlaubertModel`] or [`TFFlaubertModel`]. + emb_dim (`int`, *optional*, defaults to 2048): + Dimensionality of the encoder layers and the pooler layer. + n_layer (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + n_head (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. 
+ dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for the attention mechanism. + gelu_activation (`bool`, *optional*, defaults to `True`): + Whether or not to use a *gelu* activation instead of *relu*. + sinusoidal_embeddings (`bool`, *optional*, defaults to `False`): + Whether or not to use sinusoidal positional embeddings instead of absolute positional embeddings. + causal (`bool`, *optional*, defaults to `False`): + Whether or not the model should behave in a causal manner. Causal models use a triangular attention mask in + order to only attend to the left-side context instead of a bidirectional context. + asm (`bool`, *optional*, defaults to `False`): + Whether or not to use an adaptive log softmax projection layer instead of a linear layer for the prediction + layer. + n_langs (`int`, *optional*, defaults to 1): + The number of languages the model handles. Set to 1 for monolingual models. + use_lang_emb (`bool`, *optional*, defaults to `True`): + Whether to use language embeddings. Some models use additional language embeddings, see [the multilingual + models page](http://huggingface.co/transformers/multilingual.html#xlm-language-embeddings) for information + on how to use them. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + embed_init_std (`float`, *optional*, defaults to 2048^-0.5): + The standard deviation of the truncated_normal_initializer for initializing the embedding matrices. + init_std (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices except the + embedding matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + bos_index (`int`, *optional*, defaults to 0): + The index of the beginning of sentence token in the vocabulary. + eos_index (`int`, *optional*, defaults to 1): + The index of the end of sentence token in the vocabulary. + pad_index (`int`, *optional*, defaults to 2): + The index of the padding token in the vocabulary. + unk_index (`int`, *optional*, defaults to 3): + The index of the unknown token in the vocabulary. + mask_index (`int`, *optional*, defaults to 5): + The index of the masking token in the vocabulary. + is_encoder (`bool`, *optional*, defaults to `True`): + Whether or not the initialized model should be a transformer encoder or decoder as seen in Vaswani et al. + summary_type (`string`, *optional*, defaults to "first"): + Argument used when doing sequence summary. Used in the sequence classification and multiple choice models. + + Has to be one of the following options: + + - `"last"`: Take the last token hidden state (like XLNet). + - `"first"`: Take the first token hidden state (like BERT). + - `"mean"`: Take the mean of all tokens hidden states. + - `"cls_index"`: Supply a Tensor of classification token position (like GPT/GPT-2). + - `"attn"`: Not implemented now, use multi-head attention. + summary_use_proj (`bool`, *optional*, defaults to `True`): + Argument used when doing sequence summary. Used in the sequence classification and multiple choice models. + + Whether or not to add a projection after the vector extraction.
+ summary_activation (`str`, *optional*): + Argument used when doing sequence summary. Used in the sequence classification and multiple choice models. + + Pass `"tanh"` for a tanh activation to the output, any other value will result in no activation. + summary_proj_to_labels (`bool`, *optional*, defaults to `True`): + Used in the sequence classification and multiple choice models. + + Whether the projection outputs should have `config.num_labels` or `config.hidden_size` classes. + summary_first_dropout (`float`, *optional*, defaults to 0.1): + Used in the sequence classification and multiple choice models. + + The dropout ratio to be used after the projection and activation. + start_n_top (`int`, *optional*, defaults to 5): + Used in the SQuAD evaluation script. + end_n_top (`int`, *optional*, defaults to 5): + Used in the SQuAD evaluation script. + mask_token_id (`int`, *optional*, defaults to 0): + Model agnostic parameter to identify masked tokens when generating text in an MLM context. + lang_id (`int`, *optional*, defaults to 1): + The ID of the language used by the model. This parameter is used when generating text in a given language. + """ + + model_type = "flaubert" + attribute_map = { + "hidden_size": "emb_dim", + "num_attention_heads": "n_heads", + "num_hidden_layers": "n_layers", + "n_words": "vocab_size", # For backward compatibility + } + + def __init__( + self, + pre_norm=False, + layerdrop=0.0, + vocab_size=30145, + emb_dim=2048, + n_layers=12, + n_heads=16, + dropout=0.1, + attention_dropout=0.1, + gelu_activation=True, + sinusoidal_embeddings=False, + causal=False, + asm=False, + n_langs=1, + use_lang_emb=True, + max_position_embeddings=512, + embed_init_std=2048**-0.5, + layer_norm_eps=1e-12, + init_std=0.02, + bos_index=0, + eos_index=1, + pad_index=2, + unk_index=3, + mask_index=5, + is_encoder=True, + summary_type="first", + summary_use_proj=True, + summary_activation=None, + summary_proj_to_labels=True, + summary_first_dropout=0.1, + start_n_top=5, + end_n_top=5, + mask_token_id=0, + lang_id=0, + pad_token_id=2, + bos_token_id=0, + **kwargs, + ): + """Constructs FlaubertConfig.""" + self.pre_norm = pre_norm + self.layerdrop = layerdrop + self.vocab_size = vocab_size + self.emb_dim = emb_dim + self.n_layers = n_layers + self.n_heads = n_heads + self.dropout = dropout + self.attention_dropout = attention_dropout + self.gelu_activation = gelu_activation + self.sinusoidal_embeddings = sinusoidal_embeddings + self.causal = causal + self.asm = asm + self.n_langs = n_langs + self.use_lang_emb = use_lang_emb + self.layer_norm_eps = layer_norm_eps + self.bos_index = bos_index + self.eos_index = eos_index + self.pad_index = pad_index + self.unk_index = unk_index + self.mask_index = mask_index + self.is_encoder = is_encoder + self.max_position_embeddings = max_position_embeddings + self.embed_init_std = embed_init_std + self.init_std = init_std + self.summary_type = summary_type + self.summary_use_proj = summary_use_proj + self.summary_activation = summary_activation + self.summary_proj_to_labels = summary_proj_to_labels + self.summary_first_dropout = summary_first_dropout + self.start_n_top = start_n_top + self.end_n_top = end_n_top + self.mask_token_id = mask_token_id + self.lang_id = lang_id + + if "n_words" in kwargs: + self.n_words = kwargs["n_words"] + + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, **kwargs) + + +class FlaubertOnnxConfig(OnnxConfig): + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + if self.task == 
"multiple-choice": + dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"} + else: + dynamic_axis = {0: "batch", 1: "sequence"} + return OrderedDict( + [ + ("input_ids", dynamic_axis), + ("attention_mask", dynamic_axis), + ] + ) + + +__all__ = ["FlaubertConfig", "FlaubertOnnxConfig"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/flaubert/modeling_flaubert.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/flaubert/modeling_flaubert.py new file mode 100644 index 0000000000000000000000000000000000000000..1dadc6f5377b4007c78f44d81d7a39acf2dedbdb --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/flaubert/modeling_flaubert.py @@ -0,0 +1,1700 @@ +# coding=utf-8 +# Copyright 2019-present CNRS, Facebook Inc. and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch Flaubert model, based on XLM.""" + +import math +from dataclasses import dataclass +from typing import Callable, Optional, Union + +import numpy as np +import torch +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ...activations import gelu, get_activation +from ...cache_utils import DynamicCache, EncoderDecoderCache +from ...generation import GenerationMixin +from ...modeling_outputs import ( + BaseModelOutput, + MaskedLMOutput, + MultipleChoiceModelOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import ModelOutput, auto_docstring, logging +from .configuration_flaubert import FlaubertConfig + + +logger = logging.get_logger(__name__) + + +# Copied from transformers.models.xlm.modeling_xlm.create_sinusoidal_embeddings +def create_sinusoidal_embeddings(n_pos, dim, out): + position_enc = np.array([[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)]) + out.requires_grad = False + out[:, 0::2] = torch.FloatTensor(np.sin(position_enc[:, 0::2])) + out[:, 1::2] = torch.FloatTensor(np.cos(position_enc[:, 1::2])) + out.detach_() + + +# Copied from transformers.models.xlm.modeling_xlm.get_masks +def get_masks(slen, lengths, causal, padding_mask=None): + """ + Generate hidden states mask, and optionally an attention mask. 
+ """ + alen = torch.arange(slen, dtype=torch.long, device=lengths.device) + if padding_mask is not None: + mask = padding_mask + else: + assert lengths.max().item() <= slen + mask = alen < lengths[:, None] + + # attention mask is the same as mask, or triangular inferior attention (causal) + bs = lengths.size(0) + if causal: + attn_mask = alen[None, None, :].repeat(bs, slen, 1) <= alen[None, :, None] + else: + attn_mask = mask + + # sanity check + assert mask.size() == (bs, slen) + assert causal is False or attn_mask.size() == (bs, slen, slen) + + return mask, attn_mask + + +# Copied from transformers.models.xlm.modeling_xlm.MultiHeadAttention +class MultiHeadAttention(nn.Module): + def __init__(self, n_heads, dim, config, layer_idx: int = 0): + super().__init__() + self.layer_id = layer_idx + self.dim = dim + self.n_heads = n_heads + self.head_dim = dim // n_heads + self.dropout = config.attention_dropout + assert self.dim % self.n_heads == 0 + + self.q_lin = nn.Linear(dim, dim) + self.k_lin = nn.Linear(dim, dim) + self.v_lin = nn.Linear(dim, dim) + self.out_lin = nn.Linear(dim, dim) + self.pruned_heads = set() + + def prune_heads(self, heads): + attention_head_size = self.dim // self.n_heads + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices(heads, self.n_heads, attention_head_size, self.pruned_heads) + # Prune linear layers + self.q_lin = prune_linear_layer(self.q_lin, index) + self.k_lin = prune_linear_layer(self.k_lin, index) + self.v_lin = prune_linear_layer(self.v_lin, index) + self.out_lin = prune_linear_layer(self.out_lin, index, dim=1) + # Update hyper params + self.n_heads = self.n_heads - len(heads) + self.dim = attention_head_size * self.n_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + input, + mask, + kv=None, + cache=None, + head_mask=None, + output_attentions=False, + cache_position=None, + ): + """ + Self-attention (if kv is None) or attention over source sentence (provided by kv). 
+ """ + # Input is (bs, qlen, dim) + # Mask is (bs, klen) (non-causal) or (bs, klen, klen) + bs, qlen, dim = input.size() + is_cross_attention = kv is not None + mask_reshape = (bs, 1, qlen, -1) if mask.dim() == 3 else (bs, 1, 1, -1) + + q = self.q_lin(input).view(bs, -1, self.n_heads, self.head_dim).transpose(1, 2) + if cache is not None: + if isinstance(cache, EncoderDecoderCache): + is_updated = cache.is_updated.get(self.layer_id) + if is_cross_attention: + # after the first generated id, we can subsequently re-use all key/value_states from cache + curr_past_key_value = cache.cross_attention_cache + else: + curr_past_key_value = cache.self_attention_cache + else: + curr_past_key_value = cache + + current_states = kv if is_cross_attention else input + if is_cross_attention and cache is not None and is_updated: + # reuse k,v, cross_attentions + k = curr_past_key_value.key_cache[self.layer_id] + v = curr_past_key_value.value_cache[self.layer_id] + else: + k = self.k_lin(current_states) + v = self.v_lin(current_states) + k = k.view(bs, -1, self.n_heads, self.head_dim).transpose(1, 2) + v = v.view(bs, -1, self.n_heads, self.head_dim).transpose(1, 2) + + if cache is not None: + # save all key/value_states to cache to be re-used for fast auto-regressive generation + cache_position = cache_position if not is_cross_attention else None + k, v = curr_past_key_value.update(k, v, self.layer_id, {"cache_position": cache_position}) + # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls + if is_cross_attention: + cache.is_updated[self.layer_id] = True + + q = q / math.sqrt(self.head_dim) # (bs, n_heads, qlen, head_dim) + scores = torch.matmul(q, k.transpose(2, 3)) # (bs, n_heads, qlen, klen) + mask = (mask == 0).view(mask_reshape).expand_as(scores) # (bs, n_heads, qlen, klen) + scores.masked_fill_(mask, torch.finfo(scores.dtype).min) # (bs, n_heads, qlen, klen) + + weights = nn.functional.softmax(scores.float(), dim=-1).type_as(scores) # (bs, n_heads, qlen, klen) + weights = nn.functional.dropout(weights, p=self.dropout, training=self.training) # (bs, n_heads, qlen, klen) + + # Mask heads if we want to + if head_mask is not None: + weights = weights * head_mask + + context = torch.matmul(weights, v) # (bs, n_heads, qlen, head_dim) + context = context.transpose(1, 2).contiguous().view(bs, -1, self.n_heads * self.head_dim) + + outputs = (self.out_lin(context),) + if output_attentions: + outputs = outputs + (weights,) + return outputs + + +# Copied from transformers.models.xlm.modeling_xlm.TransformerFFN +class TransformerFFN(nn.Module): + def __init__(self, in_dim, dim_hidden, out_dim, config): + super().__init__() + self.dropout = config.dropout + self.lin1 = nn.Linear(in_dim, dim_hidden) + self.lin2 = nn.Linear(dim_hidden, out_dim) + self.act = gelu if config.gelu_activation else nn.functional.relu + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + + def forward(self, input): + return apply_chunking_to_forward(self.ff_chunk, self.chunk_size_feed_forward, self.seq_len_dim, input) + + def ff_chunk(self, input): + x = self.lin1(input) + x = self.act(x) + x = self.lin2(x) + x = nn.functional.dropout(x, p=self.dropout, training=self.training) + return x + + +@auto_docstring( + custom_intro=""" + The bare Flaubert Model transformer outputting raw hidden-states without any specific head on top. 
+ """ +) +# Copied from transformers.models.xlm.modeling_xlm.XLMPredLayer with XLM->Flaubert +class FlaubertPredLayer(nn.Module): + """ + Prediction layer (cross_entropy or adaptive_softmax). + """ + + def __init__(self, config): + super().__init__() + self.asm = config.asm + self.n_words = config.n_words + self.pad_index = config.pad_index + dim = config.emb_dim + + if config.asm is False: + self.proj = nn.Linear(dim, config.n_words, bias=True) + else: + self.proj = nn.AdaptiveLogSoftmaxWithLoss( + in_features=dim, + n_classes=config.n_words, + cutoffs=config.asm_cutoffs, + div_value=config.asm_div_value, + head_bias=True, # default is False + ) + + def forward(self, x, y=None): + """Compute the loss, and optionally the scores.""" + outputs = () + if self.asm is False: + scores = self.proj(x) + outputs = (scores,) + outputs + if y is not None: + loss = nn.functional.cross_entropy(scores.view(-1, self.n_words), y.view(-1), reduction="mean") + outputs = (loss,) + outputs + else: + scores = self.proj.log_prob(x) + outputs = (scores,) + outputs + if y is not None: + _, loss = self.proj(x, y) + outputs = (loss,) + outputs + + return outputs + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for outputs of question answering models using a [`~modeling_utils.FlaubertSQuADHead`]. + """ +) +# Copied from transformers.models.xlm.modeling_xlm.XLMSquadHeadOutput with XLM->Flaubert +class FlaubertSquadHeadOutput(ModelOutput): + r""" + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned if both `start_positions` and `end_positions` are provided): + Classification loss as the sum of start token, end token (and is_impossible if provided) classification + losses. + start_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided): + Log probabilities for the top config.start_n_top start token possibilities (beam-search). + start_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided): + Indices for the top config.start_n_top start token possibilities (beam-search). + end_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided): + Log probabilities for the top `config.start_n_top * config.end_n_top` end token possibilities + (beam-search). + end_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided): + Indices for the top `config.start_n_top * config.end_n_top` end token possibilities (beam-search). + cls_logits (`torch.FloatTensor` of shape `(batch_size,)`, *optional*, returned if `start_positions` or `end_positions` is not provided): + Log probabilities for the `is_impossible` label of the answers. + """ + + loss: Optional[torch.FloatTensor] = None + start_top_log_probs: Optional[torch.FloatTensor] = None + start_top_index: Optional[torch.LongTensor] = None + end_top_log_probs: Optional[torch.FloatTensor] = None + end_top_index: Optional[torch.LongTensor] = None + cls_logits: Optional[torch.FloatTensor] = None + + +# Copied from transformers.models.xlm.modeling_xlm.XLMPoolerStartLogits with XLM->Flaubert +class FlaubertPoolerStartLogits(nn.Module): + """ + Compute SQuAD start logits from sequence hidden states. 
+ + Args: + config ([`FlaubertConfig`]): + The config used by the model, will be used to grab the `hidden_size` of the model. + """ + + def __init__(self, config: FlaubertConfig): + super().__init__() + self.dense = nn.Linear(config.hidden_size, 1) + + def forward( + self, hidden_states: torch.FloatTensor, p_mask: Optional[torch.FloatTensor] = None + ) -> torch.FloatTensor: + """ + Args: + hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`): + The final hidden states of the model. + p_mask (`torch.FloatTensor` of shape `(batch_size, seq_len)`, *optional*): + Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS). 1.0 means token + should be masked. + + Returns: + `torch.FloatTensor`: The start logits for SQuAD. + """ + x = self.dense(hidden_states).squeeze(-1) + + if p_mask is not None: + if p_mask.dtype == torch.float16: + x = x * (1 - p_mask) - 65500 * p_mask + else: + x = x * (1 - p_mask) - 1e30 * p_mask + + return x + + +# Copied from transformers.models.xlm.modeling_xlm.XLMPoolerEndLogits with XLM->Flaubert +class FlaubertPoolerEndLogits(nn.Module): + """ + Compute SQuAD end logits from sequence hidden states. + + Args: + config ([`FlaubertConfig`]): + The config used by the model, will be used to grab the `hidden_size` of the model and the `layer_norm_eps` + to use. + """ + + def __init__(self, config: FlaubertConfig): + super().__init__() + self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size) + self.activation = nn.Tanh() + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dense_1 = nn.Linear(config.hidden_size, 1) + + def forward( + self, + hidden_states: torch.FloatTensor, + start_states: Optional[torch.FloatTensor] = None, + start_positions: Optional[torch.LongTensor] = None, + p_mask: Optional[torch.FloatTensor] = None, + ) -> torch.FloatTensor: + """ + Args: + hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`): + The final hidden states of the model. + start_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`, *optional*): + The hidden states of the first tokens for the labeled span. + start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + The position of the first token for the labeled span. + p_mask (`torch.FloatTensor` of shape `(batch_size, seq_len)`, *optional*): + Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS). 1.0 means token + should be masked. + + + + One of `start_states` or `start_positions` should be not `None`. If both are set, `start_positions` overrides + `start_states`. + + + + Returns: + `torch.FloatTensor`: The end logits for SQuAD. 
+ """ + assert start_states is not None or start_positions is not None, ( + "One of start_states, start_positions should be not None" + ) + if start_positions is not None: + slen, hsz = hidden_states.shape[-2:] + start_positions = start_positions[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz) + start_states = hidden_states.gather(-2, start_positions) # shape (bsz, 1, hsz) + start_states = start_states.expand(-1, slen, -1) # shape (bsz, slen, hsz) + + x = self.dense_0(torch.cat([hidden_states, start_states], dim=-1)) + x = self.activation(x) + x = self.LayerNorm(x) + x = self.dense_1(x).squeeze(-1) + + if p_mask is not None: + if p_mask.dtype == torch.float16: + x = x * (1 - p_mask) - 65500 * p_mask + else: + x = x * (1 - p_mask) - 1e30 * p_mask + + return x + + +# Copied from transformers.models.xlm.modeling_xlm.XLMPoolerAnswerClass with XLM->Flaubert +class FlaubertPoolerAnswerClass(nn.Module): + """ + Compute SQuAD 2.0 answer class from classification and start tokens hidden states. + + Args: + config ([`FlaubertConfig`]): + The config used by the model, will be used to grab the `hidden_size` of the model. + """ + + def __init__(self, config: FlaubertConfig): + super().__init__() + self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size) + self.activation = nn.Tanh() + self.dense_1 = nn.Linear(config.hidden_size, 1, bias=False) + + def forward( + self, + hidden_states: torch.FloatTensor, + start_states: Optional[torch.FloatTensor] = None, + start_positions: Optional[torch.LongTensor] = None, + cls_index: Optional[torch.LongTensor] = None, + ) -> torch.FloatTensor: + """ + Args: + hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`): + The final hidden states of the model. + start_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`, *optional*): + The hidden states of the first tokens for the labeled span. + start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + The position of the first token for the labeled span. + cls_index (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Position of the CLS token for each sentence in the batch. If `None`, takes the last token. + + + + One of `start_states` or `start_positions` should be not `None`. If both are set, `start_positions` overrides + `start_states`. + + + + Returns: + `torch.FloatTensor`: The SQuAD 2.0 answer class. + """ + # No dependency on end_feature so that we can obtain one single `cls_logits` for each sample. + hsz = hidden_states.shape[-1] + assert start_states is not None or start_positions is not None, ( + "One of start_states, start_positions should be not None" + ) + if start_positions is not None: + start_positions = start_positions[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz) + start_states = hidden_states.gather(-2, start_positions).squeeze(-2) # shape (bsz, hsz) + + if cls_index is not None: + cls_index = cls_index[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz) + cls_token_state = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, hsz) + else: + cls_token_state = hidden_states[:, -1, :] # shape (bsz, hsz) + + x = self.dense_0(torch.cat([start_states, cls_token_state], dim=-1)) + x = self.activation(x) + x = self.dense_1(x).squeeze(-1) + + return x + + +# Copied from transformers.models.xlm.modeling_xlm.XLMSQuADHead with XLM->Flaubert +class FlaubertSQuADHead(nn.Module): + r""" + A SQuAD head inspired by XLNet. 
+ + Args: + config ([`FlaubertConfig`]): + The config used by the model, will be used to grab the `hidden_size` of the model and the `layer_norm_eps` + to use. + """ + + def __init__(self, config: FlaubertConfig): + super().__init__() + self.start_n_top = config.start_n_top + self.end_n_top = config.end_n_top + + self.start_logits = FlaubertPoolerStartLogits(config) + self.end_logits = FlaubertPoolerEndLogits(config) + self.answer_class = FlaubertPoolerAnswerClass(config) + + @auto_docstring + def forward( + self, + hidden_states: torch.FloatTensor, + start_positions: Optional[torch.LongTensor] = None, + end_positions: Optional[torch.LongTensor] = None, + cls_index: Optional[torch.LongTensor] = None, + is_impossible: Optional[torch.LongTensor] = None, + p_mask: Optional[torch.FloatTensor] = None, + return_dict: bool = False, + ) -> Union[FlaubertSquadHeadOutput, tuple[torch.FloatTensor]]: + r""" + hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`): + Final hidden states of the model on the sequence tokens. + start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Positions of the first token for the labeled span. + end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Positions of the last token for the labeled span. + cls_index (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Position of the CLS token for each sentence in the batch. If `None`, takes the last token. + is_impossible (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Whether the question has a possible answer in the paragraph or not. + p_mask (`torch.FloatTensor` of shape `(batch_size, seq_len)`, *optional*): + Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS). 1.0 means token + should be masked. 
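+
+        Example (an illustrative sketch; the tiny `emb_dim` value and the toy tensors are hypothetical and
+        only show the two modes of the head: a single scalar loss during training, beam-search tensors at
+        inference):
+
+        ```python
+        >>> import torch
+        >>> from transformers import FlaubertConfig
+        >>> from transformers.models.flaubert.modeling_flaubert import FlaubertSQuADHead
+
+        >>> config = FlaubertConfig(emb_dim=8)  # demonstration-sized config
+        >>> head = FlaubertSQuADHead(config)
+        >>> hidden_states = torch.randn(1, 6, config.hidden_size)
+
+        >>> # training: ground-truth start/end positions are provided, so a single loss is returned
+        >>> out = head(hidden_states, start_positions=torch.tensor([1]), end_positions=torch.tensor([3]), return_dict=True)
+        >>> out.loss.shape
+        torch.Size([])
+
+        >>> # inference: no positions, so the top start/end candidates for beam search are returned instead
+        >>> out = head(hidden_states, return_dict=True)
+        >>> out.start_top_log_probs.shape == (1, config.start_n_top)
+        True
+        ```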
+ """ + start_logits = self.start_logits(hidden_states, p_mask=p_mask) + + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, let's remove the dimension added by batch splitting + for x in (start_positions, end_positions, cls_index, is_impossible): + if x is not None and x.dim() > 1: + x.squeeze_(-1) + + # during training, compute the end logits based on the ground truth of the start position + end_logits = self.end_logits(hidden_states, start_positions=start_positions, p_mask=p_mask) + + loss_fct = CrossEntropyLoss() + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if cls_index is not None and is_impossible is not None: + # Predict answerability from the representation of CLS and START + cls_logits = self.answer_class(hidden_states, start_positions=start_positions, cls_index=cls_index) + loss_fct_cls = nn.BCEWithLogitsLoss() + cls_loss = loss_fct_cls(cls_logits, is_impossible) + + # note(zhiliny): by default multiply the loss by 0.5 so that the scale is comparable to start_loss and end_loss + total_loss += cls_loss * 0.5 + + return FlaubertSquadHeadOutput(loss=total_loss) if return_dict else (total_loss,) + + else: + # during inference, compute the end logits based on beam search + bsz, slen, hsz = hidden_states.size() + start_log_probs = nn.functional.softmax(start_logits, dim=-1) # shape (bsz, slen) + + start_top_log_probs, start_top_index = torch.topk( + start_log_probs, self.start_n_top, dim=-1 + ) # shape (bsz, start_n_top) + start_top_index_exp = start_top_index.unsqueeze(-1).expand(-1, -1, hsz) # shape (bsz, start_n_top, hsz) + start_states = torch.gather(hidden_states, -2, start_top_index_exp) # shape (bsz, start_n_top, hsz) + start_states = start_states.unsqueeze(1).expand(-1, slen, -1, -1) # shape (bsz, slen, start_n_top, hsz) + + hidden_states_expanded = hidden_states.unsqueeze(2).expand_as( + start_states + ) # shape (bsz, slen, start_n_top, hsz) + p_mask = p_mask.unsqueeze(-1) if p_mask is not None else None + end_logits = self.end_logits(hidden_states_expanded, start_states=start_states, p_mask=p_mask) + end_log_probs = nn.functional.softmax(end_logits, dim=1) # shape (bsz, slen, start_n_top) + + end_top_log_probs, end_top_index = torch.topk( + end_log_probs, self.end_n_top, dim=1 + ) # shape (bsz, end_n_top, start_n_top) + end_top_log_probs = end_top_log_probs.view(-1, self.start_n_top * self.end_n_top) + end_top_index = end_top_index.view(-1, self.start_n_top * self.end_n_top) + + start_states = torch.einsum("blh,bl->bh", hidden_states, start_log_probs) + cls_logits = self.answer_class(hidden_states, start_states=start_states, cls_index=cls_index) + + if not return_dict: + return (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits) + else: + return FlaubertSquadHeadOutput( + start_top_log_probs=start_top_log_probs, + start_top_index=start_top_index, + end_top_log_probs=end_top_log_probs, + end_top_index=end_top_index, + cls_logits=cls_logits, + ) + + +# Copied from transformers.models.xlm.modeling_xlm.XLMSequenceSummary with XLM->Flaubert +class FlaubertSequenceSummary(nn.Module): + r""" + Compute a single vector summary of a sequence hidden states. + + Args: + config ([`FlaubertConfig`]): + The config used by the model. 
Relevant arguments in the config class of the model are (refer to the actual + config class of your model for the default values it uses): + + - **summary_type** (`str`) -- The method to use to make this summary. Accepted values are: + + - `"last"` -- Take the last token hidden state (like XLNet) + - `"first"` -- Take the first token hidden state (like Bert) + - `"mean"` -- Take the mean of all tokens hidden states + - `"cls_index"` -- Supply a Tensor of classification token position (GPT/GPT-2) + - `"attn"` -- Not implemented now, use multi-head attention + + - **summary_use_proj** (`bool`) -- Add a projection after the vector extraction. + - **summary_proj_to_labels** (`bool`) -- If `True`, the projection outputs to `config.num_labels` classes + (otherwise to `config.hidden_size`). + - **summary_activation** (`Optional[str]`) -- Set to `"tanh"` to add a tanh activation to the output, + another string or `None` will add no activation. + - **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and activation. + - **summary_last_dropout** (`float`)-- Optional dropout probability after the projection and activation. + """ + + def __init__(self, config: FlaubertConfig): + super().__init__() + + self.summary_type = getattr(config, "summary_type", "last") + if self.summary_type == "attn": + # We should use a standard multi-head attention module with absolute positional embedding for that. + # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276 + # We can probably just use the multi-head attention module of PyTorch >=1.1.0 + raise NotImplementedError + + self.summary = nn.Identity() + if hasattr(config, "summary_use_proj") and config.summary_use_proj: + if hasattr(config, "summary_proj_to_labels") and config.summary_proj_to_labels and config.num_labels > 0: + num_classes = config.num_labels + else: + num_classes = config.hidden_size + self.summary = nn.Linear(config.hidden_size, num_classes) + + activation_string = getattr(config, "summary_activation", None) + self.activation: Callable = get_activation(activation_string) if activation_string else nn.Identity() + + self.first_dropout = nn.Identity() + if hasattr(config, "summary_first_dropout") and config.summary_first_dropout > 0: + self.first_dropout = nn.Dropout(config.summary_first_dropout) + + self.last_dropout = nn.Identity() + if hasattr(config, "summary_last_dropout") and config.summary_last_dropout > 0: + self.last_dropout = nn.Dropout(config.summary_last_dropout) + + def forward( + self, hidden_states: torch.FloatTensor, cls_index: Optional[torch.LongTensor] = None + ) -> torch.FloatTensor: + """ + Compute a single vector summary of a sequence hidden states. + + Args: + hidden_states (`torch.FloatTensor` of shape `[batch_size, seq_len, hidden_size]`): + The hidden states of the last layer. + cls_index (`torch.LongTensor` of shape `[batch_size]` or `[batch_size, ...]` where ... are optional leading dimensions of `hidden_states`, *optional*): + Used if `summary_type == "cls_index"` and takes the last token of the sequence as classification token. + + Returns: + `torch.FloatTensor`: The summary of the sequence hidden states. 
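+
+        Example (an illustrative sketch; the small `emb_dim` and the `"mean"` summary type are hypothetical
+        choices used only to show the pooling behaviour):
+
+        ```python
+        >>> import torch
+        >>> from transformers import FlaubertConfig
+        >>> from transformers.models.flaubert.modeling_flaubert import FlaubertSequenceSummary
+
+        >>> config = FlaubertConfig(emb_dim=8, summary_type="mean", summary_use_proj=False)
+        >>> summary = FlaubertSequenceSummary(config).eval()  # eval() disables the optional dropouts
+        >>> hidden_states = torch.randn(2, 5, config.hidden_size)
+        >>> summary(hidden_states).shape  # one summary vector per sequence
+        torch.Size([2, 8])
+        ```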
+ """ + if self.summary_type == "last": + output = hidden_states[:, -1] + elif self.summary_type == "first": + output = hidden_states[:, 0] + elif self.summary_type == "mean": + output = hidden_states.mean(dim=1) + elif self.summary_type == "cls_index": + if cls_index is None: + cls_index = torch.full_like( + hidden_states[..., :1, :], + hidden_states.shape[-2] - 1, + dtype=torch.long, + ) + else: + cls_index = cls_index.unsqueeze(-1).unsqueeze(-1) + cls_index = cls_index.expand((-1,) * (cls_index.dim() - 1) + (hidden_states.size(-1),)) + # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states + output = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, XX, hidden_size) + elif self.summary_type == "attn": + raise NotImplementedError + + output = self.first_dropout(output) + output = self.summary(output) + output = self.activation(output) + output = self.last_dropout(output) + + return output + + +@auto_docstring +# Copied from transformers.models.xlm.modeling_xlm.XLMPreTrainedModel with XLM->Flaubert +class FlaubertPreTrainedModel(PreTrainedModel): + config: FlaubertConfig + load_tf_weights = None + base_model_prefix = "transformer" + + def __init__(self, *inputs, **kwargs): + super().__init__(*inputs, **kwargs) + + @property + def dummy_inputs(self): + inputs_list = torch.tensor([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]) + attns_list = torch.tensor([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]]) + if self.config.use_lang_emb and self.config.n_langs > 1: + langs_list = torch.tensor([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]]) + else: + langs_list = None + return {"input_ids": inputs_list, "attention_mask": attns_list, "langs": langs_list} + + def _init_weights(self, module): + """Initialize the weights.""" + if isinstance(module, nn.Embedding): + if self.config is not None and self.config.embed_init_std is not None: + nn.init.normal_(module.weight, mean=0, std=self.config.embed_init_std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + if isinstance(module, nn.Linear): + if self.config is not None and self.config.init_std is not None: + nn.init.normal_(module.weight, mean=0, std=self.config.init_std) + if module.bias is not None: + nn.init.constant_(module.bias, 0.0) + if isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, FlaubertModel) and self.config.sinusoidal_embeddings: + create_sinusoidal_embeddings( + self.config.max_position_embeddings, self.config.emb_dim, out=module.position_embeddings.weight + ) + + +@auto_docstring +class FlaubertModel(FlaubertPreTrainedModel): + def __init__(self, config): # , dico, is_encoder, with_output): + super().__init__(config) + + # encoder / decoder, output layer + self.is_encoder = config.is_encoder + self.is_decoder = not config.is_encoder + if self.is_decoder: + raise NotImplementedError("Currently Flaubert can only be used as an encoder") + # self.with_output = with_output + self.causal = config.causal + + # dictionary / languages + self.n_langs = config.n_langs + self.use_lang_emb = config.use_lang_emb + self.n_words = config.n_words + self.eos_index = config.eos_index + self.pad_index = config.pad_index + # self.dico = dico + # self.id2lang = config.id2lang + # self.lang2id = config.lang2id + # assert len(self.dico) == self.n_words + # assert len(self.id2lang) == len(self.lang2id) == self.n_langs + + # model parameters + self.dim = config.emb_dim # 512 by default + 
self.hidden_dim = self.dim * 4 # 2048 by default + self.n_heads = config.n_heads # 8 by default + self.n_layers = config.n_layers + self.dropout = config.dropout + self.attention_dropout = config.attention_dropout + assert self.dim % self.n_heads == 0, "transformer dim must be a multiple of n_heads" + + # embeddings + self.position_embeddings = nn.Embedding(config.max_position_embeddings, self.dim) + if config.n_langs > 1 and config.use_lang_emb: + self.lang_embeddings = nn.Embedding(self.n_langs, self.dim) + self.embeddings = nn.Embedding(self.n_words, self.dim, padding_idx=self.pad_index) + self.layer_norm_emb = nn.LayerNorm(self.dim, eps=config.layer_norm_eps) + + # transformer layers + self.attentions = nn.ModuleList() + self.layer_norm1 = nn.ModuleList() + self.ffns = nn.ModuleList() + self.layer_norm2 = nn.ModuleList() + # if self.is_decoder: + # self.layer_norm15 = nn.ModuleList() + # self.encoder_attn = nn.ModuleList() + + for i in range(self.n_layers): + self.attentions.append(MultiHeadAttention(self.n_heads, self.dim, config=config, layer_idx=i)) + self.layer_norm1.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps)) + # if self.is_decoder: + # self.layer_norm15.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps)) + # self.encoder_attn.append(MultiHeadAttention(self.n_heads, self.dim, dropout=self.attention_dropout)) + self.ffns.append(TransformerFFN(self.dim, self.hidden_dim, self.dim, config=config)) + self.layer_norm2.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps)) + + if hasattr(config, "pruned_heads"): + pruned_heads = config.pruned_heads.copy().items() + config.pruned_heads = {} + for layer, heads in pruned_heads: + if self.attentions[int(layer)].n_heads == config.n_heads: + self.prune_heads({int(layer): list(map(int, heads))}) + + # Initialize weights and apply final processing + self.post_init() + + self.layerdrop = getattr(config, "layerdrop", 0.0) + self.pre_norm = getattr(config, "pre_norm", False) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) + + # Copied from transformers.models.xlm.modeling_xlm.XLMModel.get_input_embeddings + def get_input_embeddings(self): + return self.embeddings + + # Copied from transformers.models.xlm.modeling_xlm.XLMModel.set_input_embeddings + def set_input_embeddings(self, new_embeddings): + self.embeddings = new_embeddings + + # Copied from transformers.models.xlm.modeling_xlm.XLMModel._prune_heads + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.attentions[layer].prune_heads(heads) + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + langs: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + lengths: Optional[torch.LongTensor] = None, + cache: Optional[dict[str, torch.FloatTensor]] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.Tensor] = None, + ) -> Union[tuple, BaseModelOutput]: + r""" + langs (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are + languages ids which can be obtained from the language names by using two conversion mappings provided in + the configuration of the model (only provided for multilingual models). More precisely, the *language name + to language id* mapping is in `model.config.lang2id` (which is a dictionary string to int) and the + *language id to language name* mapping is in `model.config.id2lang` (dictionary int to string). + + See usage examples detailed in the [multilingual documentation](../multilingual). + lengths (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Length of each sentence that can be used to avoid performing attention on padding token indices. You can + also use `attention_mask` for the same result (see above), kept here for compatibility. Indices selected in + `[0, ..., input_ids.size(-1)]`: + cache (`dict[str, torch.FloatTensor]`, *optional*): + Dictionary strings to `torch.FloatTensor` that contains precomputed hidden-states (key and values in the + attention blocks) as computed by the model (see `cache` output below). Can be used to speed up sequential + decoding. The dictionary object will be modified in-place during the forward pass to add newly computed + hidden-states. 
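+
+        Example (a short usage sketch; it assumes the FlauBERT base checkpoint can be downloaded from the Hub):
+
+        ```python
+        >>> from transformers import AutoTokenizer, FlaubertModel
+        >>> import torch
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("flaubert/flaubert_base_cased")
+        >>> model = FlaubertModel.from_pretrained("flaubert/flaubert_base_cased")
+
+        >>> inputs = tokenizer("Bonjour, mon chien est mignon.", return_tensors="pt")
+        >>> outputs = model(**inputs)
+        >>> last_hidden_state = outputs.last_hidden_state  # shape (batch_size, sequence_length, emb_dim)
+        ```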
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # removed: src_enc=None, src_len=None + if input_ids is not None: + bs, slen = input_ids.size() + else: + bs, slen = inputs_embeds.size()[:-1] + + device = input_ids.device if input_ids is not None else inputs_embeds.device + + if cache is None: + cache = EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config)) + + if isinstance(cache, tuple): + cache = EncoderDecoderCache.from_legacy_cache(cache) + + if lengths is None: + if input_ids is not None: + lengths = (input_ids != self.pad_index).sum(dim=1).long() + else: + lengths = torch.tensor([slen] * bs, device=device) + # mask = input_ids != self.pad_index + + # check inputs + assert lengths.size(0) == bs + assert lengths.max().item() <= slen + # input_ids = input_ids.transpose(0, 1) # batch size as dimension 0 + # assert (src_enc is None) == (src_len is None) + # if src_enc is not None: + # assert self.is_decoder + # assert src_enc.size(0) == bs + + # generate masks + mask, attn_mask = get_masks(slen, lengths, self.causal, padding_mask=attention_mask) + # if self.is_decoder and src_enc is not None: + # src_mask = torch.arange(src_len.max(), dtype=torch.long, device=lengths.device) < src_len[:, None] + + # Setting the position-ids to the registered buffer in constructor, it helps + # when tracing the model without passing position-ids, solves + # issues similar to issue #5664 + if position_ids is None: + if hasattr(self, "position_ids"): + position_ids = self.position_ids[:, :slen] + position_ids = position_ids.expand((bs, slen)) + else: + position_ids = torch.arange(slen, dtype=torch.long, device=device) + position_ids = position_ids.unsqueeze(0).expand((bs, slen)) + else: + assert position_ids.size() == (bs, slen) # (slen, bs) + # position_ids = position_ids.transpose(0, 1) + + # langs + if langs is not None: + assert langs.size() == (bs, slen) # (slen, bs) + # langs = langs.transpose(0, 1) + + # Prepare head mask if needed + head_mask = self.get_head_mask(head_mask, self.config.n_layers) + + # do not recompute cached elements + if cache is not None and input_ids is not None: + _slen = slen - cache.get_seq_length() + input_ids = input_ids[:, -_slen:] + position_ids = position_ids[:, -_slen:] + if langs is not None: + langs = langs[:, -_slen:] + mask = mask[:, -_slen:] + attn_mask = attn_mask[:, -_slen:] + + # embeddings + if inputs_embeds is None: + inputs_embeds = self.embeddings(input_ids) + + tensor = inputs_embeds + self.position_embeddings(position_ids).expand_as(inputs_embeds) + if langs is not None and self.use_lang_emb and self.config.n_langs > 1: + tensor = tensor + self.lang_embeddings(langs) + if token_type_ids is not None: + tensor = tensor + self.embeddings(token_type_ids) + tensor = self.layer_norm_emb(tensor) + tensor = nn.functional.dropout(tensor, p=self.dropout, training=self.training) + tensor *= mask.unsqueeze(-1).to(tensor.dtype) + + # transformer layers + hidden_states = () if output_hidden_states else None + attentions = () if output_attentions else None + for i in range(self.n_layers): + # LayerDrop + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: + continue + + if output_hidden_states: + hidden_states = 
hidden_states + (tensor,) + + # self attention + if not self.pre_norm: + attn_outputs = self.attentions[i]( + tensor, + attn_mask, + cache=cache, + head_mask=head_mask[i], + output_attentions=output_attentions, + cache_position=cache_position, + ) + attn = attn_outputs[0] + if output_attentions: + attentions = attentions + (attn_outputs[1],) + attn = nn.functional.dropout(attn, p=self.dropout, training=self.training) + tensor = tensor + attn + tensor = self.layer_norm1[i](tensor) + else: + tensor_normalized = self.layer_norm1[i](tensor) + attn_outputs = self.attentions[i](tensor_normalized, attn_mask, cache=cache, head_mask=head_mask[i]) + attn = attn_outputs[0] + if output_attentions: + attentions = attentions + (attn_outputs[1],) + attn = nn.functional.dropout(attn, p=self.dropout, training=self.training) + tensor = tensor + attn + + # FFN + if not self.pre_norm: + tensor = tensor + self.ffns[i](tensor) + tensor = self.layer_norm2[i](tensor) + else: + tensor_normalized = self.layer_norm2[i](tensor) + tensor = tensor + self.ffns[i](tensor_normalized) + + tensor *= mask.unsqueeze(-1).to(tensor.dtype) + + # Add last hidden state + if output_hidden_states: + hidden_states = hidden_states + (tensor,) + + if not return_dict: + return tuple(v for v in [tensor, hidden_states, attentions] if v is not None) + + return BaseModelOutput(last_hidden_state=tensor, hidden_states=hidden_states, attentions=attentions) + + +@auto_docstring( + custom_intro=""" + The Flaubert Model transformer with a language modeling head on top (linear layer with weights tied to the input + embeddings). + """ +) +class FlaubertWithLMHeadModel(FlaubertPreTrainedModel, GenerationMixin): + _tied_weights_keys = ["pred_layer.proj.weight"] + + def __init__(self, config): + super().__init__(config) + self.transformer = FlaubertModel(config) + self.pred_layer = FlaubertPredLayer(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.pred_layer.proj + + def set_output_embeddings(self, new_embeddings): + self.pred_layer.proj = new_embeddings + + def prepare_inputs_for_generation(self, input_ids, **kwargs): + # Overwritten -- uses a language id + + mask_token_id = self.config.mask_token_id + lang_id = self.config.lang_id + + effective_batch_size = input_ids.shape[0] + mask_token = torch.full((effective_batch_size, 1), mask_token_id, dtype=torch.long, device=input_ids.device) + input_ids = torch.cat([input_ids, mask_token], dim=1) + if lang_id is not None: + langs = torch.full_like(input_ids, lang_id) + else: + langs = None + return {"input_ids": input_ids, "langs": langs} + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + langs: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + lengths: Optional[torch.Tensor] = None, + cache: Optional[dict[str, torch.Tensor]] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, MaskedLMOutput]: + r""" + langs (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + A parallel sequence of tokens to be used to indicate the language of each token in the input. 
Indices are + languages ids which can be obtained from the language names by using two conversion mappings provided in + the configuration of the model (only provided for multilingual models). More precisely, the *language name + to language id* mapping is in `model.config.lang2id` (which is a dictionary string to int) and the + *language id to language name* mapping is in `model.config.id2lang` (dictionary int to string). + + See usage examples detailed in the [multilingual documentation](../multilingual). + lengths (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Length of each sentence that can be used to avoid performing attention on padding token indices. You can + also use `attention_mask` for the same result (see above), kept here for compatibility. Indices selected in + `[0, ..., input_ids.size(-1)]`: + cache (`dict[str, torch.FloatTensor]`, *optional*): + Dictionary strings to `torch.FloatTensor` that contains precomputed hidden-states (key and values in the + attention blocks) as computed by the model (see `cache` output below). Can be used to speed up sequential + decoding. The dictionary object will be modified in-place during the forward pass to add newly computed + hidden-states. + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set + `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` + are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + langs=langs, + token_type_ids=token_type_ids, + position_ids=position_ids, + lengths=lengths, + cache=cache, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + output = transformer_outputs[0] + outputs = self.pred_layer(output, labels) # (loss, logits) or (logits,) depending on if labels are provided. + + if not return_dict: + return outputs + transformer_outputs[1:] + + return MaskedLMOutput( + loss=outputs[0] if labels is not None else None, + logits=outputs[0] if labels is None else outputs[1], + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + +@auto_docstring( + custom_intro=""" + Flaubert Model with a sequence classification/regression head on top (a linear layer on top of the pooled output) + e.g. for GLUE tasks. 
+ """ +) +# Copied from transformers.models.xlm.modeling_xlm.XLMForSequenceClassification with XLM_INPUTS->FLAUBERT_INPUTS,XLM->Flaubert +class FlaubertForSequenceClassification(FlaubertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + + self.transformer = FlaubertModel(config) + self.sequence_summary = FlaubertSequenceSummary(config) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + langs: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + lengths: Optional[torch.Tensor] = None, + cache: Optional[dict[str, torch.Tensor]] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, SequenceClassifierOutput]: + r""" + langs (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are + languages ids which can be obtained from the language names by using two conversion mappings provided in + the configuration of the model (only provided for multilingual models). More precisely, the *language name + to language id* mapping is in `model.config.lang2id` (which is a dictionary string to int) and the + *language id to language name* mapping is in `model.config.id2lang` (dictionary int to string). + + See usage examples detailed in the [multilingual documentation](../multilingual). + lengths (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Length of each sentence that can be used to avoid performing attention on padding token indices. You can + also use *attention_mask* for the same result (see above), kept here for compatibility. Indices selected in + `[0, ..., input_ids.size(-1)]`. + cache (`dict[str, torch.FloatTensor]`, *optional*): + Instance of `EncoderDecoderCache` that contains precomputed KV states. Can be used to speed up sequential + decoding. + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + langs=langs, + token_type_ids=token_type_ids, + position_ids=position_ids, + lengths=lengths, + cache=cache, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + output = transformer_outputs[0] + logits = self.sequence_summary(output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + +@auto_docstring +# Copied from transformers.models.xlm.modeling_xlm.XLMForTokenClassification with XLM_INPUTS->FLAUBERT_INPUTS,XLM->Flaubert +class FlaubertForTokenClassification(FlaubertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.transformer = FlaubertModel(config) + self.dropout = nn.Dropout(config.dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + langs: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + lengths: Optional[torch.Tensor] = None, + cache: Optional[dict[str, torch.Tensor]] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, TokenClassifierOutput]: + r""" + langs (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are + languages ids which can be obtained from the language names by using two conversion mappings provided in + the configuration of the model (only provided for multilingual models). More precisely, the *language name + to language id* mapping is in `model.config.lang2id` (which is a dictionary string to int) and the + *language id to language name* mapping is in `model.config.id2lang` (dictionary int to string). 
+ + See usage examples detailed in the [multilingual documentation](../multilingual). + lengths (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Length of each sentence that can be used to avoid performing attention on padding token indices. You can + also use *attention_mask* for the same result (see above), kept here for compatibility. Indices selected in + `[0, ..., input_ids.size(-1)]`. + cache (`dict[str, torch.FloatTensor]`, *optional*): + Instance of `EncoderDecoderCache` that contains precomputed KV states. Can be used to speed up sequential + decoding. + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + langs=langs, + token_type_ids=token_type_ids, + position_ids=position_ids, + lengths=lengths, + cache=cache, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@auto_docstring( + custom_intro=""" + Flaubert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layers on top of the hidden-states output to compute `span start logits` and `span end logits`). + """ +) +# Copied from transformers.models.xlm.modeling_xlm.XLMForQuestionAnsweringSimple with XLM_INPUTS->FLAUBERT_INPUTS,XLM->Flaubert +class FlaubertForQuestionAnsweringSimple(FlaubertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.transformer = FlaubertModel(config) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + langs: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + lengths: Optional[torch.Tensor] = None, + cache: Optional[dict[str, torch.Tensor]] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + start_positions: Optional[torch.Tensor] = None, + end_positions: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, QuestionAnsweringModelOutput]: + r""" + langs (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + A parallel sequence of tokens to be used to indicate the language of each token in the input. 
Indices are + languages ids which can be obtained from the language names by using two conversion mappings provided in + the configuration of the model (only provided for multilingual models). More precisely, the *language name + to language id* mapping is in `model.config.lang2id` (which is a dictionary string to int) and the + *language id to language name* mapping is in `model.config.id2lang` (dictionary int to string). + + See usage examples detailed in the [multilingual documentation](../multilingual). + lengths (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Length of each sentence that can be used to avoid performing attention on padding token indices. You can + also use *attention_mask* for the same result (see above), kept here for compatibility. Indices selected in + `[0, ..., input_ids.size(-1)]`. + cache (`dict[str, torch.FloatTensor]`, *optional*): + Instance of `EncoderDecoderCache` that contains precomputed KV states. Can be used to speed up sequential + decoding. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + langs=langs, + token_type_ids=token_type_ids, + position_ids=position_ids, + lengths=lengths, + cache=cache, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = transformer_outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + transformer_outputs[1:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for outputs of question answering models using a `SquadHead`. + """ +) +# Copied from transformer.models.xlm.modeling_xlm.XLMForQuestionAnsweringOutput with XLM->Flaubert +class FlaubertForQuestionAnsweringOutput(ModelOutput): + r""" + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned if both `start_positions` and `end_positions` are provided): + Classification loss as the sum of start token, end token (and is_impossible if provided) classification + losses. 
+ start_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided): + Log probabilities for the top config.start_n_top start token possibilities (beam-search). + start_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided): + Indices for the top config.start_n_top start token possibilities (beam-search). + end_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided): + Log probabilities for the top `config.start_n_top * config.end_n_top` end token possibilities + (beam-search). + end_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided): + Indices for the top `config.start_n_top * config.end_n_top` end token possibilities (beam-search). + cls_logits (`torch.FloatTensor` of shape `(batch_size,)`, *optional*, returned if `start_positions` or `end_positions` is not provided): + Log probabilities for the `is_impossible` label of the answers. + """ + + loss: Optional[torch.FloatTensor] = None + start_top_log_probs: Optional[torch.FloatTensor] = None + start_top_index: Optional[torch.LongTensor] = None + end_top_log_probs: Optional[torch.FloatTensor] = None + end_top_index: Optional[torch.LongTensor] = None + cls_logits: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + + +@auto_docstring +# Copied from transformers.models.xlm.modeling_xlm.XLMForQuestionAnswering with XLM_INPUTS->FLAUBERT_INPUTS,XLM->Flaubert +class FlaubertForQuestionAnswering(FlaubertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.transformer = FlaubertModel(config) + self.qa_outputs = FlaubertSQuADHead(config) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + langs: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + lengths: Optional[torch.Tensor] = None, + cache: Optional[dict[str, torch.Tensor]] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + start_positions: Optional[torch.Tensor] = None, + end_positions: Optional[torch.Tensor] = None, + is_impossible: Optional[torch.Tensor] = None, + cls_index: Optional[torch.Tensor] = None, + p_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, FlaubertForQuestionAnsweringOutput]: + r""" + langs (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are + languages ids which can be obtained from the language names by using two conversion mappings provided in + the configuration of the model (only provided for multilingual models). 
More precisely, the *language name + to language id* mapping is in `model.config.lang2id` (which is a dictionary string to int) and the + *language id to language name* mapping is in `model.config.id2lang` (dictionary int to string). + + See usage examples detailed in the [multilingual documentation](../multilingual). + lengths (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Length of each sentence that can be used to avoid performing attention on padding token indices. You can + also use *attention_mask* for the same result (see above), kept here for compatibility. Indices selected in + `[0, ..., input_ids.size(-1)]`. + cache (`dict[str, torch.FloatTensor]`, *optional*): + Instance of `EncoderDecoderCache` that contains precomputed KV states. Can be used to speed up sequential + decoding. + is_impossible (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels whether a question has an answer or no answer (SQuAD 2.0) + cls_index (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the classification token to use as input for computing plausibility of the + answer. + p_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Optional mask of tokens which can't be in answers (e.g. [CLS], [PAD], ...). 1.0 means token should be + masked. 0.0 mean token is not masked. + + Example: + + ```python + >>> from transformers import AutoTokenizer, FlaubertForQuestionAnswering + >>> import torch + + >>> tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-mlm-en-2048") + >>> model = FlaubertForQuestionAnswering.from_pretrained("FacebookAI/xlm-mlm-en-2048") + + >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze( + ... 0 + ... 
) # Batch size 1 + >>> start_positions = torch.tensor([1]) + >>> end_positions = torch.tensor([3]) + + >>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions) + >>> loss = outputs.loss + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + langs=langs, + token_type_ids=token_type_ids, + position_ids=position_ids, + lengths=lengths, + cache=cache, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + output = transformer_outputs[0] + + outputs = self.qa_outputs( + output, + start_positions=start_positions, + end_positions=end_positions, + cls_index=cls_index, + is_impossible=is_impossible, + p_mask=p_mask, + return_dict=return_dict, + ) + + if not return_dict: + return outputs + transformer_outputs[1:] + + return FlaubertForQuestionAnsweringOutput( + loss=outputs.loss, + start_top_log_probs=outputs.start_top_log_probs, + start_top_index=outputs.start_top_index, + end_top_log_probs=outputs.end_top_log_probs, + end_top_index=outputs.end_top_index, + cls_logits=outputs.cls_logits, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + +@auto_docstring +# Copied from transformers.models.xlm.modeling_xlm.XLMForMultipleChoice with XLM_INPUTS->FLAUBERT_INPUTS,XLM->Flaubert +class FlaubertForMultipleChoice(FlaubertPreTrainedModel): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.transformer = FlaubertModel(config) + self.sequence_summary = FlaubertSequenceSummary(config) + self.logits_proj = nn.Linear(config.num_labels, 1) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + langs: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + lengths: Optional[torch.Tensor] = None, + cache: Optional[dict[str, torch.Tensor]] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, MultipleChoiceModelOutput]: + r""" + input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + langs (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*): + A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are + languages ids which can be obtained from the language names by using two conversion mappings provided in + the configuration of the model (only provided for multilingual models). More precisely, the *language name + to language id* mapping is in `model.config.lang2id` (which is a dictionary string to int) and the + *language id to language name* mapping is in `model.config.id2lang` (dictionary int to string). 
+ + See usage examples detailed in the [multilingual documentation](../multilingual). + token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + lengths (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Length of each sentence that can be used to avoid performing attention on padding token indices. You can + also use *attention_mask* for the same result (see above), kept here for compatibility. Indices selected in + `[0, ..., input_ids.size(-1)]`. + cache (`dict[str, torch.FloatTensor]`, *optional*): + Instance of `EncoderDecoderCache` that contains precomputed KV states. Can be used to speed up sequential + decoding. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., + num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See + `input_ids` above) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] + + input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None + attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None + langs = langs.view(-1, langs.size(-1)) if langs is not None else None + inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) + if inputs_embeds is not None + else None + ) + + if lengths is not None: + logger.warning( + "The `lengths` parameter cannot be used with the Flaubert multiple choice models. Please use the " + "attention mask instead." 
+ ) + lengths = None + + transformer_outputs = self.transformer( + input_ids=input_ids, + attention_mask=attention_mask, + langs=langs, + token_type_ids=token_type_ids, + position_ids=position_ids, + lengths=lengths, + cache=cache, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + output = transformer_outputs[0] + logits = self.sequence_summary(output) + logits = self.logits_proj(logits) + reshaped_logits = logits.view(-1, num_choices) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + + if not return_dict: + output = (reshaped_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + +__all__ = [ + "FlaubertForMultipleChoice", + "FlaubertForQuestionAnswering", + "FlaubertForQuestionAnsweringSimple", + "FlaubertForSequenceClassification", + "FlaubertForTokenClassification", + "FlaubertModel", + "FlaubertWithLMHeadModel", + "FlaubertPreTrainedModel", +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/flaubert/modeling_tf_flaubert.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/flaubert/modeling_tf_flaubert.py new file mode 100644 index 0000000000000000000000000000000000000000..88b7ae9f0c9ddd96e761b3739a035c55a217dec8 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/flaubert/modeling_tf_flaubert.py @@ -0,0 +1,1343 @@ +# coding=utf-8 +# Copyright 2019-present, Facebook, Inc and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +TF 2.0 Flaubert model. 
+""" + +from __future__ import annotations + +import itertools +import random +import warnings +from dataclasses import dataclass + +import numpy as np +import tensorflow as tf + +from ...activations_tf import get_tf_activation +from ...modeling_tf_outputs import ( + TFBaseModelOutput, + TFMultipleChoiceModelOutput, + TFQuestionAnsweringModelOutput, + TFSequenceClassifierOutput, + TFTokenClassifierOutput, +) +from ...modeling_tf_utils import ( + TFModelInputType, + TFMultipleChoiceLoss, + TFPreTrainedModel, + TFQuestionAnsweringLoss, + TFSequenceClassificationLoss, + TFSequenceSummary, + TFSharedEmbeddings, + TFTokenClassificationLoss, + get_initializer, + keras, + keras_serializable, + unpack_inputs, +) +from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax +from ...utils import ( + MULTIPLE_CHOICE_DUMMY_INPUTS, + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, +) +from .configuration_flaubert import FlaubertConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "flaubert/flaubert_base_cased" +_CONFIG_FOR_DOC = "FlaubertConfig" + + +FLAUBERT_START_DOCSTRING = r""" + + This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and + behavior. + + + + TensorFlow models and layers in `transformers` accept two formats as input: + + - having all inputs as keyword arguments (like PyTorch models), or + - having all inputs as a list, tuple or dict in the first positional argument. + + The reason the second format is supported is that Keras methods prefer this format when passing inputs to models + and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just + pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second + format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with + the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first + positional argument: + + - a single Tensor with `input_ids` only and nothing else: `model(input_ids)` + - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: + `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])` + - a dictionary with one or several input Tensors associated to the input names given in the docstring: + `model({"input_ids": input_ids, "token_type_ids": token_type_ids})` + + Note that when creating models and layers with + [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry + about any of this, as you can just pass inputs like you would to any other Python function! + + + + Parameters: + config ([`FlaubertConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
+""" + +FLAUBERT_INPUTS_DOCSTRING = r""" + Args: + input_ids (`Numpy array` or `tf.Tensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and + [`PreTrainedTokenizer.encode`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`Numpy array` or `tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - `1` for tokens that are **not masked**, + - `0` for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + langs (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length)`, *optional*): + A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are + languages ids which can be obtained from the language names by using two conversion mappings provided in + the configuration of the model (only provided for multilingual models). More precisely, the *language name + to language id* mapping is in `model.config.lang2id` (which is a dictionary string to int) and the + *language id to language name* mapping is in `model.config.id2lang` (dictionary int to string). + + See usage examples detailed in the [multilingual documentation](../multilingual). + token_type_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length)`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - `0` corresponds to a *sentence A* token, + - `1` corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + lengths (`tf.Tensor` or `Numpy array` of shape `(batch_size,)`, *optional*): + Length of each sentence that can be used to avoid performing attention on padding token indices. You can + also use *attention_mask* for the same result (see above), kept here for compatibility Indices selected in + `[0, ..., input_ids.size(-1)]`: + cache (`dict[str, tf.Tensor]`, *optional*): + Dictionary string to `tf.FloatTensor` that contains precomputed hidden states (key and values in the + attention blocks) as computed by the model (see `cache` output below). Can be used to speed up sequential + decoding. + + The dictionary object will be modified in-place during the forward pass to add newly computed + hidden-states. + head_mask (`Numpy array` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - `1` indicates the head is **not masked**, + - `0` indicates the head is **masked**. + + inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. 
+ output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the + config will be used instead. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. This argument can be used only in eager mode, in graph mode the value in the config will be + used instead. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in + eager mode, in graph mode the value will always be set to True. + training (`bool`, *optional*, defaults to `False`): + Whether or not to use the model in training mode (some modules like dropout modules have different + behaviors between training and evaluation). +""" + + +def get_masks(slen, lengths, causal, padding_mask=None): + """ + Generate hidden states mask, and optionally an attention mask. + """ + bs = shape_list(lengths)[0] + if padding_mask is not None: + mask = padding_mask + else: + # assert lengths.max().item() <= slen + alen = tf.range(slen, dtype=lengths.dtype) + mask = alen < tf.expand_dims(lengths, axis=1) + + # attention mask is the same as mask, or triangular inferior attention (causal) + if causal: + attn_mask = tf.less_equal( + tf.tile(tf.reshape(alen, (1, 1, slen)), (bs, slen, 1)), tf.reshape(alen, (1, slen, 1)) + ) + else: + attn_mask = mask + + # sanity check + # assert shape_list(mask) == [bs, slen] + tf.debugging.assert_equal(shape_list(mask), [bs, slen]) + if causal: + tf.debugging.assert_equal(shape_list(attn_mask), [bs, slen, slen]) + + return mask, attn_mask + + +class TFFlaubertPreTrainedModel(TFPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = FlaubertConfig + base_model_prefix = "transformer" + + @property + def dummy_inputs(self): + # Sometimes Flaubert has language embeddings so don't forget to build them as well if needed + inputs_list = tf.constant([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]], dtype=tf.int32) + attns_list = tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]], dtype=tf.int32) + if self.config.use_lang_emb and self.config.n_langs > 1: + return { + "input_ids": inputs_list, + "attention_mask": attns_list, + "langs": tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]], dtype=tf.int32), + } + else: + return {"input_ids": inputs_list, "attention_mask": attns_list} + + +@add_start_docstrings( + "The bare Flaubert Model transformer outputting raw hidden-states without any specific head on top.", + FLAUBERT_START_DOCSTRING, +) +class TFFlaubertModel(TFFlaubertPreTrainedModel): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.transformer = TFFlaubertMainLayer(config, name="transformer") + + @unpack_inputs + @add_start_docstrings_to_model_forward(FLAUBERT_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFBaseModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: np.ndarray | tf.Tensor | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + langs: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + lengths: np.ndarray | tf.Tensor | None = None, + cache: dict[str, tf.Tensor] | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool | None = False, + ) -> tuple | TFBaseModelOutput: + outputs = self.transformer( + input_ids=input_ids, + attention_mask=attention_mask, + langs=langs, + token_type_ids=token_type_ids, + position_ids=position_ids, + lengths=lengths, + cache=cache, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "transformer", None) is not None: + with tf.name_scope(self.transformer.name): + self.transformer.build(None) + + +# Copied from transformers.models.xlm.modeling_tf_xlm.TFXLMMultiHeadAttention with XLM->Flaubert +class TFFlaubertMultiHeadAttention(keras.layers.Layer): + NEW_ID = itertools.count() + + def __init__(self, n_heads, dim, config, **kwargs): + super().__init__(**kwargs) + self.layer_id = next(TFFlaubertMultiHeadAttention.NEW_ID) + self.dim = dim + self.n_heads = n_heads + self.output_attentions = config.output_attentions + assert self.dim % self.n_heads == 0 + + self.q_lin = keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="q_lin") + self.k_lin = keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="k_lin") + self.v_lin = keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="v_lin") + self.out_lin = keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="out_lin") + self.dropout = keras.layers.Dropout(config.attention_dropout) + 
self.pruned_heads = set() + self.dim = dim + + def prune_heads(self, heads): + raise NotImplementedError + + def call(self, input, mask, kv, cache, head_mask, output_attentions, training=False): + """ + Self-attention (if kv is None) or attention over source sentence (provided by kv). + """ + # Input is (bs, qlen, dim) + # Mask is (bs, klen) (non-causal) or (bs, klen, klen) + bs, qlen, dim = shape_list(input) + + if kv is None: + klen = qlen if cache is None else cache["slen"] + qlen + else: + klen = shape_list(kv)[1] + + # assert dim == self.dim, f'Dimensions do not match: {dim} input vs {self.dim} configured' + dim_per_head = self.dim // self.n_heads + mask_reshape = (bs, 1, qlen, klen) if len(shape_list(mask)) == 3 else (bs, 1, 1, klen) + + def shape(x): + """projection""" + return tf.transpose(tf.reshape(x, (bs, -1, self.n_heads, dim_per_head)), perm=(0, 2, 1, 3)) + + def unshape(x): + """compute context""" + return tf.reshape(tf.transpose(x, perm=(0, 2, 1, 3)), (bs, -1, self.n_heads * dim_per_head)) + + q = shape(self.q_lin(input)) # (bs, n_heads, qlen, dim_per_head) + + if kv is None: + k = shape(self.k_lin(input)) # (bs, n_heads, qlen, dim_per_head) + v = shape(self.v_lin(input)) # (bs, n_heads, qlen, dim_per_head) + elif cache is None or self.layer_id not in cache: + k = v = kv + k = shape(self.k_lin(k)) # (bs, n_heads, qlen, dim_per_head) + v = shape(self.v_lin(v)) # (bs, n_heads, qlen, dim_per_head) + + if cache is not None: + if self.layer_id in cache: + if kv is None: + k_, v_ = cache[self.layer_id] + k = tf.concat([k_, k], axis=2) # (bs, n_heads, klen, dim_per_head) + v = tf.concat([v_, v], axis=2) # (bs, n_heads, klen, dim_per_head) + else: + k, v = cache[self.layer_id] + + cache[self.layer_id] = (k, v) + + f_dim_per_head = tf.cast(dim_per_head, dtype=q.dtype) + q = tf.multiply(q, tf.math.rsqrt(f_dim_per_head)) # (bs, n_heads, qlen, dim_per_head) + k = tf.cast(k, dtype=q.dtype) + scores = tf.matmul(q, k, transpose_b=True) # (bs, n_heads, qlen, klen) + mask = tf.reshape(mask, mask_reshape) # (bs, n_heads, qlen, klen) + # scores.masked_fill_(mask, -float('inf')) # (bs, n_heads, qlen, klen) + mask = tf.cast(mask, dtype=scores.dtype) + scores = scores - 1e30 * (1.0 - mask) + weights = stable_softmax(scores, axis=-1) # (bs, n_heads, qlen, klen) + weights = self.dropout(weights, training=training) # (bs, n_heads, qlen, klen) + + # Mask heads if we want to + if head_mask is not None: + weights = weights * head_mask + + context = tf.matmul(weights, v) # (bs, n_heads, qlen, dim_per_head) + context = unshape(context) # (bs, qlen, dim) + outputs = (self.out_lin(context),) + + if output_attentions: + outputs = outputs + (weights,) + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "q_lin", None) is not None: + with tf.name_scope(self.q_lin.name): + self.q_lin.build([None, None, self.dim]) + if getattr(self, "k_lin", None) is not None: + with tf.name_scope(self.k_lin.name): + self.k_lin.build([None, None, self.dim]) + if getattr(self, "v_lin", None) is not None: + with tf.name_scope(self.v_lin.name): + self.v_lin.build([None, None, self.dim]) + if getattr(self, "out_lin", None) is not None: + with tf.name_scope(self.out_lin.name): + self.out_lin.build([None, None, self.dim]) + + +# Copied from transformers.models.xlm.modeling_tf_xlm.TFXLMTransformerFFN +class TFFlaubertTransformerFFN(keras.layers.Layer): + def __init__(self, in_dim, dim_hidden, out_dim, config, **kwargs): + super().__init__(**kwargs) + + self.lin1 = 
keras.layers.Dense(dim_hidden, kernel_initializer=get_initializer(config.init_std), name="lin1") + self.lin2 = keras.layers.Dense(out_dim, kernel_initializer=get_initializer(config.init_std), name="lin2") + self.act = get_tf_activation("gelu") if config.gelu_activation else get_tf_activation("relu") + self.dropout = keras.layers.Dropout(config.dropout) + self.in_dim = in_dim + self.dim_hidden = dim_hidden + + def call(self, input, training=False): + x = self.lin1(input) + x = self.act(x) + x = self.lin2(x) + x = self.dropout(x, training=training) + + return x + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "lin1", None) is not None: + with tf.name_scope(self.lin1.name): + self.lin1.build([None, None, self.in_dim]) + if getattr(self, "lin2", None) is not None: + with tf.name_scope(self.lin2.name): + self.lin2.build([None, None, self.dim_hidden]) + + +@keras_serializable +class TFFlaubertMainLayer(keras.layers.Layer): + config_class = FlaubertConfig + + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + + self.config = config + self.n_heads = config.n_heads + self.n_langs = config.n_langs + self.dim = config.emb_dim + self.hidden_dim = self.dim * 4 + self.n_words = config.n_words + self.pad_index = config.pad_index + self.causal = config.causal + self.n_layers = config.n_layers + self.use_lang_emb = config.use_lang_emb + self.layerdrop = getattr(config, "layerdrop", 0.0) + self.pre_norm = getattr(config, "pre_norm", False) + self.output_attentions = config.output_attentions + self.output_hidden_states = config.output_hidden_states + self.return_dict = config.use_return_dict + self.max_position_embeddings = config.max_position_embeddings + self.embed_init_std = config.embed_init_std + self.dropout = keras.layers.Dropout(config.dropout) + self.embeddings = TFSharedEmbeddings( + self.n_words, self.dim, initializer_range=config.embed_init_std, name="embeddings" + ) + self.layer_norm_emb = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm_emb") + self.attentions = [] + self.layer_norm1 = [] + self.ffns = [] + self.layer_norm2 = [] + + for i in range(self.n_layers): + self.attentions.append( + TFFlaubertMultiHeadAttention(self.n_heads, self.dim, config=config, name=f"attentions_._{i}") + ) + self.layer_norm1.append( + keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name=f"layer_norm1_._{i}") + ) + # if self.is_decoder: + # self.layer_norm15.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps)) + # self.encoder_attn.append(MultiHeadAttention(self.n_heads, self.dim, dropout=self.attention_dropout)) + self.ffns.append( + TFFlaubertTransformerFFN(self.dim, self.hidden_dim, self.dim, config=config, name=f"ffns_._{i}") + ) + self.layer_norm2.append( + keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name=f"layer_norm2_._{i}") + ) + + def build(self, input_shape=None): + with tf.name_scope("position_embeddings"): + self.position_embeddings = self.add_weight( + name="embeddings", + shape=[self.max_position_embeddings, self.dim], + initializer=get_initializer(self.embed_init_std), + ) + + if self.n_langs > 1 and self.use_lang_emb: + with tf.name_scope("lang_embeddings"): + self.lang_embeddings = self.add_weight( + name="embeddings", + shape=[self.n_langs, self.dim], + initializer=get_initializer(self.embed_init_std), + ) + + if self.built: + return + self.built = True + if getattr(self, "embeddings", None) is not None: + with tf.name_scope(self.embeddings.name): + 
self.embeddings.build(None) + if getattr(self, "layer_norm_emb", None) is not None: + with tf.name_scope(self.layer_norm_emb.name): + self.layer_norm_emb.build([None, None, self.dim]) + for layer in self.attentions: + with tf.name_scope(layer.name): + layer.build(None) + for layer in self.layer_norm1: + with tf.name_scope(layer.name): + layer.build([None, None, self.dim]) + for layer in self.ffns: + with tf.name_scope(layer.name): + layer.build(None) + for layer in self.layer_norm2: + with tf.name_scope(layer.name): + layer.build([None, None, self.dim]) + + def get_input_embeddings(self): + return self.embeddings + + def set_input_embeddings(self, value): + self.embeddings.weight = value + self.embeddings.vocab_size = shape_list(value)[0] + + @unpack_inputs + def call( + self, + input_ids: np.ndarray | tf.Tensor | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + langs: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + lengths: np.ndarray | tf.Tensor | None = None, + cache: dict[str, tf.Tensor] | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool | None = False, + ) -> tuple | TFBaseModelOutput: + # removed: src_enc=None, src_len=None + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + bs, slen = shape_list(input_ids) + elif inputs_embeds is not None: + bs, slen = shape_list(inputs_embeds)[:2] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if lengths is None: + if input_ids is not None: + lengths = tf.reduce_sum( + tf.cast(tf.not_equal(input_ids, self.pad_index), dtype=input_ids.dtype), axis=1 + ) + else: + lengths = tf.convert_to_tensor([slen] * bs) + # mask = input_ids != self.pad_index + + # check inputs + # assert shape_list(lengths)[0] == bs + ( + tf.debugging.assert_equal(shape_list(lengths)[0], bs), + f"Expected batch size {shape_list(lengths)[0]} and received batch size {bs} mismatched", + ) + # assert lengths.max().item() <= slen + # input_ids = input_ids.transpose(0, 1) # batch size as dimension 0 + # assert (src_enc is None) == (src_len is None) + # if src_enc is not None: + # assert self.is_decoder + # assert src_enc.size(0) == bs + + # generate masks + mask, attn_mask = get_masks(slen, lengths, self.causal, padding_mask=attention_mask) + # if self.is_decoder and src_enc is not None: + # src_mask = torch.arange(src_len.max(), dtype=torch.long, device=lengths.device) < src_len[:, None] + + # position_ids + if position_ids is None: + position_ids = tf.expand_dims(tf.range(slen), axis=0) + position_ids = tf.tile(position_ids, (bs, 1)) + + # assert shape_list(position_ids) == [bs, slen] # (slen, bs) + ( + tf.debugging.assert_equal(shape_list(position_ids), [bs, slen]), + f"Position id shape {shape_list(position_ids)} and input shape {[bs, slen]} mismatched", + ) + # position_ids = position_ids.transpose(0, 1) + + # langs + if langs is not None: + # assert shape_list(langs) == [bs, slen] # (slen, bs) + ( + tf.debugging.assert_equal(shape_list(langs), [bs, slen]), + f"Lang shape {shape_list(langs)} and input shape {[bs, slen]} mismatched", + ) + # langs = langs.transpose(0, 1) + + # Prepare head mask if 
needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x qlen x klen] + if head_mask is not None: + raise NotImplementedError + else: + head_mask = [None] * self.n_layers + + # do not recompute cached elements + if cache is not None and input_ids is not None: + _slen = slen - cache["slen"] + input_ids = input_ids[:, -_slen:] + position_ids = position_ids[:, -_slen:] + if langs is not None: + langs = langs[:, -_slen:] + mask = mask[:, -_slen:] + attn_mask = attn_mask[:, -_slen:] + + # embeddings + if inputs_embeds is None: + check_embeddings_within_bounds(input_ids, self.embeddings.vocab_size) + inputs_embeds = self.embeddings(input_ids) + + tensor = inputs_embeds + tf.gather(self.position_embeddings, position_ids) + + if langs is not None and self.use_lang_emb: + tensor = tensor + tf.gather(self.lang_embeddings, langs) + if token_type_ids is not None: + tensor = tensor + self.embeddings(token_type_ids) + + tensor = self.layer_norm_emb(tensor) + tensor = self.dropout(tensor, training=training) + mask = tf.cast(mask, dtype=tensor.dtype) + tensor = tensor * tf.expand_dims(mask, axis=-1) + + # hidden_states and attentions cannot be None in graph mode. + hidden_states = () if output_hidden_states else None + attentions = () if output_attentions else None + + # transformer layers + for i in range(self.n_layers): + # LayerDrop + dropout_probability = random.uniform(0, 1) + + if training and (dropout_probability < self.layerdrop): + continue + + if output_hidden_states: + hidden_states = hidden_states + (tensor,) + + # self attention + if not self.pre_norm: + attn_outputs = self.attentions[i]( + tensor, + attn_mask, + None, + cache, + head_mask[i], + output_attentions, + training=training, + ) + attn = attn_outputs[0] + + if output_attentions: + attentions = attentions + (attn_outputs[1],) + + attn = self.dropout(attn, training=training) + tensor = tensor + attn + tensor = self.layer_norm1[i](tensor) + else: + tensor_normalized = self.layer_norm1[i](tensor) + attn_outputs = self.attentions[i]( + tensor_normalized, + attn_mask, + None, + cache, + head_mask[i], + output_attentions, + training=training, + ) + attn = attn_outputs[0] + + if output_attentions: + attentions = attentions + (attn_outputs[1],) + + attn = self.dropout(attn, training=training) + tensor = tensor + attn + + # encoder attention (for decoder only) + # if self.is_decoder and src_enc is not None: + # attn = self.encoder_attn[i](tensor, src_mask, kv=src_enc, cache=cache) + # attn = nn.functional.dropout(attn, p=self.dropout, training=self.training) + # tensor = tensor + attn + # tensor = self.layer_norm15[i](tensor) + + # FFN + if not self.pre_norm: + tensor = tensor + self.ffns[i](tensor) + tensor = self.layer_norm2[i](tensor) + else: + tensor_normalized = self.layer_norm2[i](tensor) + tensor = tensor + self.ffns[i](tensor_normalized) + + tensor = tensor * tf.expand_dims(mask, axis=-1) + + # Add last hidden state + if output_hidden_states: + hidden_states = hidden_states + (tensor,) + + # update cache length + if cache is not None: + cache["slen"] += tensor.size(1) + + # move back sequence length to dimension 0 + # tensor = tensor.transpose(0, 1) + + if not return_dict: + return tuple(v for v in [tensor, hidden_states, attentions] if v is not None) + + return TFBaseModelOutput(last_hidden_state=tensor, 
hidden_states=hidden_states, attentions=attentions) + + +# Copied from transformers.models.xlm.modeling_tf_xlm.TFXLMPredLayer +class TFFlaubertPredLayer(keras.layers.Layer): + """ + Prediction layer (cross_entropy or adaptive_softmax). + """ + + def __init__(self, config, input_embeddings, **kwargs): + super().__init__(**kwargs) + + self.asm = config.asm + self.n_words = config.n_words + self.pad_index = config.pad_index + + if config.asm is False: + self.input_embeddings = input_embeddings + else: + raise NotImplementedError + # self.proj = nn.AdaptiveLogSoftmaxWithLoss( + # in_features=dim, + # n_classes=config.n_words, + # cutoffs=config.asm_cutoffs, + # div_value=config.asm_div_value, + # head_bias=True, # default is False + # ) + + def build(self, input_shape): + # The output weights are the same as the input embeddings, but there is an output-only bias for each token. + self.bias = self.add_weight(shape=(self.n_words,), initializer="zeros", trainable=True, name="bias") + + super().build(input_shape) + + def get_output_embeddings(self): + return self.input_embeddings + + def set_output_embeddings(self, value): + self.input_embeddings.weight = value + self.input_embeddings.vocab_size = shape_list(value)[0] + + def get_bias(self): + return {"bias": self.bias} + + def set_bias(self, value): + self.bias = value["bias"] + self.vocab_size = shape_list(value["bias"])[0] + + def call(self, hidden_states): + hidden_states = self.input_embeddings(hidden_states, mode="linear") + hidden_states = hidden_states + self.bias + + return hidden_states + + +@dataclass +class TFFlaubertWithLMHeadModelOutput(ModelOutput): + """ + Base class for [`TFFlaubertWithLMHeadModel`] outputs. + + Args: + logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + logits: tf.Tensor | None = None + hidden_states: tuple[tf.Tensor] | None = None + attentions: tuple[tf.Tensor] | None = None + + +@add_start_docstrings( + """ + The Flaubert Model transformer with a language modeling head on top (linear layer with weights tied to the input + embeddings). 
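+
+    A minimal usage sketch (illustrative only, not an upstream code sample; it assumes the
+    `flaubert/flaubert_base_cased` checkpoint and an arbitrary masked sentence):
+
+    ```python
+    import tensorflow as tf
+    from transformers import AutoTokenizer, TFFlaubertWithLMHeadModel
+
+    tokenizer = AutoTokenizer.from_pretrained("flaubert/flaubert_base_cased")
+    model = TFFlaubertWithLMHeadModel.from_pretrained("flaubert/flaubert_base_cased")
+
+    inputs = tokenizer(f"Le camembert est {tokenizer.mask_token} !", return_tensors="tf")
+    logits = model(**inputs).logits  # (batch_size, sequence_length, vocab_size)
+
+    # most likely token at the masked position
+    mask_index = tf.where(inputs["input_ids"][0] == tokenizer.mask_token_id)[0, 0]
+    predicted_id = int(tf.argmax(logits[0, mask_index]))
+    tokenizer.decode([predicted_id])
+    ```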
+ """, + FLAUBERT_START_DOCSTRING, +) +class TFFlaubertWithLMHeadModel(TFFlaubertPreTrainedModel): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.transformer = TFFlaubertMainLayer(config, name="transformer") + self.pred_layer = TFFlaubertPredLayer(config, self.transformer.embeddings, name="pred_layer_._proj") + # Flaubert does not have past caching features + self.supports_xla_generation = False + + def get_lm_head(self): + return self.pred_layer + + def get_prefix_bias_name(self): + warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning) + return self.name + "/" + self.pred_layer.name + + def prepare_inputs_for_generation(self, inputs, **kwargs): + mask_token_id = self.config.mask_token_id + lang_id = self.config.lang_id + + effective_batch_size = inputs.shape[0] + mask_token = tf.fill((effective_batch_size, 1), 1) * mask_token_id + inputs = tf.concat([inputs, mask_token], axis=1) + + if lang_id is not None: + langs = tf.ones_like(inputs) * lang_id + else: + langs = None + return {"input_ids": inputs, "langs": langs} + + @unpack_inputs + @add_start_docstrings_to_model_forward(FLAUBERT_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFFlaubertWithLMHeadModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: np.ndarray | tf.Tensor | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + langs: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + lengths: np.ndarray | tf.Tensor | None = None, + cache: dict[str, tf.Tensor] | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + training: bool | None = False, + ) -> tuple | TFFlaubertWithLMHeadModelOutput: + transformer_outputs = self.transformer( + input_ids=input_ids, + attention_mask=attention_mask, + langs=langs, + token_type_ids=token_type_ids, + position_ids=position_ids, + lengths=lengths, + cache=cache, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + output = transformer_outputs[0] + outputs = self.pred_layer(output) + + if not return_dict: + return (outputs,) + transformer_outputs[1:] + + return TFFlaubertWithLMHeadModelOutput( + logits=outputs, hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "transformer", None) is not None: + with tf.name_scope(self.transformer.name): + self.transformer.build(None) + if getattr(self, "pred_layer", None) is not None: + with tf.name_scope(self.pred_layer.name): + self.pred_layer.build(None) + + +@add_start_docstrings( + """ + Flaubert Model with a sequence classification/regression head on top (a linear layer on top of the pooled output) + e.g. for GLUE tasks. 
+ """, + FLAUBERT_START_DOCSTRING, +) +# Copied from transformers.models.xlm.modeling_tf_xlm.TFXLMForSequenceClassification with XLM_INPUTS->FLAUBERT_INPUTS,XLM->Flaubert +class TFFlaubertForSequenceClassification(TFFlaubertPreTrainedModel, TFSequenceClassificationLoss): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.num_labels = config.num_labels + + self.transformer = TFFlaubertMainLayer(config, name="transformer") + self.sequence_summary = TFSequenceSummary(config, initializer_range=config.init_std, name="sequence_summary") + + @unpack_inputs + @add_start_docstrings_to_model_forward(FLAUBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFSequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + langs: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + lengths: np.ndarray | tf.Tensor | None = None, + cache: dict[str, tf.Tensor] | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + labels: np.ndarray | tf.Tensor | None = None, + training: bool = False, + ) -> TFSequenceClassifierOutput | tuple[tf.Tensor]: + r""" + labels (`tf.Tensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + transformer_outputs = self.transformer( + input_ids=input_ids, + attention_mask=attention_mask, + langs=langs, + token_type_ids=token_type_ids, + position_ids=position_ids, + lengths=lengths, + cache=cache, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + output = transformer_outputs[0] + + logits = self.sequence_summary(output) + + loss = None if labels is None else self.hf_compute_loss(labels, logits) + + if not return_dict: + output = (logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return TFSequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "transformer", None) is not None: + with tf.name_scope(self.transformer.name): + self.transformer.build(None) + if getattr(self, "sequence_summary", None) is not None: + with tf.name_scope(self.sequence_summary.name): + self.sequence_summary.build(None) + + +@add_start_docstrings( + """ + Flaubert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layer on top of the hidden-states output to compute `span start logits` and `span end logits`). 
+ """, + FLAUBERT_START_DOCSTRING, +) +# Copied from transformers.models.xlm.modeling_tf_xlm.TFXLMForQuestionAnsweringSimple with XLM_INPUTS->FLAUBERT_INPUTS,XLM->Flaubert +class TFFlaubertForQuestionAnsweringSimple(TFFlaubertPreTrainedModel, TFQuestionAnsweringLoss): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.transformer = TFFlaubertMainLayer(config, name="transformer") + self.qa_outputs = keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.init_std), name="qa_outputs" + ) + self.config = config + + @unpack_inputs + @add_start_docstrings_to_model_forward(FLAUBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFQuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + langs: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + lengths: np.ndarray | tf.Tensor | None = None, + cache: dict[str, tf.Tensor] | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + start_positions: np.ndarray | tf.Tensor | None = None, + end_positions: np.ndarray | tf.Tensor | None = None, + training: bool = False, + ) -> TFQuestionAnsweringModelOutput | tuple[tf.Tensor]: + r""" + start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. 
+ """ + transformer_outputs = self.transformer( + input_ids=input_ids, + attention_mask=attention_mask, + langs=langs, + token_type_ids=token_type_ids, + position_ids=position_ids, + lengths=lengths, + cache=cache, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + sequence_output = transformer_outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = tf.split(logits, 2, axis=-1) + start_logits = tf.squeeze(start_logits, axis=-1) + end_logits = tf.squeeze(end_logits, axis=-1) + + loss = None + if start_positions is not None and end_positions is not None: + labels = {"start_position": start_positions} + labels["end_position"] = end_positions + loss = self.hf_compute_loss(labels, (start_logits, end_logits)) + + if not return_dict: + output = (start_logits, end_logits) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return TFQuestionAnsweringModelOutput( + loss=loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "transformer", None) is not None: + with tf.name_scope(self.transformer.name): + self.transformer.build(None) + if getattr(self, "qa_outputs", None) is not None: + with tf.name_scope(self.qa_outputs.name): + self.qa_outputs.build([None, None, self.config.hidden_size]) + + +@add_start_docstrings( + """ + Flaubert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for + Named-Entity-Recognition (NER) tasks. 
+ """, + FLAUBERT_START_DOCSTRING, +) +# Copied from transformers.models.xlm.modeling_tf_xlm.TFXLMForTokenClassification with XLM_INPUTS->FLAUBERT_INPUTS,XLM->Flaubert +class TFFlaubertForTokenClassification(TFFlaubertPreTrainedModel, TFTokenClassificationLoss): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.num_labels = config.num_labels + + self.transformer = TFFlaubertMainLayer(config, name="transformer") + self.dropout = keras.layers.Dropout(config.dropout) + self.classifier = keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.init_std), name="classifier" + ) + self.config = config + + @unpack_inputs + @add_start_docstrings_to_model_forward(FLAUBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFTokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + langs: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + lengths: np.ndarray | tf.Tensor | None = None, + cache: dict[str, tf.Tensor] | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + labels: np.ndarray | tf.Tensor | None = None, + training: bool = False, + ) -> TFTokenClassifierOutput | tuple[tf.Tensor]: + r""" + labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. + """ + transformer_outputs = self.transformer( + input_ids=input_ids, + attention_mask=attention_mask, + langs=langs, + token_type_ids=token_type_ids, + position_ids=position_ids, + lengths=lengths, + cache=cache, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + sequence_output = transformer_outputs[0] + + sequence_output = self.dropout(sequence_output, training=training) + logits = self.classifier(sequence_output) + + loss = None if labels is None else self.hf_compute_loss(labels, logits) + + if not return_dict: + output = (logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return TFTokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "transformer", None) is not None: + with tf.name_scope(self.transformer.name): + self.transformer.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build([None, None, self.config.hidden_size]) + + +@add_start_docstrings( + """ + Flaubert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a + softmax) e.g. for RocStories/SWAG tasks. 
+ """, + FLAUBERT_START_DOCSTRING, +) +# Copied from transformers.models.xlm.modeling_tf_xlm.TFXLMForMultipleChoice with XLM_INPUTS->FLAUBERT_INPUTS,XLM->Flaubert +class TFFlaubertForMultipleChoice(TFFlaubertPreTrainedModel, TFMultipleChoiceLoss): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.transformer = TFFlaubertMainLayer(config, name="transformer") + self.sequence_summary = TFSequenceSummary(config, initializer_range=config.init_std, name="sequence_summary") + self.logits_proj = keras.layers.Dense( + 1, kernel_initializer=get_initializer(config.initializer_range), name="logits_proj" + ) + self.config = config + + @property + def dummy_inputs(self): + """ + Dummy inputs to build the network. + + Returns: + tf.Tensor with dummy inputs + """ + # Sometimes Flaubert has language embeddings so don't forget to build them as well if needed + if self.config.use_lang_emb and self.config.n_langs > 1: + return { + "input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS, dtype=tf.int32), + "langs": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS, dtype=tf.int32), + } + else: + return { + "input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS, dtype=tf.int32), + } + + @unpack_inputs + @add_start_docstrings_to_model_forward( + FLAUBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") + ) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFMultipleChoiceModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + langs: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + lengths: np.ndarray | tf.Tensor | None = None, + cache: dict[str, tf.Tensor] | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + labels: np.ndarray | tf.Tensor | None = None, + training: bool = False, + ) -> TFMultipleChoiceModelOutput | tuple[tf.Tensor]: + if input_ids is not None: + num_choices = shape_list(input_ids)[1] + seq_length = shape_list(input_ids)[2] + else: + num_choices = shape_list(inputs_embeds)[1] + seq_length = shape_list(inputs_embeds)[2] + + flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None + flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None + flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None + flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None + flat_langs = tf.reshape(langs, (-1, seq_length)) if langs is not None else None + flat_inputs_embeds = ( + tf.reshape(inputs_embeds, (-1, seq_length, shape_list(inputs_embeds)[3])) + if inputs_embeds is not None + else None + ) + + if lengths is not None: + logger.warning( + "The `lengths` parameter cannot be used with the Flaubert multiple choice models. 
Please use the " + "attention mask instead.", + ) + lengths = None + + transformer_outputs = self.transformer( + flat_input_ids, + flat_attention_mask, + flat_langs, + flat_token_type_ids, + flat_position_ids, + lengths, + cache, + head_mask, + flat_inputs_embeds, + output_attentions, + output_hidden_states, + return_dict=return_dict, + training=training, + ) + output = transformer_outputs[0] + logits = self.sequence_summary(output) + logits = self.logits_proj(logits) + reshaped_logits = tf.reshape(logits, (-1, num_choices)) + + loss = None if labels is None else self.hf_compute_loss(labels, reshaped_logits) + + if not return_dict: + output = (reshaped_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return TFMultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "transformer", None) is not None: + with tf.name_scope(self.transformer.name): + self.transformer.build(None) + if getattr(self, "sequence_summary", None) is not None: + with tf.name_scope(self.sequence_summary.name): + self.sequence_summary.build(None) + if getattr(self, "logits_proj", None) is not None: + with tf.name_scope(self.logits_proj.name): + self.logits_proj.build([None, None, self.config.num_labels]) + + +__all__ = [ + "TFFlaubertForMultipleChoice", + "TFFlaubertForQuestionAnsweringSimple", + "TFFlaubertForSequenceClassification", + "TFFlaubertForTokenClassification", + "TFFlaubertModel", + "TFFlaubertPreTrainedModel", + "TFFlaubertWithLMHeadModel", +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/flaubert/tokenization_flaubert.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/flaubert/tokenization_flaubert.py new file mode 100644 index 0000000000000000000000000000000000000000..dee653450ebacc02bcfee8142c5d08b54358c1a9 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/flaubert/tokenization_flaubert.py @@ -0,0 +1,538 @@ +# coding=utf-8 +# Copyright 2019-present CNRS, Facebook Inc. and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for Flaubert.""" + +import json +import os +import re +import unicodedata +from typing import Optional + +from ...tokenization_utils import PreTrainedTokenizer +from ...utils import logging + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = { + "vocab_file": "vocab.json", + "merges_file": "merges.txt", +} + + +def convert_to_unicode(text): + """ + Converts `text` to Unicode (if it's not already), assuming UTF-8 input. 
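+
+    For example (an illustrative note), a UTF-8 encoded `bytes` value such as `"café".encode("utf-8")` and the
+    `str` value `"café"` both come back as the `str` `"café"`; any other input type raises a `TypeError` in the
+    helper below.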
+ """ + + def ensure_text(s, encoding="utf-8", errors="strict"): + if isinstance(s, bytes): + return s.decode(encoding, errors) + elif isinstance(s, str): + return s + else: + raise TypeError(f"not expecting type '{type(s)}'") + + return ensure_text(text, encoding="utf-8", errors="ignore") + + +# Copied from transformers.models.xlm.tokenization_xlm.get_pairs +def get_pairs(word): + """ + Return set of symbol pairs in a word. word is represented as tuple of symbols (symbols being variable-length + strings) + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +# Copied from transformers.models.xlm.tokenization_xlm.replace_unicode_punct +def replace_unicode_punct(text): + """ + Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/replace-unicode-punctuation.perl + """ + text = text.replace(",", ",") + text = re.sub(r"。\s*", ". ", text) + text = text.replace("、", ",") + text = text.replace("”", '"') + text = text.replace("“", '"') + text = text.replace("∶", ":") + text = text.replace(":", ":") + text = text.replace("?", "?") + text = text.replace("《", '"') + text = text.replace("》", '"') + text = text.replace(")", ")") + text = text.replace("!", "!") + text = text.replace("(", "(") + text = text.replace(";", ";") + text = text.replace("1", "1") + text = text.replace("」", '"') + text = text.replace("「", '"') + text = text.replace("0", "0") + text = text.replace("3", "3") + text = text.replace("2", "2") + text = text.replace("5", "5") + text = text.replace("6", "6") + text = text.replace("9", "9") + text = text.replace("7", "7") + text = text.replace("8", "8") + text = text.replace("4", "4") + text = re.sub(r".\s*", ". ", text) + text = text.replace("~", "~") + text = text.replace("’", "'") + text = text.replace("…", "...") + text = text.replace("━", "-") + text = text.replace("〈", "<") + text = text.replace("〉", ">") + text = text.replace("【", "[") + text = text.replace("】", "]") + text = text.replace("%", "%") + return text + + +# Copied from transformers.models.xlm.tokenization_xlm.remove_non_printing_char +def remove_non_printing_char(text): + """ + Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/remove-non-printing-char.perl + """ + output = [] + for char in text: + cat = unicodedata.category(char) + if cat.startswith("C"): + continue + output.append(char) + return "".join(output) + + +class FlaubertTokenizer(PreTrainedTokenizer): + """ + Construct a Flaubert tokenizer. Based on Byte-Pair Encoding. The tokenization process is the following: + + - Moses preprocessing and tokenization. + - Normalizing all inputs text. + - The arguments `special_tokens` and the function `set_special_tokens`, can be used to add additional symbols (like + "__classify__") to a vocabulary. + - The argument `do_lowercase` controls lower casing (automatically set for pretrained vocabularies). + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + Vocabulary file. + merges_file (`str`): + Merges file. + do_lowercase (`bool`, *optional*, defaults to `False`): + Controls lower casing. + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. 
+        bos_token (`str`, *optional*, defaults to `"<s>"`):
+            The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
+
+            <Tip>
+
+            When building a sequence using special tokens, this is not the token that is used for the beginning of
+            sequence. The token used is the `cls_token`.
+
+            </Tip>
+
+        sep_token (`str`, *optional*, defaults to `"</s>"`):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+            sequence classification or for a text and a question for question answering. It is also used as the last
+            token of a sequence built with special tokens.
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        cls_token (`str`, *optional*, defaults to `"</s>"`):
+            The classifier token which is used when doing sequence classification (classification of the whole sequence
+            instead of per-token classification). It is the first token of the sequence when built with special tokens.
+        mask_token (`str`, *optional*, defaults to `"<special1>"`):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+        additional_special_tokens (`List[str]`, *optional*, defaults to `['<special0>', '<special1>', '<special2>', '<special3>', '<special4>', '<special5>', '<special6>', '<special7>', '<special8>', '<special9>']`):
+            List of additional special tokens.
+        lang2id (`Dict[str, int]`, *optional*):
+            Dictionary mapping language string identifiers to their IDs.
+        id2lang (`Dict[int, str]`, *optional*):
+            Dictionary mapping language IDs to their string identifiers.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+
+    def __init__(
+        self,
+        vocab_file,
+        merges_file,
+        do_lowercase=False,
+        unk_token="<unk>",
+        bos_token="<s>",
+        sep_token="</s>",
+        pad_token="<pad>",
+        cls_token="</s>",
+        mask_token="<special1>",
+        additional_special_tokens=[
+            "<special0>",
+            "<special1>",
+            "<special2>",
+            "<special3>",
+            "<special4>",
+            "<special5>",
+            "<special6>",
+            "<special7>",
+            "<special8>",
+            "<special9>",
+        ],
+        lang2id=None,
+        id2lang=None,
+        **kwargs,
+    ):
+        do_lowercase_and_remove_accent = kwargs.pop("do_lowercase_and_remove_accent", None)
+        if do_lowercase_and_remove_accent is not None:
+            logger.warning(
+                "`do_lowercase_and_remove_accent` is passed as a keyword argument, but this won't do anything."
+                " `FlaubertTokenizer` will always set it to `False`."
+            )
+        # always `False`
+        self.do_lowercase_and_remove_accent = False
+
+        self.do_lowercase = do_lowercase
+
+        try:
+            import sacremoses
+        except ImportError:
+            raise ImportError(
+                "You need to install sacremoses to use FlaubertTokenizer. "
+                "See https://pypi.org/project/sacremoses/ for installation."
+ ) + + self.sm = sacremoses + + # cache of sm.MosesPunctNormalizer instance + self.cache_moses_punct_normalizer = {} + # cache of sm.MosesTokenizer instance + self.cache_moses_tokenizer = {} + self.lang_with_custom_tokenizer = {"zh", "th", "ja"} + self.lang2id = lang2id + self.id2lang = id2lang + if lang2id is not None and id2lang is not None: + assert len(lang2id) == len(id2lang) + + self.ja_word_tokenizer = None + self.zh_word_tokenizer = None + + with open(vocab_file, encoding="utf-8") as vocab_handle: + self.encoder = json.load(vocab_handle) + self.decoder = {v: k for k, v in self.encoder.items()} + with open(merges_file, encoding="utf-8") as merges_handle: + merges = merges_handle.read().split("\n")[:-1] + merges = [tuple(merge.split()[:2]) for merge in merges] + self.bpe_ranks = dict(zip(merges, range(len(merges)))) + self.cache = {} + + super().__init__( + do_lowercase=do_lowercase, + unk_token=unk_token, + bos_token=bos_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + additional_special_tokens=additional_special_tokens, + lang2id=lang2id, + id2lang=id2lang, + **kwargs, + ) + + @property + # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.do_lower_case + def do_lower_case(self): + return self.do_lowercase_and_remove_accent + + # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.moses_punct_norm + def moses_punct_norm(self, text, lang): + if lang not in self.cache_moses_punct_normalizer: + punct_normalizer = self.sm.MosesPunctNormalizer(lang=lang) + self.cache_moses_punct_normalizer[lang] = punct_normalizer + else: + punct_normalizer = self.cache_moses_punct_normalizer[lang] + return punct_normalizer.normalize(text) + + # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.moses_tokenize + def moses_tokenize(self, text, lang): + if lang not in self.cache_moses_tokenizer: + moses_tokenizer = self.sm.MosesTokenizer(lang=lang) + self.cache_moses_tokenizer[lang] = moses_tokenizer + else: + moses_tokenizer = self.cache_moses_tokenizer[lang] + return moses_tokenizer.tokenize(text, return_str=False, escape=False) + + # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.moses_pipeline + def moses_pipeline(self, text, lang): + text = replace_unicode_punct(text) + text = self.moses_punct_norm(text, lang) + text = remove_non_printing_char(text) + return text + + # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.ja_tokenize + def ja_tokenize(self, text): + if self.ja_word_tokenizer is None: + try: + import Mykytea + + self.ja_word_tokenizer = Mykytea.Mykytea( + f"-model {os.path.expanduser('~')}/local/share/kytea/model.bin" + ) + except (AttributeError, ImportError): + logger.error( + "Make sure you install KyTea (https://github.com/neubig/kytea) and it's python wrapper" + " (https://github.com/chezou/Mykytea-python) with the following steps" + ) + logger.error("1. git clone git@github.com:neubig/kytea.git && cd kytea") + logger.error("2. autoreconf -i") + logger.error("3. ./configure --prefix=$HOME/local") + logger.error("4. make && make install") + logger.error("5. 
pip install kytea") + raise + return list(self.ja_word_tokenizer.getWS(text)) + + @property + # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.vocab_size + def vocab_size(self): + return len(self.encoder) + + # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.get_vocab + def get_vocab(self): + return dict(self.encoder, **self.added_tokens_encoder) + + # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.bpe + def bpe(self, token): + word = tuple(token[:-1]) + (token[-1] + "",) + if token in self.cache: + return self.cache[token] + pairs = get_pairs(word) + + if not pairs: + return token + "" + + while True: + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + except ValueError: + new_word.extend(word[i:]) + break + else: + new_word.extend(word[i:j]) + i = j + + if word[i] == first and i < len(word) - 1 and word[i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = " ".join(word) + if word == "\n ": + word = "\n" + self.cache[token] = word + return word + + def preprocess_text(self, text): + text = text.replace("``", '"').replace("''", '"') + text = convert_to_unicode(text) + text = unicodedata.normalize("NFC", text) + + if self.do_lowercase: + text = text.lower() + + return text + + def _tokenize(self, text, bypass_tokenizer=False): + """ + Tokenize a string given language code using Moses. + + Details of tokenization: + + - [sacremoses](https://github.com/alvations/sacremoses): port of Moses + - Install with `pip install sacremoses` + + Args: + - bypass_tokenizer: Allow users to preprocess and tokenize the sentences externally (default = False) + (bool). If True, we only apply BPE. + + Returns: + List of tokens. + """ + lang = "fr" + if lang and self.lang2id and lang not in self.lang2id: + logger.error( + "Supplied language code not found in lang2id mapping. Please check that your language is supported by" + " the loaded pretrained model." 
+            )
+
+        if bypass_tokenizer:
+            text = text.split()
+        else:
+            text = self.preprocess_text(text)
+            text = self.moses_pipeline(text, lang=lang)
+            text = self.moses_tokenize(text, lang=lang)
+
+        split_tokens = []
+        for token in text:
+            if token:
+                split_tokens.extend(list(self.bpe(token).split(" ")))
+
+        return split_tokens
+
+    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer._convert_token_to_id
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) in an id using the vocab."""
+        return self.encoder.get(token, self.encoder.get(self.unk_token))
+
+    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer._convert_id_to_token
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        return self.decoder.get(index, self.unk_token)
+
+    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.convert_tokens_to_string
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (string) in a single string."""
+        out_string = "".join(tokens).replace("</w>", " ").strip()
+        return out_string
+
+    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.build_inputs_with_special_tokens
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+        adding special tokens. An XLM sequence has the following format:
+
+        - single sequence: `<s> X </s>`
+        - pair of sequences: `<s> A </s> B </s>`
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+
+        """
+        bos = [self.bos_token_id]
+        sep = [self.sep_token_id]
+
+        if token_ids_1 is None:
+            return bos + token_ids_0 + sep
+        return bos + token_ids_0 + sep + token_ids_1 + sep
+
+    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.get_special_tokens_mask
+    def get_special_tokens_mask(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
+    ) -> list[int]:
+        """
+        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer `prepare_for_model` method.
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+
+        Returns:
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+ """ + + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + + # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.save_vocabulary + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + merge_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"] + ) + + with open(vocab_file, "w", encoding="utf-8") as f: + f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n") + + index = 0 + with open(merge_file, "w", encoding="utf-8") as writer: + for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning( + f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive." + " Please check that the tokenizer is not corrupted!" + ) + index = token_index + writer.write(" ".join(bpe_tokens) + "\n") + index += 1 + + return vocab_file, merge_file + + # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.__getstate__ + def __getstate__(self): + state = self.__dict__.copy() + state["sm"] = None + return state + + # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.__setstate__ + def __setstate__(self, d): + self.__dict__ = d + + try: + import sacremoses + except ImportError: + raise ImportError( + "You need to install sacremoses to use XLMTokenizer. " + "See https://pypi.org/project/sacremoses/ for installation." + ) + + self.sm = sacremoses + + +__all__ = ["FlaubertTokenizer"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/florence2/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/florence2/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..686295d1fbd952e124bab885980ac682261a07fb --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/florence2/__init__.py @@ -0,0 +1,28 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_florence2 import * + from .modeling_florence2 import * + from .processing_florence2 import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/florence2/configuration_florence2.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/florence2/configuration_florence2.py new file mode 100644 index 0000000000000000000000000000000000000000..4bbd4b3a03e9f2c591e77689e1606c60214ab430 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/florence2/configuration_florence2.py @@ -0,0 +1,215 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/florence2/modular_florence2.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_florence2.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2025 Microsoft and the HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from ...configuration_utils import PretrainedConfig +from ...utils import logging +from ..auto import CONFIG_MAPPING, AutoConfig + + +logger = logging.get_logger(__name__) + + +class Florence2VisionConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Florence2VisionModel`]. It is used to instantiate a Florence2VisionModel + according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the Florence2VisionModel architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + in_channels (`int`, *optional*, defaults to 3): + Number of input image channels. + depths (`Tuple[int]`, *optional*, defaults to `(1, 1, 9, 1)`): + The depth of the model. + patch_size (`Tuple[int]`, *optional*, defaults to `(7, 3, 3, 3)`): + The patch size of the image. + patch_stride (`Tuple[int]`, *optional*, defaults to `(4, 2, 2, 2)`): + The patch stride of the image. + patch_padding (`Tuple[int]`, *optional*, defaults to `(3, 1, 1, 1)`): + The patch padding of the image. + patch_prenorm (`Tuple[bool]`, *optional*, defaults to `(False, True, True, True)`): + Whether to apply layer normalization before the patch embedding layer. + embed_dim (`Tuple[int]`, *optional*, defaults to `(128, 256, 512, 1024)`): + The dimension of the embedding layer. 
+ num_heads (`Tuple[int]`, *optional*, defaults to `(4, 8, 16, 32)`): + The number of attention heads. + num_groups (`Tuple[int]`, *optional*, defaults to `(4, 8, 16, 32)`): + The number of groups. + window_size (`int`, *optional*, defaults to 12): + The window size of the model. + drop_path_rate (`float`, *optional*, defaults to 0.1): + The dropout rate of the drop path layer. + mlp_ratio (`int`, *optional*, defaults to 4.0): + Ratio of mlp hidden dim to embedding dim. + qkv_bias (`bool`, *optional*, defaults to `True`): + If True, add a learnable bias to query, key, value. + activation_function (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + projection_dim (`int`, *optional*, defaults to 1024): + The dimension of the projection layer. + max_temporal_embeddings (`int`, *optional*, defaults to 100): + The configuration of the visual temporal embedding. + max_position_embeddings (`int`, *optional*, defaults to 50): + The configuration of the image position embedding. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + Example: + + ```python + >>> from transformers import Florence2VisionConfig, Florence2VisionModel + + >>> # Initializing a Florence2 Vision style configuration + >>> configuration = Florence2VisionConfig() + + >>> # Initializing a model (with random weights) + >>> model = Florence2VisionModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "florence_vision" + + def __init__( + self, + in_channels=3, + depths=(1, 1, 9, 1), + patch_size=(7, 3, 3, 3), + patch_stride=(4, 2, 2, 2), + patch_padding=(3, 1, 1, 1), + patch_prenorm=(False, True, True, True), + embed_dim=(128, 256, 512, 1024), + num_heads=(4, 8, 16, 32), + num_groups=(4, 8, 16, 32), + window_size=12, + drop_path_rate=0.1, + mlp_ratio=4.0, + qkv_bias=True, + activation_function="gelu", + projection_dim=1024, + max_temporal_embeddings=100, + max_position_embeddings=50, + initializer_range=0.02, + **kwargs, + ): + self.in_channels = in_channels + self.depths = list(depths) + self.patch_size = list(patch_size) + self.patch_stride = list(patch_stride) + self.patch_padding = list(patch_padding) + self.patch_prenorm = list(patch_prenorm) + self.embed_dim = list(embed_dim) + self.num_heads = list(num_heads) + self.num_groups = list(num_groups) + self.window_size = window_size + self.drop_path_rate = drop_path_rate + self.mlp_ratio = mlp_ratio + self.qkv_bias = qkv_bias + self.projection_dim = projection_dim + self.max_temporal_embeddings = max_temporal_embeddings + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.activation_function = activation_function + + super().__init__(**kwargs) + + +class Florence2Config(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Florence2ForConditionalGeneration`]. It is used to instantiate an + Florence-2 model according to the specified arguments, defining the model architecture. + + Instantiating a configuration with the defaults will yield a similar configuration to that of the Florence-2 + [microsoft/Florence-2-base](https://huggingface.co/microsoft/Florence-2-base) architecture. 
+ + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + text_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`AutoConfig`]. + vision_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`Florence2VisionConfig`]. + image_token_id (`int`, *optional*, defaults to 51289): + The image token index to encode the image prompt. + is_encoder_decoder (bool, optional, *optional*, defaults to `True`): + Whether the model is used as an encoder/decoder or not. + + Example: + + ```python + >>> from transformers import Florence2ForConditionalGeneration, Florence2Config, CLIPVisionConfig, BartConfig + + >>> # Initializing a clip-like vision config + >>> vision_config = CLIPVisionConfig() + + >>> # Initializing a Bart config + >>> text_config = BartConfig() + + >>> # Initializing a Florence-2 configuration + >>> configuration = Florence2Config(vision_config, text_config) + + >>> # Initializing a model from the florence-2 configuration + >>> model = Florence2ForConditionalGeneration(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "florence2" + sub_configs = { + "text_config": AutoConfig, + "vision_config": Florence2VisionConfig, + } + + def __init__( + self, + text_config=None, + vision_config=None, + image_token_id=51289, + is_encoder_decoder=True, + **kwargs, + ): + if isinstance(text_config, dict): + text_config["model_type"] = text_config.get("model_type", "bart") + text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) + elif text_config is None: + text_config = CONFIG_MAPPING["bart"]() + + if isinstance(vision_config, dict): + vision_config = Florence2VisionConfig(**vision_config) + elif vision_config is None: + logger.info("vision_config is None. Initializing the Florence2VisionConfig with default values.") + vision_config = Florence2VisionConfig() + + self.text_config = text_config + self.vision_config = vision_config + self.image_token_id = image_token_id + + super().__init__( + is_encoder_decoder=is_encoder_decoder, + **kwargs, + ) + + +__all__ = ["Florence2Config", "Florence2VisionConfig"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/florence2/modeling_florence2.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/florence2/modeling_florence2.py new file mode 100644 index 0000000000000000000000000000000000000000..763756faf73f6b9172238ff95ab18285f465c28d --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/florence2/modeling_florence2.py @@ -0,0 +1,1055 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/florence2/modular_florence2.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_florence2.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2025 Microsoft and the HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import math +from dataclasses import dataclass +from typing import Any, Callable, Optional, Union + +import torch.nn as nn +import torch.nn.functional as F + +from ...activations import ACT2FN +from ...cache_utils import Cache +from ...generation import GenerationMixin +from ...modeling_outputs import Seq2SeqLMOutput, Seq2SeqModelOutput +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...processing_utils import Unpack +from ...utils import ( + TransformersKwargs, + auto_docstring, + can_return_tuple, + is_torch_available, + logging, +) +from ..auto import AutoModel +from .configuration_florence2 import Florence2Config, Florence2VisionConfig + + +if is_torch_available(): + import torch + + +logger = logging.get_logger(__name__) + + +def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor: + """ + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks, + however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the + layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the + argument. + """ + if drop_prob == 0.0 or not training: + return input + keep_prob = 1 - drop_prob + shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device) + random_tensor.floor_() # binarize + output = input.div(keep_prob) * random_tensor + return output + + +class Florence2VisionDropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob: Optional[float] = None) -> None: + super().__init__() + self.drop_prob = drop_prob + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + return drop_path(hidden_states, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return f"p={self.drop_prob}" + + +class Florence2VisionLearnedAbsolutePositionEmbedding2D(nn.Module): + """ + This module learns positional embeddings up to a fixed maximum size. 
+ """ + + def __init__(self, config: Florence2Config): + super().__init__() + num_pos = config.vision_config.max_position_embeddings + embedding_dim = config.vision_config.embed_dim[-1] + self.row_embeddings = nn.Embedding(num_pos, embedding_dim // 2) + self.column_embeddings = nn.Embedding(num_pos, embedding_dim - (embedding_dim // 2)) + + def forward(self, pixel_values, pixel_mask=None): + height, width = pixel_values.shape[-2:] + width_values = torch.arange(width, device=pixel_values.device) + height_values = torch.arange(height, device=pixel_values.device) + x_emb = self.column_embeddings(width_values) + y_emb = self.row_embeddings(height_values) + pos = torch.cat([x_emb.unsqueeze(0).repeat(height, 1, 1), y_emb.unsqueeze(1).repeat(1, width, 1)], dim=-1) + pos = pos.permute(2, 0, 1) + pos = pos.unsqueeze(0) + pos = pos.repeat(pixel_values.shape[0], 1, 1, 1) + return pos + + +class Florence2VisionPositionalEmbeddingCosine1D(nn.Module): + """ + This module generates 1D cosine positional embeddings using precomputed sinusoidal functions. + """ + + def __init__(self, config: Florence2Config): + super().__init__() + self.embed_dim = config.vision_config.embed_dim[-1] + self.max_seq_len = config.vision_config.max_temporal_embeddings + pos_idx_to_embed = torch.empty((self.max_seq_len, self.embed_dim)) + sine, cosine = self.get_sinusoid_embeddings( + max_positions=self.max_seq_len, + embed_dim=self.embed_dim, + ) + pos_idx_to_embed[:, 0::2] = sine + pos_idx_to_embed[:, 1::2] = cosine + # Save the positional embeddings in a constant buffer. + self.register_buffer("pos_idx_to_embed", pos_idx_to_embed) + + @staticmethod + def get_sinusoid_embeddings(max_positions: int, embed_dim: int): + half_dim = embed_dim // 2 + emb = math.log(10000) / half_dim + emb = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -emb) + emb = torch.arange(max_positions, dtype=torch.float).unsqueeze(1) * emb.unsqueeze(0) + return torch.sin(emb), torch.cos(emb) + + def forward(self, seq_embeds: torch.Tensor) -> torch.Tensor: + len_seq = seq_embeds.size(1) + if len_seq > self.max_seq_len: + raise ValueError(f"Maximum sequence length {self.max_seq_len}, got {len_seq}") + pos_embeds = self.pos_idx_to_embed[0:len_seq, :] + return pos_embeds + + +class Florence2VisionMLP(nn.Module): + def __init__(self, config: Florence2VisionConfig, stage_idx: int): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.activation_function] + self.fc1 = nn.Linear(config.embed_dim[stage_idx], int(config.embed_dim[stage_idx] * config.mlp_ratio)) + self.fc2 = nn.Linear(int(config.embed_dim[stage_idx] * config.mlp_ratio), config.embed_dim[stage_idx]) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +class Florence2VisionConvEmbed(nn.Module): + """Image to Patch Embedding""" + + def __init__(self, config: Florence2VisionConfig, stage_idx: int): + super().__init__() + self.config = config + self.stage_idx = stage_idx + self.patch_size = config.patch_size[stage_idx] + self.in_channels = config.in_channels if stage_idx == 0 else config.embed_dim[stage_idx - 1] + self.embed_dim = config.embed_dim[stage_idx] + self.stride = config.patch_stride[stage_idx] + self.padding = config.patch_padding[stage_idx] + self.pre_norm = config.patch_prenorm[stage_idx] + + self.conv = nn.Conv2d( + self.in_channels, + self.embed_dim, + 
kernel_size=self.patch_size, + stride=self.stride, + padding=self.padding, + ) + + dim_norm = self.in_channels if self.pre_norm else self.embed_dim + self.norm = nn.LayerNorm(dim_norm) + + def forward(self, hidden_states: torch.Tensor): + if self.norm and self.pre_norm: + hidden_states = hidden_states.permute(0, 2, 3, 1) + hidden_states = self.norm(hidden_states) + hidden_states = hidden_states.permute(0, 3, 1, 2) + + hidden_states = self.conv(hidden_states) + + if self.norm and not self.pre_norm: + hidden_states = hidden_states.permute(0, 2, 3, 1) + hidden_states = self.norm(hidden_states) + hidden_states = hidden_states.permute(0, 3, 1, 2) + return hidden_states + + +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: Optional[float] = None, + dropout: float = 0.0, + head_mask: Optional[torch.Tensor] = None, + **kwargs, +): + if scaling is None: + scaling = query.size(-1) ** -0.5 + + attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling + if attention_mask is not None: + attn_weights = attn_weights + attention_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if head_mask is not None: + attn_weights = attn_weights * head_mask.view(1, -1, 1, 1) + + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + attn_output = torch.matmul(attn_weights, value) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +class Florence2VisionChannelAttention(nn.Module): + def __init__(self, config: Florence2VisionConfig, stage_idx: int): + super().__init__() + self.config = config + self.dim = config.embed_dim[stage_idx] + self.groups = config.num_groups[stage_idx] + self.qkv = nn.Linear(self.dim, self.dim * 3, bias=config.qkv_bias) + self.proj = nn.Linear(self.dim, self.dim) + self.is_causal = False + + def forward(self, hidden_states: torch.Tensor): + batch_size, num_tokens, hidden_size = hidden_states.shape + + # Reshape for grouped channel attention + qkv = self.qkv(hidden_states).reshape(batch_size, num_tokens, 3, self.groups, hidden_size // self.groups) + qkv = qkv.permute(2, 0, 3, 4, 1) + query, key, value = qkv.unbind(0) + + scale = num_tokens**-0.5 + # Channel-to-channel attention within groups: + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + hidden_states, _ = attention_interface( + self, + query, + key, + value, + attention_mask=None, + scaling=scale, + ) + hidden_states = hidden_states.permute(0, 3, 2, 1) + hidden_states = hidden_states.reshape(batch_size, num_tokens, hidden_size) + + # Final projection + hidden_states = self.proj(hidden_states) + return hidden_states + + +class Florence2VisionChannelBlock(nn.Module): + def __init__( + self, + config: Florence2VisionConfig, + stage_idx: int, + drop_path_rate: float, + ): + super().__init__() + + self.config = config + dim_in = config.embed_dim[stage_idx] + + self.conv1 = nn.Conv2d( + dim_in, + dim_in, + kernel_size=3, + padding=1, + groups=dim_in, + ) + self.norm1 = nn.LayerNorm(config.embed_dim[stage_idx]) + self.channel_attn = Florence2VisionChannelAttention(config=config, stage_idx=stage_idx) + self.drop_path1 = Florence2VisionDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity() + + self.conv2 = nn.Conv2d( + dim_in, + dim_in, + kernel_size=3, + padding=1, + 
groups=dim_in, + ) + self.norm2 = nn.LayerNorm(config.embed_dim[stage_idx]) + self.ffn = Florence2VisionMLP(config=config, stage_idx=stage_idx) + self.drop_path2 = Florence2VisionDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity() + + def forward(self, hidden_states: torch.Tensor): + batch_size, embed_dim, height, width = hidden_states.shape + + # First channel block: Depthwise Conv + Channel Attention + hidden_states = self.conv1(hidden_states) + hidden_states + hidden_states = hidden_states.flatten(2).transpose(1, 2) + residual = hidden_states + + # Channel group attention self-attention mechanism + hidden_states = self.norm1(hidden_states) + hidden_states = self.channel_attn(hidden_states) + hidden_states = residual + self.drop_path1(hidden_states) + hidden_states = hidden_states.transpose(1, 2).view(batch_size, embed_dim, height, width) + + # Second channel block: Depthwise Conv + FFN + hidden_states = self.conv2(hidden_states) + hidden_states + hidden_states = hidden_states.flatten(2).transpose(1, 2) + residual = hidden_states + + # FFN + hidden_states = self.norm2(hidden_states) + hidden_states = self.ffn(hidden_states) + hidden_states = residual + self.drop_path2(hidden_states) + hidden_states = hidden_states.transpose(1, 2).view(batch_size, embed_dim, height, width) + + return hidden_states + + +class Florence2VisionWindowAttention(nn.Module): + def __init__(self, config: Florence2VisionConfig, stage_idx: int): + super().__init__() + self.config = config + self.dim = config.embed_dim[stage_idx] + self.window_size = config.window_size + self.num_heads = config.num_heads[stage_idx] + head_dim = self.dim // self.num_heads + self.scale = head_dim**-0.5 + + self.qkv = nn.Linear(self.dim, self.dim * 3, bias=config.qkv_bias) + self.proj = nn.Linear(self.dim, self.dim) + self.is_causal = False + + def forward(self, hidden_states: torch.Tensor): + batch_size, height, width, embed_dim = hidden_states.shape + + # Pad the input if necessary + pad_left = pad_top = 0 + pad_right = (self.window_size - width % self.window_size) % self.window_size + pad_bottom = (self.window_size - height % self.window_size) % self.window_size + hidden_states = F.pad(hidden_states, (0, 0, pad_left, pad_right, pad_top, pad_bottom)) + _, padded_height, padded_width, _ = hidden_states.shape + + # Partition input into non-overlapping windows (for local spatial attention in DaViT) + hidden_states = hidden_states.view( + batch_size, + padded_height // self.window_size, + self.window_size, + padded_width // self.window_size, + self.window_size, + embed_dim, + ) + windowed_hidden_states = hidden_states.permute(0, 1, 3, 2, 4, 5).contiguous() + windowed_hidden_states = windowed_hidden_states.view(-1, self.window_size * self.window_size, embed_dim) + + # Generate Q, K, V for each window + num_windows_per_batch, num_tokens_per_window, embed_dim = windowed_hidden_states.shape + qkv = self.qkv(windowed_hidden_states).reshape( + num_windows_per_batch, num_tokens_per_window, 3, self.num_heads, embed_dim // self.num_heads + ) + qkv = qkv.permute(2, 0, 3, 1, 4) + query, key, value = qkv.unbind(0) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + windowed_hidden_states, _ = attention_interface( + self, + query, + key, + value, + attention_mask=None, + scaling=self.scale, + ) + windowed_hidden_states = windowed_hidden_states.view(num_windows_per_batch, num_tokens_per_window, 
embed_dim) + windowed_hidden_states = self.proj(windowed_hidden_states) + + # Merge windows back to original spatial layout + windowed_hidden_states = windowed_hidden_states.view(-1, self.window_size, self.window_size, embed_dim) + hidden_states = windowed_hidden_states.view( + -1, + padded_height // self.window_size, + padded_width // self.window_size, + self.window_size, + self.window_size, + embed_dim, + ) + hidden_states = hidden_states.permute(0, 1, 3, 2, 4, 5).contiguous() + hidden_states = hidden_states.view(-1, padded_height, padded_width, embed_dim) + hidden_states = hidden_states[:, :height, :width, :].contiguous() + hidden_states = hidden_states.view(batch_size, height * width, embed_dim) + + return hidden_states + + +class Florence2VisionSpatialBlock(nn.Module): + def __init__( + self, + config: Florence2VisionConfig, + stage_idx: int, + drop_path_rate: float, + ): + super().__init__() + + self.conv1 = nn.Conv2d( + config.embed_dim[stage_idx], + config.embed_dim[stage_idx], + kernel_size=3, + padding=1, + groups=config.embed_dim[stage_idx], + ) + self.norm1 = nn.LayerNorm(config.embed_dim[stage_idx]) + self.window_attn = Florence2VisionWindowAttention(config=config, stage_idx=stage_idx) + self.drop_path1 = Florence2VisionDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity() + + self.conv2 = nn.Conv2d( + config.embed_dim[stage_idx], + config.embed_dim[stage_idx], + kernel_size=3, + padding=1, + groups=config.embed_dim[stage_idx], + ) + self.norm2 = nn.LayerNorm(config.embed_dim[stage_idx]) + self.ffn = Florence2VisionMLP(config=config, stage_idx=stage_idx) + self.drop_path2 = Florence2VisionDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity() + + def forward(self, hidden_states: torch.Tensor): + batch_size, embed_dim, height, width = hidden_states.shape + + # First spatial mixing block: Conv + Window Attention + hidden_states = self.conv1(hidden_states) + hidden_states + hidden_states = hidden_states.flatten(2).transpose(1, 2) + residual = hidden_states + + # Spatial Window-based self-attention mechanism + hidden_states = self.norm1(hidden_states) + hidden_states = hidden_states.view(batch_size, height, width, embed_dim) + hidden_states = self.window_attn(hidden_states) + hidden_states = residual + self.drop_path1(hidden_states) + hidden_states = hidden_states.transpose(1, 2).view(batch_size, embed_dim, height, width) + + # Second spatial mixing block: Conv + FFN + hidden_states = self.conv2(hidden_states) + hidden_states + hidden_states = hidden_states.flatten(2).transpose(1, 2) + residual = hidden_states + + # FFN + hidden_states = self.norm2(hidden_states) + hidden_states = self.ffn(hidden_states) + hidden_states = residual + self.drop_path2(hidden_states) + hidden_states = hidden_states.transpose(1, 2).view(batch_size, embed_dim, height, width) + + return hidden_states + + +class Florence2VisionBlock(nn.Module): + def __init__( + self, + config: Florence2VisionConfig, + stage_idx: int, + spatial_drop_path_rate: float, + channel_drop_path_rate: float, + ): + super().__init__() + self.spatial_block = Florence2VisionSpatialBlock( + config=config, + stage_idx=stage_idx, + drop_path_rate=spatial_drop_path_rate, + ) + self.channel_block = Florence2VisionChannelBlock( + config=config, + stage_idx=stage_idx, + drop_path_rate=channel_drop_path_rate, + ) + + def forward(self, hidden_states: torch.Tensor): + hidden_states = self.spatial_block(hidden_states) + hidden_states = self.channel_block(hidden_states) + return hidden_states + + +@auto_docstring 
+class Florence2VisionPreTrainedModel(PreTrainedModel): + config_class = Florence2VisionConfig + main_input_name = "pixel_values" + _supports_sdpa = True + _supports_flash_attn = True + _supports_flex_attn = True + + _can_compile_fullgraph = True + + +@auto_docstring +class Florence2VisionBackbone(Florence2VisionPreTrainedModel): + def __init__(self, config: Florence2VisionConfig): + super().__init__(config) + self.config = config + + self.embed_dim = config.embed_dim + self.num_heads = config.num_heads + self.num_groups = config.num_groups + self.num_stages = len(self.embed_dim) + + if not (self.num_stages == len(self.num_heads) == len(self.num_groups)): + raise ValueError( + f"Expected self.num_stages ({self.num_stages}) == " + f"len(self.num_heads) ({len(self.num_heads)}) == " + f"len(self.num_groups) ({len(self.num_groups)})" + ) + + dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths) * 2, device="cpu")] + depth_offset = 0 + + convs = [] + blocks = [] + for stage_idx in range(self.num_stages): + conv_embed = Florence2VisionConvEmbed( + config=config, + stage_idx=stage_idx, + ) + convs.append(conv_embed) + + block = nn.ModuleList( + Florence2VisionBlock( + config=config, + stage_idx=stage_idx, + spatial_drop_path_rate=dpr[depth_offset + block_idx * 2], + channel_drop_path_rate=dpr[depth_offset + block_idx * 2 + 1], + ) + for block_idx in range(config.depths[stage_idx]) + ) + blocks.append(block) + depth_offset += config.depths[stage_idx] * 2 + + self.convs = nn.ModuleList(convs) + self.blocks = nn.ModuleList(blocks) + + # Initialize weights and apply final processing + self.post_init() + + def forward(self, hidden_states: torch.Tensor): + for conv, block in zip(self.convs, self.blocks): + hidden_states = conv(hidden_states) + for layer in block: + hidden_states = layer(hidden_states) + return hidden_states + + +class Florence2MultiModalProjector(nn.Module): + def __init__(self, config: Florence2Config): + super().__init__() + self.vision_embedding_dim = config.vision_config.embed_dim[-1] + self.vision_projection_dim = config.vision_config.projection_dim + self.image_projection = nn.Linear(self.vision_embedding_dim, self.vision_projection_dim, bias=False) + self.image_proj_norm = nn.LayerNorm(self.vision_projection_dim) + self.image_position_embed = Florence2VisionLearnedAbsolutePositionEmbedding2D(config=config) + self.visual_temporal_embed = Florence2VisionPositionalEmbeddingCosine1D(config=config) + + def forward(self, image_features): + position_features = image_features + self.image_position_embed(image_features) + position_features = position_features.flatten(2).transpose(1, 2) + temporal_features = self.visual_temporal_embed(position_features[:, :1, :]) + temporal_features = temporal_features.unsqueeze(1) + visual_token_features = position_features + temporal_features + visual_token_features = visual_token_features.unsqueeze(1) + spatial_image_features = visual_token_features.mean(dim=2) + temporal_image_features = visual_token_features.mean(dim=1) + image_features = torch.cat([spatial_image_features, temporal_image_features], dim=1) + image_features = self.image_projection(image_features) + image_features = self.image_proj_norm(image_features) + return image_features + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for Florence-2 base model's outputs that also contains : pre-computed hidden states that can speed up sequential + decoding. 
+ """ +) +class Florence2Seq2SeqModelOutput(Seq2SeqModelOutput): + r""" + image_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size `(batch_size, num_image_tokens, hidden_size)`. + image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state. + """ + + image_hidden_states: Optional[torch.FloatTensor] = None + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for Florence-2 model's outputs that also contains : pre-computed hidden states that can speed up sequential + decoding. + """ +) +class Florence2Seq2SeqLMOutput(Seq2SeqLMOutput): + r""" + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + image_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size `(batch_size, num_image_tokens, hidden_size)`. + image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state. + """ + + image_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None + + +@auto_docstring +class Florence2PreTrainedModel(PreTrainedModel): + config: Florence2Config + base_model_prefix = "" + supports_gradient_checkpointing = True + _skip_keys_device_placement = "past_key_values" + + _supports_flash_attn = True + _supports_sdpa = True + + _can_compile_fullgraph = True + _supports_flex_attn = True + + _supports_attention_backend = False + config_class = Florence2Config + + +@auto_docstring( + custom_intro=""" + Florence-2 is a vision model for captioning, detection, and segmentation. + """ +) +class Florence2Model(Florence2PreTrainedModel): + _checkpoint_conversion_mapping = {} + _tied_weights_keys = [ + "language_model.encoder.embed_tokens.weight", + "language_model.decoder.embed_tokens.weight", + ] + + def __init__(self, config: Florence2Config): + super().__init__(config) + self.vision_tower = Florence2VisionBackbone(config=config.vision_config) + + self.multi_modal_projector = Florence2MultiModalProjector(config) + self.language_model = AutoModel.from_config(config.text_config) + self.post_init() + + def get_input_embeddings(self): + return self.language_model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.language_model.set_input_embeddings(value) + + def set_decoder(self, decoder): + self.language_model = decoder + + def get_decoder(self): + return self.language_model.get_decoder() + + def get_image_features(self, pixel_values: torch.Tensor, **kwargs): + """ + Obtains image last hidden states from the vision tower and apply multimodal projection. + + Args: + pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`): + The tensors corresponding to the input images. + Returns: + image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`). 
+ """ + image_features = self.vision_tower(pixel_values, **kwargs) + image_embeds = self.multi_modal_projector(image_features) + return image_embeds + + def get_placeholder_mask( + self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor + ): + """ + Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is + equal to the length of multimodal features. If the lengths are different, an error is raised. + """ + if input_ids is None: + special_image_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + special_image_mask = special_image_mask.all(-1) + else: + special_image_mask = input_ids == self.config.image_token_id + + n_image_tokens = special_image_mask.sum() + special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + n_image_features = image_features.shape[0] * image_features.shape[1] + if inputs_embeds[special_image_mask].numel() != image_features.numel(): + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" + ) + return special_image_mask + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + decoder_head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + encoder_outputs: Optional[list[torch.FloatTensor]] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + ) -> Union[tuple, Florence2Seq2SeqModelOutput]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if encoder_outputs is None: + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings()(input_ids) + + if pixel_values is not None: + image_features = self.get_image_features(pixel_values) + image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) + special_image_mask = self.get_placeholder_mask( + input_ids, inputs_embeds=inputs_embeds, image_features=image_features + ) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) + + encoder_outputs = self.language_model.encoder( + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + ) + + if decoder_input_ids is None: + decoder_start_token_id = 
self.config.text_config.decoder_start_token_id + decoder_input_ids = torch.ones((inputs_embeds.size()[0], 1), dtype=torch.long, device=inputs_embeds.device) + decoder_input_ids *= decoder_start_token_id + + decoder_outputs = self.language_model.decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=attention_mask, + head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + cache_position=cache_position, + return_dict=True, + ) + + return Florence2Seq2SeqModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + image_hidden_states=image_features if pixel_values is not None else None, + ) + + def get_encoder(self): + return self.language_model.get_encoder() + + +def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int): + """ + Shift input ids one token to the right. + """ + shifted_input_ids = input_ids.new_zeros(input_ids.shape) + shifted_input_ids[:, 1:] = input_ids[:, :-1].clone() + shifted_input_ids[:, 0] = decoder_start_token_id + + if pad_token_id is None: + raise ValueError("self.model.config.pad_token_id has to be defined.") + # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) + + return shifted_input_ids + + +@auto_docstring( + custom_intro=""" + Florence-2 is a vision model for captioning, detection, and segmentation. 
+ """ +) +class Florence2ForConditionalGeneration(Florence2PreTrainedModel, GenerationMixin): + _checkpoint_conversion_mapping = {} + _tied_weights_keys = [ + "model.language_model.encoder.embed_tokens.weight", + "model.language_model.decoder.embed_tokens.weight", + "lm_head.weight", + ] + + def __init__(self, config: Florence2Config): + super().__init__(config) + self.model = Florence2Model(config) + self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False) + self.post_init() + + def get_input_embeddings(self): + return self.model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.model.set_input_embeddings(value) + + def get_output_embeddings(self) -> nn.Module: + return self.lm_head + + def set_decoder(self, decoder): + self.model.set_decoder(decoder) + + def get_decoder(self): + return self.model.get_decoder() + + def get_image_features(self, pixel_values: torch.Tensor, **kwargs): + return self.model.get_image_features(pixel_values=pixel_values, **kwargs) + + # Make modules available through conditional class for BC + @property + def language_model(self): + return self.model.language_model + + @property + def vision_tower(self): + return self.model.vision_tower + + @property + def multi_modal_projector(self): + return self.model.multi_modal_projector + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.Tensor] = None, + decoder_head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[list[torch.FloatTensor]] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **kwargs: Unpack[TransformersKwargs], + ) -> Union[tuple, Florence2Seq2SeqLMOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. 
+ + Example: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, Florence2ForConditionalGeneration + + >>> model = Florence2ForConditionalGeneration.from_pretrained("microsoft/Florence-2-large") + >>> processor = AutoProcessor.from_pretrained("microsoft/Florence-2-large") + + >>> prompt = "" + >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(text=prompt, images=image, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(**inputs, max_length=100) + >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "A green car parked in front of a yellow building." + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if labels is not None: + if use_cache: + logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.") + use_cache = False + if decoder_input_ids is None and decoder_inputs_embeds is None: + decoder_input_ids = shift_tokens_right( + labels, self.config.text_config.pad_token_id, self.config.text_config.decoder_start_token_id + ) + + outputs = self.model( + input_ids=input_ids, + pixel_values=pixel_values, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + encoder_outputs=encoder_outputs, + decoder_attention_mask=decoder_attention_mask, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + cache_position=cache_position, + # **kwargs, ## TODO: add back when Bart attention is refactored and takes kwargs + ) + + hidden_states = outputs[0] + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) + + loss = None + if labels is not None: + loss = self.loss_function( + logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs + ) + + return Florence2Seq2SeqLMOutput( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + image_hidden_states=outputs.image_hidden_states, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + inputs_embeds=None, + pixel_values=None, + attention_mask=None, + cache_position=None, + logits_to_keep=None, + **kwargs, + ): + # Overwritten -- in specific circumstances we don't want to forward image inputs to the model + + model_inputs = super().prepare_inputs_for_generation( + 
input_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + logits_to_keep=logits_to_keep, + **kwargs, + ) + + if cache_position[0] == 0: + # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore + # Otherwise we need pixel values to be passed to model + model_inputs["pixel_values"] = pixel_values + + return model_inputs + + def get_encoder(self): + return self.model.get_encoder() + + def get_placeholder_mask( + self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor + ): + return self.model.get_placeholder_mask( + input_ids=input_ids, inputs_embeds=inputs_embeds, image_features=image_features + ) + + def _prepare_encoder_decoder_kwargs_for_generation( + self, + inputs_tensor: torch.Tensor, + model_kwargs, + model_input_name: Optional[str], + generation_config, + ) -> dict[str, Any]: + # override to handle merging image and text embeddings before passing to language encoder + inputs_embeds = model_kwargs.pop("inputs_embeds", None) + pixel_values = model_kwargs.pop("pixel_values", None) + + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings()(inputs_tensor) + + if pixel_values is not None: + image_features = self.get_image_features(pixel_values) + image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) + special_image_mask = self.get_placeholder_mask( + inputs_tensor, inputs_embeds=inputs_embeds, image_features=image_features + ) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) + + model_kwargs["inputs_embeds"] = inputs_embeds + model_kwargs = super()._prepare_encoder_decoder_kwargs_for_generation( + None, model_kwargs, model_input_name, generation_config + ) + model_kwargs.pop("inputs_embeds", None) + return model_kwargs + + +__all__ = [ + "Florence2Model", + "Florence2ForConditionalGeneration", + "Florence2PreTrainedModel", + "Florence2VisionBackbone", + "Florence2VisionPreTrainedModel", +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/florence2/modular_florence2.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/florence2/modular_florence2.py new file mode 100644 index 0000000000000000000000000000000000000000..f8732257f10249a847f39fc94630d2f40718cfc9 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/florence2/modular_florence2.py @@ -0,0 +1,1806 @@ +# coding=utf-8 +# Copyright 2025 Microsoft and the HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import math +import re +from dataclasses import dataclass +from typing import Any, Callable, Optional, Union + +import numpy as np +import torch.nn as nn +import torch.nn.functional as F + +from ...activations import ACT2FN +from ...cache_utils import Cache +from ...configuration_utils import PretrainedConfig +from ...feature_extraction_utils import BatchFeature +from ...image_utils import ImageInput +from ...modeling_outputs import Seq2SeqLMOutput, Seq2SeqModelOutput +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...processing_utils import MultiModalData, ProcessorMixin, Unpack +from ...tokenization_utils_base import PreTokenizedInput, TextInput +from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torch_available, logging +from ..auto import CONFIG_MAPPING, AutoConfig +from ..bart.modeling_bart import eager_attention_forward, shift_tokens_right +from ..beit.modeling_beit import BeitDropPath +from ..llama4.modeling_llama4 import Llama4VisionMLP +from ..llava.modeling_llava import LlavaForConditionalGeneration, LlavaModel, LlavaPreTrainedModel +from ..llava.processing_llava import LlavaProcessorKwargs + + +if is_torch_available(): + import torch + +logger = logging.get_logger(__name__) + + +class Florence2VisionConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Florence2VisionModel`]. It is used to instantiate a Florence2VisionModel + according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the Florence2VisionModel architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + in_channels (`int`, *optional*, defaults to 3): + Number of input image channels. + depths (`Tuple[int]`, *optional*, defaults to `(1, 1, 9, 1)`): + The depth of the model. + patch_size (`Tuple[int]`, *optional*, defaults to `(7, 3, 3, 3)`): + The patch size of the image. + patch_stride (`Tuple[int]`, *optional*, defaults to `(4, 2, 2, 2)`): + The patch stride of the image. + patch_padding (`Tuple[int]`, *optional*, defaults to `(3, 1, 1, 1)`): + The patch padding of the image. + patch_prenorm (`Tuple[bool]`, *optional*, defaults to `(False, True, True, True)`): + Whether to apply layer normalization before the patch embedding layer. + embed_dim (`Tuple[int]`, *optional*, defaults to `(128, 256, 512, 1024)`): + The dimension of the embedding layer. + num_heads (`Tuple[int]`, *optional*, defaults to `(4, 8, 16, 32)`): + The number of attention heads. + num_groups (`Tuple[int]`, *optional*, defaults to `(4, 8, 16, 32)`): + The number of groups. + window_size (`int`, *optional*, defaults to 12): + The window size of the model. + drop_path_rate (`float`, *optional*, defaults to 0.1): + The dropout rate of the drop path layer. + mlp_ratio (`int`, *optional*, defaults to 4.0): + Ratio of mlp hidden dim to embedding dim. + qkv_bias (`bool`, *optional*, defaults to `True`): + If True, add a learnable bias to query, key, value. + activation_function (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + projection_dim (`int`, *optional*, defaults to 1024): + The dimension of the projection layer. 
+ max_temporal_embeddings (`int`, *optional*, defaults to 100): + The configuration of the visual temporal embedding. + max_position_embeddings (`int`, *optional*, defaults to 50): + The configuration of the image position embedding. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + Example: + + ```python + >>> from transformers import Florence2VisionConfig, Florence2VisionModel + + >>> # Initializing a Florence2 Vision style configuration + >>> configuration = Florence2VisionConfig() + + >>> # Initializing a model (with random weights) + >>> model = Florence2VisionModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "florence_vision" + + def __init__( + self, + in_channels=3, + depths=(1, 1, 9, 1), + patch_size=(7, 3, 3, 3), + patch_stride=(4, 2, 2, 2), + patch_padding=(3, 1, 1, 1), + patch_prenorm=(False, True, True, True), + embed_dim=(128, 256, 512, 1024), + num_heads=(4, 8, 16, 32), + num_groups=(4, 8, 16, 32), + window_size=12, + drop_path_rate=0.1, + mlp_ratio=4.0, + qkv_bias=True, + activation_function="gelu", + projection_dim=1024, + max_temporal_embeddings=100, + max_position_embeddings=50, + initializer_range=0.02, + **kwargs, + ): + self.in_channels = in_channels + self.depths = list(depths) + self.patch_size = list(patch_size) + self.patch_stride = list(patch_stride) + self.patch_padding = list(patch_padding) + self.patch_prenorm = list(patch_prenorm) + self.embed_dim = list(embed_dim) + self.num_heads = list(num_heads) + self.num_groups = list(num_groups) + self.window_size = window_size + self.drop_path_rate = drop_path_rate + self.mlp_ratio = mlp_ratio + self.qkv_bias = qkv_bias + self.projection_dim = projection_dim + self.max_temporal_embeddings = max_temporal_embeddings + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.activation_function = activation_function + + super().__init__(**kwargs) + + +class Florence2Config(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Florence2ForConditionalGeneration`]. It is used to instantiate an + Florence-2 model according to the specified arguments, defining the model architecture. + + Instantiating a configuration with the defaults will yield a similar configuration to that of the Florence-2 + [microsoft/Florence-2-base](https://huggingface.co/microsoft/Florence-2-base) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + text_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`AutoConfig`]. + vision_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`Florence2VisionConfig`]. + image_token_id (`int`, *optional*, defaults to 51289): + The image token index to encode the image prompt. + is_encoder_decoder (bool, optional, *optional*, defaults to `True`): + Whether the model is used as an encoder/decoder or not. 
+ + Example: + + ```python + >>> from transformers import Florence2ForConditionalGeneration, Florence2Config, CLIPVisionConfig, BartConfig + + >>> # Initializing a clip-like vision config + >>> vision_config = CLIPVisionConfig() + + >>> # Initializing a Bart config + >>> text_config = BartConfig() + + >>> # Initializing a Florence-2 configuration + >>> configuration = Florence2Config(vision_config, text_config) + + >>> # Initializing a model from the florence-2 configuration + >>> model = Florence2ForConditionalGeneration(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "florence2" + sub_configs = { + "text_config": AutoConfig, + "vision_config": Florence2VisionConfig, + } + + def __init__( + self, + text_config=None, + vision_config=None, + image_token_id=51289, + is_encoder_decoder=True, + **kwargs, + ): + if isinstance(text_config, dict): + text_config["model_type"] = text_config.get("model_type", "bart") + text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) + elif text_config is None: + text_config = CONFIG_MAPPING["bart"]() + + if isinstance(vision_config, dict): + vision_config = Florence2VisionConfig(**vision_config) + elif vision_config is None: + logger.info("vision_config is None. Initializing the Florence2VisionConfig with default values.") + vision_config = Florence2VisionConfig() + + self.text_config = text_config + self.vision_config = vision_config + self.image_token_id = image_token_id + + super().__init__( + is_encoder_decoder=is_encoder_decoder, + **kwargs, + ) + + +class Florence2ProcessorKwargs(LlavaProcessorKwargs): + pass + + +class Florence2Processor(ProcessorMixin): + r""" + Constructs a Florence2 processor which wraps a Florence2 image processor and a Florence2 tokenizer into a single processor. + + [`Florence2Processor`] offers all the functionalities of [`AutoImageProcessor`] and [`BartTokenizerFast`]. See the + [`~Florence2Processor.__call__`] and [`~Florence2Processor.decode`] for more information. + + Args: + image_processor (`AutoImageProcessor`, *optional*): + The image processor is a required input. + tokenizer (`Union[BartTokenizer, BartTokenizerFast]`, *optional*): + The tokenizer is a required input. + num_additional_image_tokens (`int`, *optional*, defaults to 0): + Number of additional tokens added to the image embeddings, such as CLS (+1). If the backbone has no CLS or other + extra tokens appended, no need to set this arg. + post_processor_config (`dict`, *optional*, defaults to 0): + Task-specific parsing rules for [`Florence2PostProcessor`], e.g. regex patterns, + thresholds, or banned tokens. 
+ """ + + attributes = ["image_processor", "tokenizer"] + image_processor_class = "AutoImageProcessor" + tokenizer_class = ("BartTokenizer", "BartTokenizerFast") + + def __init__( + self, + image_processor=None, + tokenizer=None, + num_additional_image_tokens: int = 0, + post_processor_config: Optional[dict] = None, + **kwargs, + ): + self.tasks_answer_post_processing_type = { + "": "pure_text", + "": "ocr", + "": "pure_text", + "": "pure_text", + "": "pure_text", + "": "description_with_bboxes", + "": "description_with_bboxes", + "": "phrase_grounding", + "": "polygons", + "": "polygons", + "": "description_with_bboxes_or_polygons", + "": "pure_text", + "": "pure_text", + "": "pure_text", + "": "bboxes", + } + + self.task_prompts_without_inputs = { + "": "What is the text in the image?", + "": "What is the text in the image, with regions?", + "": "What does the image describe?", + "": "Describe in detail what is shown in the image.", + "": "Describe with a paragraph what is shown in the image.", + "": "Locate the objects with category name in the image.", + "": "Locate the objects in the image, with their descriptions.", + "": "Locate the region proposals in the image.", + } + + self.task_prompts_with_input = { + "": "Locate the phrases in the caption: {input}", + "": "Locate {input} in the image with mask", + "": "What is the polygon mask of region {input}", + "": "Locate {input} in the image.", + "": "What is the region {input}?", + "": "What does the region {input} describe?", + "": "What text is in the region {input}?", + } + + self.num_image_tokens = image_processor.image_seq_length + self.num_additional_image_tokens = num_additional_image_tokens + self.post_processor_config = post_processor_config + self.post_processor = Florence2PostProcessor(config=post_processor_config, tokenizer=tokenizer) + self.image_token = tokenizer.image_token + self.image_token_id = tokenizer.image_token_id + + super().__init__(image_processor, tokenizer, **kwargs) + + def _construct_prompts(self, text: Union[str, list[str]]) -> list[str]: + """ + Construct prompts by replacing task tokens with corresponding prompt strings. + """ + if isinstance(text, str): + text = [text] + + prompts = [] + for prompt in text: + # Check for tasks without inputs + for task_token, task_prompt in self.task_prompts_without_inputs.items(): + if task_token in prompt: + if prompt != task_token: + raise ValueError(f"Task token {task_token} should be the only content in the prompt.") + prompt = task_prompt + break + # Check for tasks with inputs + for task_token, task_prompt in self.task_prompts_with_input.items(): + if task_token in prompt: + input_text = prompt.replace(task_token, "").strip() + prompt = task_prompt.format(input=input_text) + break + prompts.append(prompt) + return prompts + + def __call__( + self, + images: Optional[ImageInput] = None, + text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, + **kwargs: Unpack[Florence2ProcessorKwargs], + ) -> BatchFeature: + """ + Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` + and `kwargs` arguments to BartTokenizerFast's [`~BartTokenizerFast.__call__`] if `text` is not `None` to encode + the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to + CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring + of the above two methods for more information. 
+ + Args: + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. Both channels-first and channels-last formats are supported. + text (`str`, `list[str]`, `list[list[str]]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + """ + if images is None and text is None: + raise ValueError("You have to specify at least one of `images` or `text`.") + + output_kwargs = self._merge_kwargs( + Florence2ProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + + image_inputs = {} + if images is not None: + image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"]) + + if text is None: + logger.warning_once("You are using Florence-2 without a text prefix.") + text = [""] * (1 if not isinstance(images, list) else len(images)) + elif isinstance(text, str): + text = [text] + + if not isinstance(text, list) or not all(isinstance(token, str) for token in text): + raise ValueError("`text` must be a string or list of strings.") + + if isinstance(images, list) and len(images) != len(text): + raise ValueError(f"Number of images ({len(images)}) must match number of texts ({len(text)}).") + + prompt_strings = self._construct_prompts(text) + + # Add image tokens and special tokens if images are provided + if image_inputs.get("pixel_values") is not None: + # Replace the image token with the expanded image token sequence + expanded_image_prompts = [] + for sample in prompt_strings: + sample = ( + self.image_token * self.num_image_tokens + + self.tokenizer.bos_token + + sample + + self.tokenizer.eos_token + ) + expanded_image_prompts.append(sample) + prompt_strings = expanded_image_prompts + + # Construct and tokenize prompts + output_kwargs["text_kwargs"].pop("add_special_tokens", None) + return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) + return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", False) + text_inputs = self.tokenizer( + prompt_strings, **output_kwargs["text_kwargs"], add_special_tokens=False, return_tensors=None + ) + self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image"]) + + if return_mm_token_type_ids: + array_ids = np.array(text_inputs["input_ids"]) + mm_token_type_ids = np.zeros_like(text_inputs["input_ids"]) + 
mm_token_type_ids[array_ids == self.image_token_id] = 1 + text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist() + + return BatchFeature(data={**image_inputs, **text_inputs}, tensor_type=return_tensors) + + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to BartTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. + """ + return self.tokenizer.batch_decode(*args, **kwargs) + + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to BartTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to + the docstring of this method for more information. + """ + return self.tokenizer.decode(*args, **kwargs) + + @property + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) + + def _get_num_multimodal_tokens(self, image_sizes=None, **kwargs): + """ + Computes the number of placeholder tokens needed for multimodal inputs with the given sizes. + + Args: + image_sizes (`list[list[int]]`, *optional*): + The input sizes formatted as (height, width) per each image. + + Returns: + `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided + input modalities, along with other useful data. + """ + + vision_data = {} + if image_sizes is not None: + num_image_tokens = [self.image_seq_length] * len(image_sizes) + num_image_patches = [1] * len(image_sizes) + + vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches}) + + return MultiModalData(**vision_data) + + def post_process_image_text_to_text(self, generated_outputs, skip_special_tokens=False, **kwargs): + """ + Post-processes the output of `FuyuForConditionalGeneration` to only return the text output. + + Args: + generated_outputs (`torch.Tensor` or `np.ndarray`): + The output of the model. The output is expected to be a tensor of shape `(batch_size, sequence_length)` + containing the token ids of the generated sequences. + skip_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not to remove special tokens in the output. Argument passed to the tokenizer's `batch_decode` method. + **kwargs: + Additional arguments to be passed to the tokenizer's `batch_decode method`. + + Returns: + `list[str]`: The decoded text output. + """ + return self.batch_decode(generated_outputs, skip_special_tokens=skip_special_tokens, **kwargs) + + def post_process_generation(self, text=None, sequence=None, task=None, image_size=None) -> dict[str, Any]: + """ + Post-process generation outputs based on the task. + + Args: + text (`str`, *optional*): + Generated text. + sequence (`Union[List[int], torch.Tensor]`, *optional*): + Generated token sequence. + task (`str`, *optional*): + The task for post-processing. + image_size (`Tuple[int, int]`, *optional*): + Image size for dequantization. + + Returns: + `Dict[str, Any]`: Post-processed results keyed by task. 
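+
+        Example (a minimal, illustrative sketch; the task token, variable names, and image are assumptions, not
+        values taken from this file):
+
+        ```python
+        >>> # `processor` is a Florence2Processor and `generated_ids` comes from `model.generate(...)`
+        >>> text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
+        >>> parsed = processor.post_process_generation(text=text, task="<OD>", image_size=(image.width, image.height))
+        >>> parsed["<OD>"]["bboxes"], parsed["<OD>"]["labels"]  # doctest: +SKIP
+        ```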
+ """ + if task is None: + raise ValueError("`task` must be provided for post-processing.") + + post_proc_type = self.tasks_answer_post_processing_type.get(task, "pure_text") + parsed = self.post_processor( + text=text, + sequence=sequence, + image_size=image_size, + parse_tasks=[post_proc_type], + )[post_proc_type] + + if post_proc_type == "pure_text": + final_answer = parsed.replace("", "").replace("", "").strip() + elif post_proc_type in ["description_with_bboxes", "bboxes"]: + bboxes = [inst["bbox"] for inst in parsed] + labels = [inst["cat_name"] for inst in parsed] + final_answer = {"bboxes": bboxes, "labels": labels} + if parsed and "score" in parsed[0]: + final_answer["scores"] = [inst["score"] for inst in parsed] + elif post_proc_type == "ocr": + quad_boxes = [inst["quad_box"] for inst in parsed] + labels = [inst["text"] for inst in parsed] + final_answer = {"quad_boxes": quad_boxes, "labels": labels} + elif post_proc_type == "phrase_grounding": + bboxes = [] + labels = [] + for inst in parsed: + for bbox in inst["bbox"]: + bboxes.append(bbox) + labels.append(inst["cat_name"]) + final_answer = {"bboxes": bboxes, "labels": labels} + elif post_proc_type in ["description_with_polygons", "polygons"]: + polygons = [inst["polygons"] for inst in parsed] + labels = [inst["cat_name"] for inst in parsed] + final_answer = {"polygons": polygons, "labels": labels} + elif post_proc_type == "description_with_bboxes_or_polygons": + bboxes = [] + bboxes_labels = [] + polygons = [] + polygons_labels = [] + for inst in parsed: + label = inst["cat_name"] + if "polygons" in inst: + polygons.append(inst["polygons"]) + polygons_labels.append(label) + else: + bboxes.append(inst["bbox"]) + bboxes_labels.append(label) + final_answer = { + "bboxes": bboxes, + "bboxes_labels": bboxes_labels, + "polygons": polygons, + "polygons_labels": polygons_labels, + } + else: + raise ValueError(f"Unknown post-processing type: {post_proc_type}") + + return {task: final_answer} + + +class Florence2PostProcessor: + """ + Post-processor for Florence-2 model outputs. Parses generated text into structured results for various tasks + like object detection, OCR, phrase grounding, etc. + + Args: + tokenizer (`PreTrainedTokenizer`): + The tokenizer used for decoding model outputs. + """ + + def __init__(self, config, tokenizer): + self.tokenizer = tokenizer + self.parse_task_config = config or {} + self.banned_grounding_tokens = set( + self.parse_task_config.get("phrase_grounding", {}).get("banned_grounding_tokens", []) + ) + self.all_special_tokens = set(self.tokenizer.all_special_tokens) + self.quantize_bins = (1000, 1000) + + def quantize(self, locations: "torch.Tensor", size: tuple[int, int]) -> "torch.Tensor": + """ + Quantize locations. + + Args: + locations (`torch.Tensor`): + Tensor of shape (N, 4) for boxes or (N, 2) for points/coordinates. + size (`tuple[int, int]`): + Original image size (width, height). + + Returns: + `torch.Tensor`: Quantized locations as integers. 
+ """ + bins_w, bins_h = self.quantize_bins + size_w, size_h = size + per_bin_w = size_w / bins_w + per_bin_h = size_h / bins_h + + if locations.shape[-1] == 4: # Bounding boxes: [xmin, ymin, xmax, ymax] + xmin, ymin, xmax, ymax = locations.split(1, dim=-1) + q_xmin = (xmin / per_bin_w).floor().clamp(0, bins_w - 1) + q_ymin = (ymin / per_bin_h).floor().clamp(0, bins_h - 1) + q_xmax = (xmax / per_bin_w).floor().clamp(0, bins_w - 1) + q_ymax = (ymax / per_bin_h).floor().clamp(0, bins_h - 1) + return torch.cat([q_xmin, q_ymin, q_xmax, q_ymax], dim=-1).int() + + elif locations.shape[-1] == 2: # Points/coordinates: [x, y] + x, y = locations.split(1, dim=-1) + q_x = (x / per_bin_w).floor().clamp(0, bins_w - 1) + q_y = (y / per_bin_h).floor().clamp(0, bins_h - 1) + return torch.cat([q_x, q_y], dim=-1).int() + + else: + raise ValueError(f"Unsupported location shape: last dim must be 2 or 4, got {locations.shape[-1]}.") + + def dequantize(self, locations: "torch.Tensor", size: tuple[int, int]) -> "torch.Tensor": + """ + Dequantize locations back to original scale. + + Args: + locations (`torch.Tensor`): + Quantized tensor of shape (N, 4) for boxes or (N, 2) for points/coordinates. + size (`tuple[int, int]`): + Original image size (width, height). + + Returns: + `torch.Tensor`: Dequantized locations as floats. + """ + bins_w, bins_h = self.quantize_bins + size_w, size_h = size + per_bin_w = size_w / bins_w + per_bin_h = size_h / bins_h + + # Add 0.5 to use the center position of the bin as the coordinate. + if locations.shape[-1] == 4: # Bounding boxes + xmin, ymin, xmax, ymax = locations.split(1, dim=-1) + dq_xmin = (xmin + 0.5) * per_bin_w + dq_ymin = (ymin + 0.5) * per_bin_h + dq_xmax = (xmax + 0.5) * per_bin_w + dq_ymax = (ymax + 0.5) * per_bin_h + return torch.cat([dq_xmin, dq_ymin, dq_xmax, dq_ymax], dim=-1).int() + + elif locations.shape[-1] == 2: # Points/coordinates + x, y = locations.split(1, dim=-1) + dq_x = (x + 0.5) * per_bin_w + dq_y = (y + 0.5) * per_bin_h + return torch.cat([dq_x, dq_y], dim=-1).int() + + else: + raise ValueError(f"Unsupported location shape: last dim must be 2 or 4, got {locations.shape[-1]}.") + + def decode_with_spans(self, token_ids: list[int]) -> tuple[str, list[tuple[int, int]]]: + """ + Decode token IDs to text and compute character spans. + + Args: + token_ids (`list[int]`): + list of token IDs to decode. + + Returns: + `tuple[str, list[tuple[int, int]]]`: Decoded text and list of spans (start, end) for each token. + """ + filtered_tokens = self.tokenizer.convert_ids_to_tokens(token_ids, skip_special_tokens=False) + text = "" + spans = [] + for token in filtered_tokens: + if token in self.all_special_tokens: + sub_text = token + else: + sub_text = self.tokenizer.convert_tokens_to_string([token]) + span = (len(text), len(text) + len(sub_text)) + text += sub_text + spans.append(span) + return text, spans + + def parse_ocr_from_text_and_spans( + self, text: str, pattern: Optional[str], image_size: tuple[int, int], area_threshold: float = 0.0 + ) -> list[dict[str, Any]]: + """ + Parse OCR results with quadrilateral boxes. + + Args: + text (`str`): + The generated text. + pattern (`str`): + Regex pattern for matching. + image_size (`tuple[int, int]`): + Image size (width, height). + area_threshold (`float`, *optional*, defaults to 0.0): + Minimum area threshold for filtering boxes. + + Returns: + `list[dict[str, Any]]`: list of instances with 'quad_box' and 'text'. 
+ """ + text = text.replace("", "").replace("", "").replace("", "") + if pattern is None: + pattern = r"(.+?)" + + matches = re.findall(pattern, text) + instances = [] + width, height = image_size + + for content, *quad_str in matches: + quad_bins = [int(i) for i in quad_str] + quad_box = self.dequantize(torch.tensor(quad_bins).reshape(-1, 2), size=image_size).flatten().tolist() + + if area_threshold > 0: + x_coords = quad_box[0::2] + y_coords = quad_box[1::2] + # Apply the Shoelace formula + area = 0.5 * abs( + sum(x_coords[i] * y_coords[i + 1] - x_coords[i + 1] * y_coords[i] for i in range(4 - 1)) + ) + + if area < (width * height) * area_threshold: + continue + + instances.append({"quad_box": quad_box, "text": content.strip()}) + return instances + + def parse_phrase_grounding_from_text_and_spans( + self, text: str, image_size: tuple[int, int] + ) -> list[dict[str, Any]]: + """ + Parse phrase grounding results. + + Args: + text (`str`): + The generated text. + image_size (`tuple[int, int]`): + Image size (width, height). + + Returns: + `list[dict[str, Any]]`: list of instances with 'bbox' and 'cat_name'. + """ + text = text.replace("", "").replace("", "").replace("", "") + phrase_pattern = r"([^<]+(?:){4,})" + phrases = re.findall(phrase_pattern, text) + text_pattern = r"^\s*(.*?)(?=||||||" + + instances = [] + for phrase_text in phrases: + phrase_text = phrase_text.replace("", "", 1).replace("", "", 1) + if not phrase_text: + continue + match = re.search(text_pattern, phrase_text) + if not match: + continue + phrase = match.group().strip() + if phrase in self.banned_grounding_tokens: + continue + boxes_matches = list(re.finditer(box_pattern, phrase_text)) + if not boxes_matches: + continue + bbox_bins = [[int(m.group(j)) for j in range(1, 5)] for m in boxes_matches] + bboxes = self.dequantize(torch.tensor(bbox_bins), size=image_size).tolist() + phrase = phrase.encode("ascii", "ignore").decode("ascii") + instances.append({"bbox": bboxes, "cat_name": phrase}) + return instances + + def _find_matched_token_indices(self, cur_span: tuple[int, int], token_spans: list[tuple[int, int]]) -> list[int]: + return [i for i, span in enumerate(token_spans) if not (span[1] <= cur_span[0] or span[0] >= cur_span[1])] + + def parse_description_with_bboxes_from_text_and_spans( + self, + text: str, + image_size: tuple[int, int], + allow_empty_phrase: bool = False, + ) -> list[dict[str, Any]]: + """ + Parse descriptions with bounding boxes. + + Args: + text (`str`): + The generated text. + image_size (`tuple[int, int]`): + Image size (width, height). + allow_empty_phrase (`bool`, *optional*, defaults to `False`): + Allow phrases without text. + + Returns: + `list[dict[str, Any]]`: list of instances with 'bbox', 'cat_name', and optional 'score'. 
+ """ + text = text.replace("", "").replace("", "").replace("", "") + + if allow_empty_phrase: + pattern = r"(?:(?:){4,})" + else: + pattern = r"([^<]+(?:){4,})" + phrases = re.findall(pattern, text) + + text_pattern = r"^\s*(.*?)(?=||||||" + + instances = [] + for phrase_text in phrases: + phrase_text = phrase_text.replace("", "", 1).replace("", "", 1) + if not phrase_text and not allow_empty_phrase: + continue + match = re.search(text_pattern, phrase_text) + if not match: + continue + phrase = match.group().strip() + boxes_matches = list(re.finditer(box_pattern, phrase_text)) + if not boxes_matches: + continue + bbox_bins = [[int(m.group(j)) for j in range(1, 5)] for m in boxes_matches] + bboxes = self.dequantize(torch.tensor(bbox_bins), size=image_size).tolist() + + phrase = phrase.encode("ascii", "ignore").decode("ascii") + for bbox in bboxes: + instance = {"bbox": bbox, "cat_name": phrase} + instances.append(instance) + + return instances + + def parse_description_with_polygons_from_text_and_spans( + self, + text: str, + image_size: tuple[int, int], + allow_empty_phrase: bool = False, + polygon_sep_token: str = "", + polygon_start_token: str = "", + polygon_end_token: str = "", + with_box_at_start: bool = False, + ) -> list[dict[str, Any]]: + """ + Parse descriptions with polygons. + + Args: + text (`str`): + The generated text. + image_size (`tuple[int, int]`): + Image size (width, height). + allow_empty_phrase (`bool`, *optional*, defaults to `False`): + Allow phrases without text. + polygon_sep_token (`str`, *optional*, defaults to ""): + Token separating polygons. + polygon_start_token (`str`, *optional*, defaults to ""): + Start token for polygons. + polygon_end_token (`str`, *optional*, defaults to ""): + End token for polygons. + with_box_at_start (`bool`, *optional*, defaults to `False`): + Whether a bounding box is at the start of polygons. + + Returns: + `list[dict[str, Any]]`: list of instances with 'polygons', 'cat_name', and optional 'bbox'. 
+ """ + text = text.replace("", "").replace("", "").replace("", "") + + if allow_empty_phrase: + pattern = rf"(?:(?:|{re.escape(polygon_sep_token)}|{re.escape(polygon_start_token)}|{re.escape(polygon_end_token)}){{4,}})" + else: + pattern = rf"([^<]+(?:|{re.escape(polygon_sep_token)}|{re.escape(polygon_start_token)}|{re.escape(polygon_end_token)}){{4,}})" + phrases = re.findall(pattern, text) + phrase_pattern = r"^\s*(.*?)(?=||||||)" + poly_instance_pattern = rf"{re.escape(polygon_start_token)}(.*?){re.escape(polygon_end_token)}" + box_pattern = rf"((?:)+)(?:{re.escape(polygon_sep_token)}|$)" + + instances = [] + for phrase_text in phrases: + phrase_text_strip = re.sub(r"^", "", phrase_text, count=1) + if not phrase_text_strip and not allow_empty_phrase: + continue + match = re.search(phrase_pattern, phrase_text_strip) + if not match: + continue + phrase = match.group().strip() + + if polygon_start_token in phrase_text and polygon_end_token in phrase_text: + poly_instances = [m.group(1) for m in re.finditer(poly_instance_pattern, phrase_text)] + else: + poly_instances = [phrase_text] + + for poly_inst in poly_instances: + poly_matches = list(re.finditer(box_pattern, poly_inst)) + if len(poly_matches) == 0: + continue + bbox = [] + polygons = [] + for poly_match in poly_matches: + poly_str = poly_match.group(1) + poly_bins = [int(m.group(1)) for m in re.finditer(r"", poly_str)] + if with_box_at_start and not bbox: + if len(poly_bins) > 4: + bbox = poly_bins[:4] + poly_bins = poly_bins[4:] + else: + bbox = [0, 0, 0, 0] + if len(poly_bins) % 2 == 1: + poly_bins = poly_bins[:-1] + poly_coords = ( + self.dequantize(torch.tensor(poly_bins).reshape(-1, 2), size=image_size).flatten().tolist() + ) + polygons.append(poly_coords) + + instance = {"cat_name": phrase, "polygons": polygons} + if bbox: + instance["bbox"] = self.dequantize(torch.tensor([bbox]), size=image_size)[0].tolist() + instances.append(instance) + return instances + + def __call__(self, text=None, sequence=None, image_size=None, parse_tasks=None) -> dict[str, Any]: + """ + Process model output and parse into task-specific results. + + Args: + text (`Optional[str]`, *optional*): + Generated text. Either this or `sequence` must be provided. + sequence (`Optional[Union[list[int], torch.Tensor]]`, *optional*): + Token sequence. Either this or `text` must be provided. + image_size (`Optional[tuple[int, int]]`, *optional*): + Image size (width, height) required for dequantization. + parse_tasks (`Optional[Union[str, list[str]]]`, *optional*): + Specific tasks to parse. If None, parse all supported tasks. + + Returns: + `dict[str, Any]`: Parsed results for each task, including the raw 'text'. 
+ """ + if parse_tasks is not None: + parse_tasks = [parse_tasks] if isinstance(parse_tasks, str) else parse_tasks + for task in parse_tasks: + if task not in self.parse_task_config.keys(): + raise ValueError(f"Unsupported parse task: {task}") + + if (text is None and sequence is None) or (text is not None and sequence is not None): + raise ValueError("Exactly one of 'text' or 'sequence' must be provided.") + + if sequence is not None: + if isinstance(sequence, torch.Tensor): + sequence = sequence.tolist() + sequence = sequence[1:] if sequence[0] == self.tokenizer.bos_token_id else sequence # Skip BOS if present + text, _ = self.decode_with_spans(sequence) + + parsed_dict = {"text": text} + + tasks_to_parse = parse_tasks or self.parse_task_config.keys() + for task in tasks_to_parse: + config = self.parse_task_config[task] + pattern = config.get("PATTERN") + + if task == "ocr": + parsed_dict["ocr"] = self.parse_ocr_from_text_and_spans( + text, pattern=pattern, image_size=image_size, area_threshold=config.get("AREA_THRESHOLD", 0.0) + ) + elif task == "phrase_grounding": + parsed_dict["phrase_grounding"] = self.parse_phrase_grounding_from_text_and_spans( + text, image_size=image_size + ) + elif task == "pure_text": + parsed_dict["pure_text"] = text + elif task == "description_with_bboxes": + parsed_dict["description_with_bboxes"] = self.parse_description_with_bboxes_from_text_and_spans( + text, image_size=image_size + ) + elif task == "description_with_polygons": + parsed_dict["description_with_polygons"] = self.parse_description_with_polygons_from_text_and_spans( + text, image_size=image_size + ) + elif task == "polygons": + parsed_dict["polygons"] = self.parse_description_with_polygons_from_text_and_spans( + text, image_size=image_size, allow_empty_phrase=True + ) + elif task == "bboxes": + parsed_dict["bboxes"] = self.parse_description_with_bboxes_from_text_and_spans( + text, image_size=image_size, allow_empty_phrase=True + ) + elif task == "description_with_bboxes_or_polygons": + if "" in text: + instances = self.parse_description_with_polygons_from_text_and_spans(text, image_size=image_size) + else: + instances = self.parse_description_with_bboxes_from_text_and_spans(text, image_size=image_size) + parsed_dict["description_with_bboxes_or_polygons"] = instances + else: + raise ValueError(f"task {task} is not supported") + + return parsed_dict + + +class Florence2VisionDropPath(BeitDropPath): + pass + + +class Florence2VisionLearnedAbsolutePositionEmbedding2D(nn.Module): + """ + This module learns positional embeddings up to a fixed maximum size. 
+ """ + + def __init__(self, config: Florence2Config): + super().__init__() + num_pos = config.vision_config.max_position_embeddings + embedding_dim = config.vision_config.embed_dim[-1] + self.row_embeddings = nn.Embedding(num_pos, embedding_dim // 2) + self.column_embeddings = nn.Embedding(num_pos, embedding_dim - (embedding_dim // 2)) + + def forward(self, pixel_values, pixel_mask=None): + height, width = pixel_values.shape[-2:] + width_values = torch.arange(width, device=pixel_values.device) + height_values = torch.arange(height, device=pixel_values.device) + x_emb = self.column_embeddings(width_values) + y_emb = self.row_embeddings(height_values) + pos = torch.cat([x_emb.unsqueeze(0).repeat(height, 1, 1), y_emb.unsqueeze(1).repeat(1, width, 1)], dim=-1) + pos = pos.permute(2, 0, 1) + pos = pos.unsqueeze(0) + pos = pos.repeat(pixel_values.shape[0], 1, 1, 1) + return pos + + +class Florence2VisionPositionalEmbeddingCosine1D(nn.Module): + """ + This module generates 1D cosine positional embeddings using precomputed sinusoidal functions. + """ + + def __init__(self, config: Florence2Config): + super().__init__() + self.embed_dim = config.vision_config.embed_dim[-1] + self.max_seq_len = config.vision_config.max_temporal_embeddings + pos_idx_to_embed = torch.empty((self.max_seq_len, self.embed_dim)) + sine, cosine = self.get_sinusoid_embeddings( + max_positions=self.max_seq_len, + embed_dim=self.embed_dim, + ) + pos_idx_to_embed[:, 0::2] = sine + pos_idx_to_embed[:, 1::2] = cosine + # Save the positional embeddings in a constant buffer. + self.register_buffer("pos_idx_to_embed", pos_idx_to_embed) + + @staticmethod + def get_sinusoid_embeddings(max_positions: int, embed_dim: int): + half_dim = embed_dim // 2 + emb = math.log(10000) / half_dim + emb = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -emb) + emb = torch.arange(max_positions, dtype=torch.float).unsqueeze(1) * emb.unsqueeze(0) + return torch.sin(emb), torch.cos(emb) + + def forward(self, seq_embeds: torch.Tensor) -> torch.Tensor: + len_seq = seq_embeds.size(1) + if len_seq > self.max_seq_len: + raise ValueError(f"Maximum sequence length {self.max_seq_len}, got {len_seq}") + pos_embeds = self.pos_idx_to_embed[0:len_seq, :] + return pos_embeds + + +class Florence2VisionMLP(Llama4VisionMLP): + def __init__(self, config: Florence2VisionConfig, stage_idx: int): + super().__init__(config) + self.fc1 = nn.Linear(config.embed_dim[stage_idx], int(config.embed_dim[stage_idx] * config.mlp_ratio)) + self.activation_fn = ACT2FN[config.activation_function] + self.fc2 = nn.Linear(int(config.embed_dim[stage_idx] * config.mlp_ratio), config.embed_dim[stage_idx]) + + +class Florence2VisionConvEmbed(nn.Module): + """Image to Patch Embedding""" + + def __init__(self, config: Florence2VisionConfig, stage_idx: int): + super().__init__() + self.config = config + self.stage_idx = stage_idx + self.patch_size = config.patch_size[stage_idx] + self.in_channels = config.in_channels if stage_idx == 0 else config.embed_dim[stage_idx - 1] + self.embed_dim = config.embed_dim[stage_idx] + self.stride = config.patch_stride[stage_idx] + self.padding = config.patch_padding[stage_idx] + self.pre_norm = config.patch_prenorm[stage_idx] + + self.conv = nn.Conv2d( + self.in_channels, + self.embed_dim, + kernel_size=self.patch_size, + stride=self.stride, + padding=self.padding, + ) + + dim_norm = self.in_channels if self.pre_norm else self.embed_dim + self.norm = nn.LayerNorm(dim_norm) + + def forward(self, hidden_states: torch.Tensor): + if self.norm and 
self.pre_norm: + hidden_states = hidden_states.permute(0, 2, 3, 1) + hidden_states = self.norm(hidden_states) + hidden_states = hidden_states.permute(0, 3, 1, 2) + + hidden_states = self.conv(hidden_states) + + if self.norm and not self.pre_norm: + hidden_states = hidden_states.permute(0, 2, 3, 1) + hidden_states = self.norm(hidden_states) + hidden_states = hidden_states.permute(0, 3, 1, 2) + return hidden_states + + +class Florence2VisionChannelAttention(nn.Module): + def __init__(self, config: Florence2VisionConfig, stage_idx: int): + super().__init__() + self.config = config + self.dim = config.embed_dim[stage_idx] + self.groups = config.num_groups[stage_idx] + self.qkv = nn.Linear(self.dim, self.dim * 3, bias=config.qkv_bias) + self.proj = nn.Linear(self.dim, self.dim) + self.is_causal = False + + def forward(self, hidden_states: torch.Tensor): + batch_size, num_tokens, hidden_size = hidden_states.shape + + # Reshape for grouped channel attention + qkv = self.qkv(hidden_states).reshape(batch_size, num_tokens, 3, self.groups, hidden_size // self.groups) + qkv = qkv.permute(2, 0, 3, 4, 1) + query, key, value = qkv.unbind(0) + + scale = num_tokens**-0.5 + # Channel-to-channel attention within groups: + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + hidden_states, _ = attention_interface( + self, + query, + key, + value, + attention_mask=None, + scaling=scale, + ) + hidden_states = hidden_states.permute(0, 3, 2, 1) + hidden_states = hidden_states.reshape(batch_size, num_tokens, hidden_size) + + # Final projection + hidden_states = self.proj(hidden_states) + return hidden_states + + +class Florence2VisionChannelBlock(nn.Module): + def __init__( + self, + config: Florence2VisionConfig, + stage_idx: int, + drop_path_rate: float, + ): + super().__init__() + + self.config = config + dim_in = config.embed_dim[stage_idx] + + self.conv1 = nn.Conv2d( + dim_in, + dim_in, + kernel_size=3, + padding=1, + groups=dim_in, + ) + self.norm1 = nn.LayerNorm(config.embed_dim[stage_idx]) + self.channel_attn = Florence2VisionChannelAttention(config=config, stage_idx=stage_idx) + self.drop_path1 = Florence2VisionDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity() + + self.conv2 = nn.Conv2d( + dim_in, + dim_in, + kernel_size=3, + padding=1, + groups=dim_in, + ) + self.norm2 = nn.LayerNorm(config.embed_dim[stage_idx]) + self.ffn = Florence2VisionMLP(config=config, stage_idx=stage_idx) + self.drop_path2 = Florence2VisionDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity() + + def forward(self, hidden_states: torch.Tensor): + batch_size, embed_dim, height, width = hidden_states.shape + + # First channel block: Depthwise Conv + Channel Attention + hidden_states = self.conv1(hidden_states) + hidden_states + hidden_states = hidden_states.flatten(2).transpose(1, 2) + residual = hidden_states + + # Channel group attention self-attention mechanism + hidden_states = self.norm1(hidden_states) + hidden_states = self.channel_attn(hidden_states) + hidden_states = residual + self.drop_path1(hidden_states) + hidden_states = hidden_states.transpose(1, 2).view(batch_size, embed_dim, height, width) + + # Second channel block: Depthwise Conv + FFN + hidden_states = self.conv2(hidden_states) + hidden_states + hidden_states = hidden_states.flatten(2).transpose(1, 2) + residual = hidden_states + + # FFN + hidden_states = self.norm2(hidden_states) + hidden_states = 
self.ffn(hidden_states) + hidden_states = residual + self.drop_path2(hidden_states) + hidden_states = hidden_states.transpose(1, 2).view(batch_size, embed_dim, height, width) + + return hidden_states + + +class Florence2VisionWindowAttention(nn.Module): + def __init__(self, config: Florence2VisionConfig, stage_idx: int): + super().__init__() + self.config = config + self.dim = config.embed_dim[stage_idx] + self.window_size = config.window_size + self.num_heads = config.num_heads[stage_idx] + head_dim = self.dim // self.num_heads + self.scale = head_dim**-0.5 + + self.qkv = nn.Linear(self.dim, self.dim * 3, bias=config.qkv_bias) + self.proj = nn.Linear(self.dim, self.dim) + self.is_causal = False + + def forward(self, hidden_states: torch.Tensor): + batch_size, height, width, embed_dim = hidden_states.shape + + # Pad the input if necessary + pad_left = pad_top = 0 + pad_right = (self.window_size - width % self.window_size) % self.window_size + pad_bottom = (self.window_size - height % self.window_size) % self.window_size + hidden_states = F.pad(hidden_states, (0, 0, pad_left, pad_right, pad_top, pad_bottom)) + _, padded_height, padded_width, _ = hidden_states.shape + + # Partition input into non-overlapping windows (for local spatial attention in DaViT) + hidden_states = hidden_states.view( + batch_size, + padded_height // self.window_size, + self.window_size, + padded_width // self.window_size, + self.window_size, + embed_dim, + ) + windowed_hidden_states = hidden_states.permute(0, 1, 3, 2, 4, 5).contiguous() + windowed_hidden_states = windowed_hidden_states.view(-1, self.window_size * self.window_size, embed_dim) + + # Generate Q, K, V for each window + num_windows_per_batch, num_tokens_per_window, embed_dim = windowed_hidden_states.shape + qkv = self.qkv(windowed_hidden_states).reshape( + num_windows_per_batch, num_tokens_per_window, 3, self.num_heads, embed_dim // self.num_heads + ) + qkv = qkv.permute(2, 0, 3, 1, 4) + query, key, value = qkv.unbind(0) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + windowed_hidden_states, _ = attention_interface( + self, + query, + key, + value, + attention_mask=None, + scaling=self.scale, + ) + windowed_hidden_states = windowed_hidden_states.view(num_windows_per_batch, num_tokens_per_window, embed_dim) + windowed_hidden_states = self.proj(windowed_hidden_states) + + # Merge windows back to original spatial layout + windowed_hidden_states = windowed_hidden_states.view(-1, self.window_size, self.window_size, embed_dim) + hidden_states = windowed_hidden_states.view( + -1, + padded_height // self.window_size, + padded_width // self.window_size, + self.window_size, + self.window_size, + embed_dim, + ) + hidden_states = hidden_states.permute(0, 1, 3, 2, 4, 5).contiguous() + hidden_states = hidden_states.view(-1, padded_height, padded_width, embed_dim) + hidden_states = hidden_states[:, :height, :width, :].contiguous() + hidden_states = hidden_states.view(batch_size, height * width, embed_dim) + + return hidden_states + + +class Florence2VisionSpatialBlock(nn.Module): + def __init__( + self, + config: Florence2VisionConfig, + stage_idx: int, + drop_path_rate: float, + ): + super().__init__() + + self.conv1 = nn.Conv2d( + config.embed_dim[stage_idx], + config.embed_dim[stage_idx], + kernel_size=3, + padding=1, + groups=config.embed_dim[stage_idx], + ) + self.norm1 = nn.LayerNorm(config.embed_dim[stage_idx]) + 
self.window_attn = Florence2VisionWindowAttention(config=config, stage_idx=stage_idx) + self.drop_path1 = Florence2VisionDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity() + + self.conv2 = nn.Conv2d( + config.embed_dim[stage_idx], + config.embed_dim[stage_idx], + kernel_size=3, + padding=1, + groups=config.embed_dim[stage_idx], + ) + self.norm2 = nn.LayerNorm(config.embed_dim[stage_idx]) + self.ffn = Florence2VisionMLP(config=config, stage_idx=stage_idx) + self.drop_path2 = Florence2VisionDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity() + + def forward(self, hidden_states: torch.Tensor): + batch_size, embed_dim, height, width = hidden_states.shape + + # First spatial mixing block: Conv + Window Attention + hidden_states = self.conv1(hidden_states) + hidden_states + hidden_states = hidden_states.flatten(2).transpose(1, 2) + residual = hidden_states + + # Spatial Window-based self-attention mechanism + hidden_states = self.norm1(hidden_states) + hidden_states = hidden_states.view(batch_size, height, width, embed_dim) + hidden_states = self.window_attn(hidden_states) + hidden_states = residual + self.drop_path1(hidden_states) + hidden_states = hidden_states.transpose(1, 2).view(batch_size, embed_dim, height, width) + + # Second spatial mixing block: Conv + FFN + hidden_states = self.conv2(hidden_states) + hidden_states + hidden_states = hidden_states.flatten(2).transpose(1, 2) + residual = hidden_states + + # FFN + hidden_states = self.norm2(hidden_states) + hidden_states = self.ffn(hidden_states) + hidden_states = residual + self.drop_path2(hidden_states) + hidden_states = hidden_states.transpose(1, 2).view(batch_size, embed_dim, height, width) + + return hidden_states + + +class Florence2VisionBlock(nn.Module): + def __init__( + self, + config: Florence2VisionConfig, + stage_idx: int, + spatial_drop_path_rate: float, + channel_drop_path_rate: float, + ): + super().__init__() + self.spatial_block = Florence2VisionSpatialBlock( + config=config, + stage_idx=stage_idx, + drop_path_rate=spatial_drop_path_rate, + ) + self.channel_block = Florence2VisionChannelBlock( + config=config, + stage_idx=stage_idx, + drop_path_rate=channel_drop_path_rate, + ) + + def forward(self, hidden_states: torch.Tensor): + hidden_states = self.spatial_block(hidden_states) + hidden_states = self.channel_block(hidden_states) + return hidden_states + + +@auto_docstring +class Florence2VisionPreTrainedModel(PreTrainedModel): + config_class = Florence2VisionConfig + main_input_name = "pixel_values" + _supports_sdpa = True + _supports_flash_attn = True + _supports_flex_attn = True + + _can_compile_fullgraph = True + + +@auto_docstring +class Florence2VisionBackbone(Florence2VisionPreTrainedModel): + def __init__(self, config: Florence2VisionConfig): + super().__init__(config) + self.config = config + + self.embed_dim = config.embed_dim + self.num_heads = config.num_heads + self.num_groups = config.num_groups + self.num_stages = len(self.embed_dim) + + if not (self.num_stages == len(self.num_heads) == len(self.num_groups)): + raise ValueError( + f"Expected self.num_stages ({self.num_stages}) == " + f"len(self.num_heads) ({len(self.num_heads)}) == " + f"len(self.num_groups) ({len(self.num_groups)})" + ) + + dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths) * 2, device="cpu")] + depth_offset = 0 + + convs = [] + blocks = [] + for stage_idx in range(self.num_stages): + conv_embed = Florence2VisionConvEmbed( + config=config, + stage_idx=stage_idx, + ) + 
convs.append(conv_embed) + + block = nn.ModuleList( + Florence2VisionBlock( + config=config, + stage_idx=stage_idx, + spatial_drop_path_rate=dpr[depth_offset + block_idx * 2], + channel_drop_path_rate=dpr[depth_offset + block_idx * 2 + 1], + ) + for block_idx in range(config.depths[stage_idx]) + ) + blocks.append(block) + depth_offset += config.depths[stage_idx] * 2 + + self.convs = nn.ModuleList(convs) + self.blocks = nn.ModuleList(blocks) + + # Initialize weights and apply final processing + self.post_init() + + def forward(self, hidden_states: torch.Tensor): + for conv, block in zip(self.convs, self.blocks): + hidden_states = conv(hidden_states) + for layer in block: + hidden_states = layer(hidden_states) + return hidden_states + + +class Florence2MultiModalProjector(nn.Module): + def __init__(self, config: Florence2Config): + super().__init__() + self.vision_embedding_dim = config.vision_config.embed_dim[-1] + self.vision_projection_dim = config.vision_config.projection_dim + self.image_projection = nn.Linear(self.vision_embedding_dim, self.vision_projection_dim, bias=False) + self.image_proj_norm = nn.LayerNorm(self.vision_projection_dim) + self.image_position_embed = Florence2VisionLearnedAbsolutePositionEmbedding2D(config=config) + self.visual_temporal_embed = Florence2VisionPositionalEmbeddingCosine1D(config=config) + + def forward(self, image_features): + position_features = image_features + self.image_position_embed(image_features) + position_features = position_features.flatten(2).transpose(1, 2) + temporal_features = self.visual_temporal_embed(position_features[:, :1, :]) + temporal_features = temporal_features.unsqueeze(1) + visual_token_features = position_features + temporal_features + visual_token_features = visual_token_features.unsqueeze(1) + spatial_image_features = visual_token_features.mean(dim=2) + temporal_image_features = visual_token_features.mean(dim=1) + image_features = torch.cat([spatial_image_features, temporal_image_features], dim=1) + image_features = self.image_projection(image_features) + image_features = self.image_proj_norm(image_features) + return image_features + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for Florence-2 base model's outputs that also contains : pre-computed hidden states that can speed up sequential + decoding. + """ +) +class Florence2Seq2SeqModelOutput(Seq2SeqModelOutput): + r""" + image_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size `(batch_size, num_image_tokens, hidden_size)`. + image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state. + """ + + image_hidden_states: Optional[torch.FloatTensor] = None + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for Florence-2 model's outputs that also contains : pre-computed hidden states that can speed up sequential + decoding. + """ +) +class Florence2Seq2SeqLMOutput(Seq2SeqLMOutput): + r""" + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + image_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size `(batch_size, num_image_tokens, hidden_size)`. + image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state. 
+ """ + + image_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None + + +@auto_docstring +class Florence2PreTrainedModel(LlavaPreTrainedModel): + config_class = Florence2Config + + _supports_attention_backend = False + + +@auto_docstring( + custom_intro=""" + Florence-2 is a vision model for captioning, detection, and segmentation. + """ +) +class Florence2Model(LlavaModel): + _checkpoint_conversion_mapping = {} + _tied_weights_keys = [ + "language_model.encoder.embed_tokens.weight", + "language_model.decoder.embed_tokens.weight", + ] + + def __init__(self, config: Florence2Config): + super().__init__(config) + self.vision_tower = Florence2VisionBackbone(config=config.vision_config) + + def get_encoder(self): + return self.language_model.get_encoder() + + def get_decoder(self): + return self.language_model.get_decoder() + + def get_image_features(self, pixel_values: torch.Tensor, **kwargs): + """ + Obtains image last hidden states from the vision tower and apply multimodal projection. + + Args: + pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`): + The tensors corresponding to the input images. + Returns: + image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`). + """ + image_features = self.vision_tower(pixel_values, **kwargs) + image_embeds = self.multi_modal_projector(image_features) + return image_embeds + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + decoder_head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + encoder_outputs: Optional[list[torch.FloatTensor]] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + ) -> Union[tuple, Florence2Seq2SeqModelOutput]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if encoder_outputs is None: + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings()(input_ids) + + if pixel_values is not None: + image_features = self.get_image_features(pixel_values) + image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) + special_image_mask = self.get_placeholder_mask( + input_ids, inputs_embeds=inputs_embeds, image_features=image_features + ) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) + + encoder_outputs = self.language_model.encoder( + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + 
output_hidden_states=output_hidden_states, + return_dict=True, + ) + + if decoder_input_ids is None: + decoder_start_token_id = self.config.text_config.decoder_start_token_id + decoder_input_ids = torch.ones((inputs_embeds.size()[0], 1), dtype=torch.long, device=inputs_embeds.device) + decoder_input_ids *= decoder_start_token_id + + decoder_outputs = self.language_model.decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=attention_mask, + head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + cache_position=cache_position, + return_dict=True, + ) + + return Florence2Seq2SeqModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + image_hidden_states=image_features if pixel_values is not None else None, + ) + + +@auto_docstring( + custom_intro=""" + Florence-2 is a vision model for captioning, detection, and segmentation. + """ +) +class Florence2ForConditionalGeneration(LlavaForConditionalGeneration): + _checkpoint_conversion_mapping = {} + _tied_weights_keys = [ + "model.language_model.encoder.embed_tokens.weight", + "model.language_model.decoder.embed_tokens.weight", + "lm_head.weight", + ] + + def get_encoder(self): + return self.model.get_encoder() + + def get_image_features(self, pixel_values: torch.Tensor, **kwargs): + return self.model.get_image_features(pixel_values=pixel_values, **kwargs) + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.Tensor] = None, + decoder_head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[list[torch.FloatTensor]] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **kwargs: Unpack[TransformersKwargs], + ) -> Union[tuple, Florence2Seq2SeqLMOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. 
+ + Example: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, Florence2ForConditionalGeneration + + >>> model = Florence2ForConditionalGeneration.from_pretrained("microsoft/Florence-2-large") + >>> processor = AutoProcessor.from_pretrained("microsoft/Florence-2-large") + + >>> prompt = "" + >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(text=prompt, images=image, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(**inputs, max_length=100) + >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "A green car parked in front of a yellow building." + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if labels is not None: + if use_cache: + logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.") + use_cache = False + if decoder_input_ids is None and decoder_inputs_embeds is None: + decoder_input_ids = shift_tokens_right( + labels, self.config.text_config.pad_token_id, self.config.text_config.decoder_start_token_id + ) + + outputs = self.model( + input_ids=input_ids, + pixel_values=pixel_values, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + encoder_outputs=encoder_outputs, + decoder_attention_mask=decoder_attention_mask, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + cache_position=cache_position, + # **kwargs, ## TODO: add back when Bart attention is refactored and takes kwargs + ) + + hidden_states = outputs[0] + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) + + loss = None + if labels is not None: + loss = self.loss_function( + logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs + ) + + return Florence2Seq2SeqLMOutput( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + image_hidden_states=outputs.image_hidden_states, + ) + + def get_placeholder_mask( + self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor + ): + return self.model.get_placeholder_mask( + input_ids=input_ids, inputs_embeds=inputs_embeds, image_features=image_features + ) + + def _prepare_encoder_decoder_kwargs_for_generation( + self, + inputs_tensor: torch.Tensor, + 
model_kwargs, + model_input_name: Optional[str], + generation_config, + ) -> dict[str, Any]: + # override to handle merging image and text embeddings before passing to language encoder + inputs_embeds = model_kwargs.pop("inputs_embeds", None) + pixel_values = model_kwargs.pop("pixel_values", None) + + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings()(inputs_tensor) + + if pixel_values is not None: + image_features = self.get_image_features(pixel_values) + image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) + special_image_mask = self.get_placeholder_mask( + inputs_tensor, inputs_embeds=inputs_embeds, image_features=image_features + ) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) + + model_kwargs["inputs_embeds"] = inputs_embeds + model_kwargs = super()._prepare_encoder_decoder_kwargs_for_generation( + None, model_kwargs, model_input_name, generation_config + ) + model_kwargs.pop("inputs_embeds", None) + return model_kwargs + + +__all__ = [ + "Florence2Config", + "Florence2Processor", + "Florence2VisionConfig", + "Florence2Model", + "Florence2ForConditionalGeneration", + "Florence2PreTrainedModel", + "Florence2VisionBackbone", + "Florence2VisionPreTrainedModel", +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/florence2/processing_florence2.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/florence2/processing_florence2.py new file mode 100644 index 0000000000000000000000000000000000000000..91b63e9da7db8495ff9172f67a8fbd7597156a20 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/florence2/processing_florence2.py @@ -0,0 +1,802 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/florence2/modular_florence2.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_florence2.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2025 Microsoft and the HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
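The generation override above mirrors what `Florence2Model.forward` does: projected vision features are scattered into the text embedding sequence at the image-placeholder positions before the BART-style encoder runs. The following is a minimal, self-contained sketch of that `masked_scatter` merging step; the shapes, the `image_token_id` value, and the random `image_features` stand-in are illustrative assumptions, not the actual Florence-2 configuration or vision tower output.

```python
# Sketch (assumed shapes/ids, not the real Florence-2 config) of merging projected
# image features into the text embeddings at the image-placeholder positions.
import torch

batch_size, seq_len, hidden = 1, 8, 16
num_image_tokens = 4
image_token_id = 50_000            # hypothetical placeholder token id
vocab_size = 50_001

# Token ids: 4 image placeholders followed by 4 "text" tokens.
input_ids = torch.tensor([[image_token_id] * num_image_tokens + [5, 6, 7, 8]])

embed = torch.nn.Embedding(vocab_size, hidden)
inputs_embeds = embed(input_ids)                      # (1, 8, 16)

# Stand-in for vision_tower + multi_modal_projector output: one feature per placeholder.
image_features = torch.randn(1, num_image_tokens, hidden)

# Boolean mask marking placeholder positions, expanded over the hidden dimension.
special_image_mask = (input_ids == image_token_id).unsqueeze(-1).expand_as(inputs_embeds)

# masked_scatter fills the masked positions with the image features, in order.
merged = inputs_embeds.masked_scatter(special_image_mask, image_features.to(inputs_embeds.dtype))

assert merged.shape == (batch_size, seq_len, hidden)
print(torch.allclose(merged[0, :num_image_tokens], image_features[0]))  # True
```

The same merged `inputs_embeds` tensor is what the override stores back into `model_kwargs["inputs_embeds"]` so that only the multimodal embeddings, not the raw `input_ids`, reach the language encoder during generation.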
+import re +from typing import Any, Optional, Union + +import numpy as np + +from ...feature_extraction_utils import BatchFeature +from ...image_utils import ImageInput +from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack +from ...tokenization_utils_base import PreTokenizedInput, TextInput +from ...utils import is_torch_available, logging + + +if is_torch_available(): + import torch + +logger = logging.get_logger(__name__) + + +class Florence2ProcessorKwargs(ProcessingKwargs, total=False): + _defaults = { + "text_kwargs": {"padding": False, "return_mm_token_type_ids": False}, + "images_kwargs": {}, + } + + +class Florence2Processor(ProcessorMixin): + r""" + Constructs a Florence2 processor which wraps a Florence2 image processor and a Florence2 tokenizer into a single processor. + + [`Florence2Processor`] offers all the functionalities of [`AutoImageProcessor`] and [`BartTokenizerFast`]. See the + [`~Florence2Processor.__call__`] and [`~Florence2Processor.decode`] for more information. + + Args: + image_processor (`AutoImageProcessor`, *optional*): + The image processor is a required input. + tokenizer (`Union[BartTokenizer, BartTokenizerFast]`, *optional*): + The tokenizer is a required input. + num_additional_image_tokens (`int`, *optional*, defaults to 0): + Number of additional tokens added to the image embeddings, such as CLS (+1). If the backbone has no CLS or other + extra tokens appended, no need to set this arg. + post_processor_config (`dict`, *optional*, defaults to 0): + Task-specific parsing rules for [`Florence2PostProcessor`], e.g. regex patterns, + thresholds, or banned tokens. + """ + + attributes = ["image_processor", "tokenizer"] + image_processor_class = "AutoImageProcessor" + tokenizer_class = ("BartTokenizer", "BartTokenizerFast") + + def __init__( + self, + image_processor=None, + tokenizer=None, + num_additional_image_tokens: int = 0, + post_processor_config: Optional[dict] = None, + **kwargs, + ): + self.tasks_answer_post_processing_type = { + "": "pure_text", + "": "ocr", + "": "pure_text", + "": "pure_text", + "": "pure_text", + "": "description_with_bboxes", + "": "description_with_bboxes", + "": "phrase_grounding", + "": "polygons", + "": "polygons", + "": "description_with_bboxes_or_polygons", + "": "pure_text", + "": "pure_text", + "": "pure_text", + "": "bboxes", + } + + self.task_prompts_without_inputs = { + "": "What is the text in the image?", + "": "What is the text in the image, with regions?", + "": "What does the image describe?", + "": "Describe in detail what is shown in the image.", + "": "Describe with a paragraph what is shown in the image.", + "": "Locate the objects with category name in the image.", + "": "Locate the objects in the image, with their descriptions.", + "": "Locate the region proposals in the image.", + } + + self.task_prompts_with_input = { + "": "Locate the phrases in the caption: {input}", + "": "Locate {input} in the image with mask", + "": "What is the polygon mask of region {input}", + "": "Locate {input} in the image.", + "": "What is the region {input}?", + "": "What does the region {input} describe?", + "": "What text is in the region {input}?", + } + + self.num_image_tokens = image_processor.image_seq_length + self.num_additional_image_tokens = num_additional_image_tokens + self.post_processor_config = post_processor_config + self.post_processor = Florence2PostProcessor(config=post_processor_config, tokenizer=tokenizer) + self.image_token = tokenizer.image_token + self.image_token_id 
= tokenizer.image_token_id + + super().__init__(image_processor, tokenizer, **kwargs) + + def _construct_prompts(self, text: Union[str, list[str]]) -> list[str]: + """ + Construct prompts by replacing task tokens with corresponding prompt strings. + """ + if isinstance(text, str): + text = [text] + + prompts = [] + for prompt in text: + # Check for tasks without inputs + for task_token, task_prompt in self.task_prompts_without_inputs.items(): + if task_token in prompt: + if prompt != task_token: + raise ValueError(f"Task token {task_token} should be the only content in the prompt.") + prompt = task_prompt + break + # Check for tasks with inputs + for task_token, task_prompt in self.task_prompts_with_input.items(): + if task_token in prompt: + input_text = prompt.replace(task_token, "").strip() + prompt = task_prompt.format(input=input_text) + break + prompts.append(prompt) + return prompts + + def __call__( + self, + images: Optional[ImageInput] = None, + text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, + **kwargs: Unpack[Florence2ProcessorKwargs], + ) -> BatchFeature: + """ + Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` + and `kwargs` arguments to BartTokenizerFast's [`~BartTokenizerFast.__call__`] if `text` is not `None` to encode + the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to + CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring + of the above two methods for more information. + + Args: + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. Both channels-first and channels-last formats are supported. + text (`str`, `list[str]`, `list[list[str]]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. 
+ """ + if images is None and text is None: + raise ValueError("You have to specify at least one of `images` or `text`.") + + output_kwargs = self._merge_kwargs( + Florence2ProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + + image_inputs = {} + if images is not None: + image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"]) + + if text is None: + logger.warning_once("You are using Florence-2 without a text prefix.") + text = [""] * (1 if not isinstance(images, list) else len(images)) + elif isinstance(text, str): + text = [text] + + if not isinstance(text, list) or not all(isinstance(token, str) for token in text): + raise ValueError("`text` must be a string or list of strings.") + + if isinstance(images, list) and len(images) != len(text): + raise ValueError(f"Number of images ({len(images)}) must match number of texts ({len(text)}).") + + prompt_strings = self._construct_prompts(text) + + # Add image tokens and special tokens if images are provided + if image_inputs.get("pixel_values") is not None: + # Replace the image token with the expanded image token sequence + expanded_image_prompts = [] + for sample in prompt_strings: + sample = ( + self.image_token * self.num_image_tokens + + self.tokenizer.bos_token + + sample + + self.tokenizer.eos_token + ) + expanded_image_prompts.append(sample) + prompt_strings = expanded_image_prompts + + # Construct and tokenize prompts + output_kwargs["text_kwargs"].pop("add_special_tokens", None) + return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) + return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", False) + text_inputs = self.tokenizer( + prompt_strings, **output_kwargs["text_kwargs"], add_special_tokens=False, return_tensors=None + ) + self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image"]) + + if return_mm_token_type_ids: + array_ids = np.array(text_inputs["input_ids"]) + mm_token_type_ids = np.zeros_like(text_inputs["input_ids"]) + mm_token_type_ids[array_ids == self.image_token_id] = 1 + text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist() + + return BatchFeature(data={**image_inputs, **text_inputs}, tensor_type=return_tensors) + + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to BartTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. + """ + return self.tokenizer.batch_decode(*args, **kwargs) + + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to BartTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to + the docstring of this method for more information. + """ + return self.tokenizer.decode(*args, **kwargs) + + @property + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) + + def _get_num_multimodal_tokens(self, image_sizes=None, **kwargs): + """ + Computes the number of placeholder tokens needed for multimodal inputs with the given sizes. + + Args: + image_sizes (`list[list[int]]`, *optional*): + The input sizes formatted as (height, width) per each image. + + Returns: + `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided + input modalities, along with other useful data. 
+ """ + + vision_data = {} + if image_sizes is not None: + num_image_tokens = [self.image_seq_length] * len(image_sizes) + num_image_patches = [1] * len(image_sizes) + + vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches}) + + return MultiModalData(**vision_data) + + def post_process_image_text_to_text(self, generated_outputs, skip_special_tokens=False, **kwargs): + """ + Post-processes the output of `FuyuForConditionalGeneration` to only return the text output. + + Args: + generated_outputs (`torch.Tensor` or `np.ndarray`): + The output of the model. The output is expected to be a tensor of shape `(batch_size, sequence_length)` + containing the token ids of the generated sequences. + skip_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not to remove special tokens in the output. Argument passed to the tokenizer's `batch_decode` method. + **kwargs: + Additional arguments to be passed to the tokenizer's `batch_decode method`. + + Returns: + `list[str]`: The decoded text output. + """ + return self.batch_decode(generated_outputs, skip_special_tokens=skip_special_tokens, **kwargs) + + def post_process_generation(self, text=None, sequence=None, task=None, image_size=None) -> dict[str, Any]: + """ + Post-process generation outputs based on the task. + + Args: + text (`str`, *optional*): + Generated text. + sequence (`Union[List[int], torch.Tensor]`, *optional*): + Generated token sequence. + task (`str`, *optional*): + The task for post-processing. + image_size (`Tuple[int, int]`, *optional*): + Image size for dequantization. + + Returns: + `Dict[str, Any]`: Post-processed results keyed by task. + """ + if task is None: + raise ValueError("`task` must be provided for post-processing.") + + post_proc_type = self.tasks_answer_post_processing_type.get(task, "pure_text") + parsed = self.post_processor( + text=text, + sequence=sequence, + image_size=image_size, + parse_tasks=[post_proc_type], + )[post_proc_type] + + if post_proc_type == "pure_text": + final_answer = parsed.replace("", "").replace("", "").strip() + elif post_proc_type in ["description_with_bboxes", "bboxes"]: + bboxes = [inst["bbox"] for inst in parsed] + labels = [inst["cat_name"] for inst in parsed] + final_answer = {"bboxes": bboxes, "labels": labels} + if parsed and "score" in parsed[0]: + final_answer["scores"] = [inst["score"] for inst in parsed] + elif post_proc_type == "ocr": + quad_boxes = [inst["quad_box"] for inst in parsed] + labels = [inst["text"] for inst in parsed] + final_answer = {"quad_boxes": quad_boxes, "labels": labels} + elif post_proc_type == "phrase_grounding": + bboxes = [] + labels = [] + for inst in parsed: + for bbox in inst["bbox"]: + bboxes.append(bbox) + labels.append(inst["cat_name"]) + final_answer = {"bboxes": bboxes, "labels": labels} + elif post_proc_type in ["description_with_polygons", "polygons"]: + polygons = [inst["polygons"] for inst in parsed] + labels = [inst["cat_name"] for inst in parsed] + final_answer = {"polygons": polygons, "labels": labels} + elif post_proc_type == "description_with_bboxes_or_polygons": + bboxes = [] + bboxes_labels = [] + polygons = [] + polygons_labels = [] + for inst in parsed: + label = inst["cat_name"] + if "polygons" in inst: + polygons.append(inst["polygons"]) + polygons_labels.append(label) + else: + bboxes.append(inst["bbox"]) + bboxes_labels.append(label) + final_answer = { + "bboxes": bboxes, + "bboxes_labels": bboxes_labels, + "polygons": polygons, + "polygons_labels": polygons_labels, + 
} + else: + raise ValueError(f"Unknown post-processing type: {post_proc_type}") + + return {task: final_answer} + + +class Florence2PostProcessor: + """ + Post-processor for Florence-2 model outputs. Parses generated text into structured results for various tasks + like object detection, OCR, phrase grounding, etc. + + Args: + tokenizer (`PreTrainedTokenizer`): + The tokenizer used for decoding model outputs. + """ + + def __init__(self, config, tokenizer): + self.tokenizer = tokenizer + self.parse_task_config = config or {} + self.banned_grounding_tokens = set( + self.parse_task_config.get("phrase_grounding", {}).get("banned_grounding_tokens", []) + ) + self.all_special_tokens = set(self.tokenizer.all_special_tokens) + self.quantize_bins = (1000, 1000) + + def quantize(self, locations: "torch.Tensor", size: tuple[int, int]) -> "torch.Tensor": + """ + Quantize locations. + + Args: + locations (`torch.Tensor`): + Tensor of shape (N, 4) for boxes or (N, 2) for points/coordinates. + size (`tuple[int, int]`): + Original image size (width, height). + + Returns: + `torch.Tensor`: Quantized locations as integers. + """ + bins_w, bins_h = self.quantize_bins + size_w, size_h = size + per_bin_w = size_w / bins_w + per_bin_h = size_h / bins_h + + if locations.shape[-1] == 4: # Bounding boxes: [xmin, ymin, xmax, ymax] + xmin, ymin, xmax, ymax = locations.split(1, dim=-1) + q_xmin = (xmin / per_bin_w).floor().clamp(0, bins_w - 1) + q_ymin = (ymin / per_bin_h).floor().clamp(0, bins_h - 1) + q_xmax = (xmax / per_bin_w).floor().clamp(0, bins_w - 1) + q_ymax = (ymax / per_bin_h).floor().clamp(0, bins_h - 1) + return torch.cat([q_xmin, q_ymin, q_xmax, q_ymax], dim=-1).int() + + elif locations.shape[-1] == 2: # Points/coordinates: [x, y] + x, y = locations.split(1, dim=-1) + q_x = (x / per_bin_w).floor().clamp(0, bins_w - 1) + q_y = (y / per_bin_h).floor().clamp(0, bins_h - 1) + return torch.cat([q_x, q_y], dim=-1).int() + + else: + raise ValueError(f"Unsupported location shape: last dim must be 2 or 4, got {locations.shape[-1]}.") + + def dequantize(self, locations: "torch.Tensor", size: tuple[int, int]) -> "torch.Tensor": + """ + Dequantize locations back to original scale. + + Args: + locations (`torch.Tensor`): + Quantized tensor of shape (N, 4) for boxes or (N, 2) for points/coordinates. + size (`tuple[int, int]`): + Original image size (width, height). + + Returns: + `torch.Tensor`: Dequantized locations as floats. + """ + bins_w, bins_h = self.quantize_bins + size_w, size_h = size + per_bin_w = size_w / bins_w + per_bin_h = size_h / bins_h + + # Add 0.5 to use the center position of the bin as the coordinate. + if locations.shape[-1] == 4: # Bounding boxes + xmin, ymin, xmax, ymax = locations.split(1, dim=-1) + dq_xmin = (xmin + 0.5) * per_bin_w + dq_ymin = (ymin + 0.5) * per_bin_h + dq_xmax = (xmax + 0.5) * per_bin_w + dq_ymax = (ymax + 0.5) * per_bin_h + return torch.cat([dq_xmin, dq_ymin, dq_xmax, dq_ymax], dim=-1).int() + + elif locations.shape[-1] == 2: # Points/coordinates + x, y = locations.split(1, dim=-1) + dq_x = (x + 0.5) * per_bin_w + dq_y = (y + 0.5) * per_bin_h + return torch.cat([dq_x, dq_y], dim=-1).int() + + else: + raise ValueError(f"Unsupported location shape: last dim must be 2 or 4, got {locations.shape[-1]}.") + + def decode_with_spans(self, token_ids: list[int]) -> tuple[str, list[tuple[int, int]]]: + """ + Decode token IDs to text and compute character spans. + + Args: + token_ids (`list[int]`): + list of token IDs to decode. 
+ + Returns: + `tuple[str, list[tuple[int, int]]]`: Decoded text and list of spans (start, end) for each token. + """ + filtered_tokens = self.tokenizer.convert_ids_to_tokens(token_ids, skip_special_tokens=False) + text = "" + spans = [] + for token in filtered_tokens: + if token in self.all_special_tokens: + sub_text = token + else: + sub_text = self.tokenizer.convert_tokens_to_string([token]) + span = (len(text), len(text) + len(sub_text)) + text += sub_text + spans.append(span) + return text, spans + + def parse_ocr_from_text_and_spans( + self, text: str, pattern: Optional[str], image_size: tuple[int, int], area_threshold: float = 0.0 + ) -> list[dict[str, Any]]: + """ + Parse OCR results with quadrilateral boxes. + + Args: + text (`str`): + The generated text. + pattern (`str`): + Regex pattern for matching. + image_size (`tuple[int, int]`): + Image size (width, height). + area_threshold (`float`, *optional*, defaults to 0.0): + Minimum area threshold for filtering boxes. + + Returns: + `list[dict[str, Any]]`: list of instances with 'quad_box' and 'text'. + """ + text = text.replace("", "").replace("", "").replace("", "") + if pattern is None: + pattern = r"(.+?)" + + matches = re.findall(pattern, text) + instances = [] + width, height = image_size + + for content, *quad_str in matches: + quad_bins = [int(i) for i in quad_str] + quad_box = self.dequantize(torch.tensor(quad_bins).reshape(-1, 2), size=image_size).flatten().tolist() + + if area_threshold > 0: + x_coords = quad_box[0::2] + y_coords = quad_box[1::2] + # Apply the Shoelace formula + area = 0.5 * abs( + sum(x_coords[i] * y_coords[i + 1] - x_coords[i + 1] * y_coords[i] for i in range(4 - 1)) + ) + + if area < (width * height) * area_threshold: + continue + + instances.append({"quad_box": quad_box, "text": content.strip()}) + return instances + + def parse_phrase_grounding_from_text_and_spans( + self, text: str, image_size: tuple[int, int] + ) -> list[dict[str, Any]]: + """ + Parse phrase grounding results. + + Args: + text (`str`): + The generated text. + image_size (`tuple[int, int]`): + Image size (width, height). + + Returns: + `list[dict[str, Any]]`: list of instances with 'bbox' and 'cat_name'. + """ + text = text.replace("", "").replace("", "").replace("", "") + phrase_pattern = r"([^<]+(?:){4,})" + phrases = re.findall(phrase_pattern, text) + text_pattern = r"^\s*(.*?)(?=||||||" + + instances = [] + for phrase_text in phrases: + phrase_text = phrase_text.replace("", "", 1).replace("", "", 1) + if not phrase_text: + continue + match = re.search(text_pattern, phrase_text) + if not match: + continue + phrase = match.group().strip() + if phrase in self.banned_grounding_tokens: + continue + boxes_matches = list(re.finditer(box_pattern, phrase_text)) + if not boxes_matches: + continue + bbox_bins = [[int(m.group(j)) for j in range(1, 5)] for m in boxes_matches] + bboxes = self.dequantize(torch.tensor(bbox_bins), size=image_size).tolist() + phrase = phrase.encode("ascii", "ignore").decode("ascii") + instances.append({"bbox": bboxes, "cat_name": phrase}) + return instances + + def _find_matched_token_indices(self, cur_span: tuple[int, int], token_spans: list[tuple[int, int]]) -> list[int]: + return [i for i, span in enumerate(token_spans) if not (span[1] <= cur_span[0] or span[0] >= cur_span[1])] + + def parse_description_with_bboxes_from_text_and_spans( + self, + text: str, + image_size: tuple[int, int], + allow_empty_phrase: bool = False, + ) -> list[dict[str, Any]]: + """ + Parse descriptions with bounding boxes. 
+ + Args: + text (`str`): + The generated text. + image_size (`tuple[int, int]`): + Image size (width, height). + allow_empty_phrase (`bool`, *optional*, defaults to `False`): + Allow phrases without text. + + Returns: + `list[dict[str, Any]]`: list of instances with 'bbox', 'cat_name', and optional 'score'. + """ + text = text.replace("", "").replace("", "").replace("", "") + + if allow_empty_phrase: + pattern = r"(?:(?:){4,})" + else: + pattern = r"([^<]+(?:){4,})" + phrases = re.findall(pattern, text) + + text_pattern = r"^\s*(.*?)(?=||||||" + + instances = [] + for phrase_text in phrases: + phrase_text = phrase_text.replace("", "", 1).replace("", "", 1) + if not phrase_text and not allow_empty_phrase: + continue + match = re.search(text_pattern, phrase_text) + if not match: + continue + phrase = match.group().strip() + boxes_matches = list(re.finditer(box_pattern, phrase_text)) + if not boxes_matches: + continue + bbox_bins = [[int(m.group(j)) for j in range(1, 5)] for m in boxes_matches] + bboxes = self.dequantize(torch.tensor(bbox_bins), size=image_size).tolist() + + phrase = phrase.encode("ascii", "ignore").decode("ascii") + for bbox in bboxes: + instance = {"bbox": bbox, "cat_name": phrase} + instances.append(instance) + + return instances + + def parse_description_with_polygons_from_text_and_spans( + self, + text: str, + image_size: tuple[int, int], + allow_empty_phrase: bool = False, + polygon_sep_token: str = "", + polygon_start_token: str = "", + polygon_end_token: str = "", + with_box_at_start: bool = False, + ) -> list[dict[str, Any]]: + """ + Parse descriptions with polygons. + + Args: + text (`str`): + The generated text. + image_size (`tuple[int, int]`): + Image size (width, height). + allow_empty_phrase (`bool`, *optional*, defaults to `False`): + Allow phrases without text. + polygon_sep_token (`str`, *optional*, defaults to ""): + Token separating polygons. + polygon_start_token (`str`, *optional*, defaults to ""): + Start token for polygons. + polygon_end_token (`str`, *optional*, defaults to ""): + End token for polygons. + with_box_at_start (`bool`, *optional*, defaults to `False`): + Whether a bounding box is at the start of polygons. + + Returns: + `list[dict[str, Any]]`: list of instances with 'polygons', 'cat_name', and optional 'bbox'. 
+ """ + text = text.replace("", "").replace("", "").replace("", "") + + if allow_empty_phrase: + pattern = rf"(?:(?:|{re.escape(polygon_sep_token)}|{re.escape(polygon_start_token)}|{re.escape(polygon_end_token)}){{4,}})" + else: + pattern = rf"([^<]+(?:|{re.escape(polygon_sep_token)}|{re.escape(polygon_start_token)}|{re.escape(polygon_end_token)}){{4,}})" + phrases = re.findall(pattern, text) + phrase_pattern = r"^\s*(.*?)(?=||||||)" + poly_instance_pattern = rf"{re.escape(polygon_start_token)}(.*?){re.escape(polygon_end_token)}" + box_pattern = rf"((?:)+)(?:{re.escape(polygon_sep_token)}|$)" + + instances = [] + for phrase_text in phrases: + phrase_text_strip = re.sub(r"^", "", phrase_text, count=1) + if not phrase_text_strip and not allow_empty_phrase: + continue + match = re.search(phrase_pattern, phrase_text_strip) + if not match: + continue + phrase = match.group().strip() + + if polygon_start_token in phrase_text and polygon_end_token in phrase_text: + poly_instances = [m.group(1) for m in re.finditer(poly_instance_pattern, phrase_text)] + else: + poly_instances = [phrase_text] + + for poly_inst in poly_instances: + poly_matches = list(re.finditer(box_pattern, poly_inst)) + if len(poly_matches) == 0: + continue + bbox = [] + polygons = [] + for poly_match in poly_matches: + poly_str = poly_match.group(1) + poly_bins = [int(m.group(1)) for m in re.finditer(r"", poly_str)] + if with_box_at_start and not bbox: + if len(poly_bins) > 4: + bbox = poly_bins[:4] + poly_bins = poly_bins[4:] + else: + bbox = [0, 0, 0, 0] + if len(poly_bins) % 2 == 1: + poly_bins = poly_bins[:-1] + poly_coords = ( + self.dequantize(torch.tensor(poly_bins).reshape(-1, 2), size=image_size).flatten().tolist() + ) + polygons.append(poly_coords) + + instance = {"cat_name": phrase, "polygons": polygons} + if bbox: + instance["bbox"] = self.dequantize(torch.tensor([bbox]), size=image_size)[0].tolist() + instances.append(instance) + return instances + + def __call__(self, text=None, sequence=None, image_size=None, parse_tasks=None) -> dict[str, Any]: + """ + Process model output and parse into task-specific results. + + Args: + text (`Optional[str]`, *optional*): + Generated text. Either this or `sequence` must be provided. + sequence (`Optional[Union[list[int], torch.Tensor]]`, *optional*): + Token sequence. Either this or `text` must be provided. + image_size (`Optional[tuple[int, int]]`, *optional*): + Image size (width, height) required for dequantization. + parse_tasks (`Optional[Union[str, list[str]]]`, *optional*): + Specific tasks to parse. If None, parse all supported tasks. + + Returns: + `dict[str, Any]`: Parsed results for each task, including the raw 'text'. 
+ """ + if parse_tasks is not None: + parse_tasks = [parse_tasks] if isinstance(parse_tasks, str) else parse_tasks + for task in parse_tasks: + if task not in self.parse_task_config.keys(): + raise ValueError(f"Unsupported parse task: {task}") + + if (text is None and sequence is None) or (text is not None and sequence is not None): + raise ValueError("Exactly one of 'text' or 'sequence' must be provided.") + + if sequence is not None: + if isinstance(sequence, torch.Tensor): + sequence = sequence.tolist() + sequence = sequence[1:] if sequence[0] == self.tokenizer.bos_token_id else sequence # Skip BOS if present + text, _ = self.decode_with_spans(sequence) + + parsed_dict = {"text": text} + + tasks_to_parse = parse_tasks or self.parse_task_config.keys() + for task in tasks_to_parse: + config = self.parse_task_config[task] + pattern = config.get("PATTERN") + + if task == "ocr": + parsed_dict["ocr"] = self.parse_ocr_from_text_and_spans( + text, pattern=pattern, image_size=image_size, area_threshold=config.get("AREA_THRESHOLD", 0.0) + ) + elif task == "phrase_grounding": + parsed_dict["phrase_grounding"] = self.parse_phrase_grounding_from_text_and_spans( + text, image_size=image_size + ) + elif task == "pure_text": + parsed_dict["pure_text"] = text + elif task == "description_with_bboxes": + parsed_dict["description_with_bboxes"] = self.parse_description_with_bboxes_from_text_and_spans( + text, image_size=image_size + ) + elif task == "description_with_polygons": + parsed_dict["description_with_polygons"] = self.parse_description_with_polygons_from_text_and_spans( + text, image_size=image_size + ) + elif task == "polygons": + parsed_dict["polygons"] = self.parse_description_with_polygons_from_text_and_spans( + text, image_size=image_size, allow_empty_phrase=True + ) + elif task == "bboxes": + parsed_dict["bboxes"] = self.parse_description_with_bboxes_from_text_and_spans( + text, image_size=image_size, allow_empty_phrase=True + ) + elif task == "description_with_bboxes_or_polygons": + if "" in text: + instances = self.parse_description_with_polygons_from_text_and_spans(text, image_size=image_size) + else: + instances = self.parse_description_with_bboxes_from_text_and_spans(text, image_size=image_size) + parsed_dict["description_with_bboxes_or_polygons"] = instances + else: + raise ValueError(f"task {task} is not supported") + + return parsed_dict + + +__all__ = ["Florence2Processor"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/fnet/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/fnet/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..756d690e72c11e8075429e3666e8579f0666d074 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/fnet/__init__.py @@ -0,0 +1,29 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_fnet import * + from .modeling_fnet import * + from .tokenization_fnet import * + from .tokenization_fnet_fast import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/fnet/configuration_fnet.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/fnet/configuration_fnet.py new file mode 100644 index 0000000000000000000000000000000000000000..24a578328565939b821cf8f9500559b2f585ea56 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/fnet/configuration_fnet.py @@ -0,0 +1,119 @@ +# coding=utf-8 +# Copyright 2021 Google AI and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""FNet model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class FNetConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`FNetModel`]. It is used to instantiate an FNet + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the FNet + [google/fnet-base](https://huggingface.co/google/fnet-base) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the FNet model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`FNetModel`] or [`TFFNetModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimension of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu_new"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). 
+ type_vocab_size (`int`, *optional*, defaults to 4): + The vocabulary size of the `token_type_ids` passed when calling [`FNetModel`] or [`TFFNetModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + use_tpu_fourier_optimizations (`bool`, *optional*, defaults to `False`): + Determines whether to use TPU optimized FFTs. If `True`, the model will favor axis-wise FFTs transforms. + Set to `False` for GPU/CPU hardware, in which case n-dimensional FFTs are used. + tpu_short_seq_length (`int`, *optional*, defaults to 512): + The sequence length that is expected by the model when using TPUs. This will be used to initialize the DFT + matrix only when *use_tpu_fourier_optimizations* is set to `True` and the input sequence is shorter than or + equal to 4096 tokens. + + Example: + + ```python + >>> from transformers import FNetConfig, FNetModel + + >>> # Initializing a FNet fnet-base style configuration + >>> configuration = FNetConfig() + + >>> # Initializing a model (with random weights) from the fnet-base style configuration + >>> model = FNetModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "fnet" + + def __init__( + self, + vocab_size=32000, + hidden_size=768, + num_hidden_layers=12, + intermediate_size=3072, + hidden_act="gelu_new", + hidden_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=4, + initializer_range=0.02, + layer_norm_eps=1e-12, + use_tpu_fourier_optimizations=False, + tpu_short_seq_length=512, + pad_token_id=3, + bos_token_id=1, + eos_token_id=2, + **kwargs, + ): + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.initializer_range = initializer_range + self.type_vocab_size = type_vocab_size + self.layer_norm_eps = layer_norm_eps + self.use_tpu_fourier_optimizations = use_tpu_fourier_optimizations + self.tpu_short_seq_length = tpu_short_seq_length + + +__all__ = ["FNetConfig"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/fnet/modeling_fnet.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/fnet/modeling_fnet.py new file mode 100644 index 0000000000000000000000000000000000000000..2ad09a3b268b358c1d8be42824cf4a8dbe23dcee --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/fnet/modeling_fnet.py @@ -0,0 +1,1093 @@ +# coding=utf-8 +# Copyright 2021 Google Research and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch FNet model.""" + +import warnings +from dataclasses import dataclass +from functools import partial +from typing import Optional, Union + +import torch +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ...utils import auto_docstring, is_scipy_available + + +if is_scipy_available(): + from scipy import linalg + +from ...activations import ACT2FN +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPooling, + MaskedLMOutput, + ModelOutput, + MultipleChoiceModelOutput, + NextSentencePredictorOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import apply_chunking_to_forward +from ...utils import logging +from .configuration_fnet import FNetConfig + + +logger = logging.get_logger(__name__) + + +# Adapted from https://github.com/google-research/google-research/blob/master/f_net/fourier.py +def _two_dim_matmul(x, matrix_dim_one, matrix_dim_two): + """Applies 2D matrix multiplication to 3D input arrays.""" + seq_length = x.shape[1] + matrix_dim_one = matrix_dim_one[:seq_length, :seq_length] + x = x.type(torch.complex64) + return torch.einsum("bij,jk,ni->bnk", x, matrix_dim_two, matrix_dim_one) + + +# # Adapted from https://github.com/google-research/google-research/blob/master/f_net/fourier.py +def two_dim_matmul(x, matrix_dim_one, matrix_dim_two): + return _two_dim_matmul(x, matrix_dim_one, matrix_dim_two) + + +# Adapted from https://github.com/google-research/google-research/blob/master/f_net/fourier.py +def fftn(x): + """ + Applies n-dimensional Fast Fourier Transform (FFT) to input array. + + Args: + x: Input n-dimensional array. + + Returns: + n-dimensional Fourier transform of input n-dimensional array. + """ + out = x + for axis in reversed(range(x.ndim)[1:]): # We don't need to apply FFT to last axis + out = torch.fft.fft(out, axis=axis) + return out + + +class FNetEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + # NOTE: This is the project layer and will be needed. The original code allows for different embedding and different model dimensions. 
+ self.projection = nn.Linear(config.hidden_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) + + self.register_buffer( + "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False + ) + + def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None): + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, :seq_length] + + # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs + # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves + # issue #5664 + if token_type_ids is None: + if hasattr(self, "token_type_ids"): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.projection(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class FNetBasicFourierTransform(nn.Module): + def __init__(self, config): + super().__init__() + self._init_fourier_transform(config) + + def _init_fourier_transform(self, config): + if not config.use_tpu_fourier_optimizations: + self.fourier_transform = partial(torch.fft.fftn, dim=(1, 2)) + elif config.max_position_embeddings <= 4096: + if is_scipy_available(): + self.register_buffer( + "dft_mat_hidden", torch.tensor(linalg.dft(config.hidden_size), dtype=torch.complex64) + ) + self.register_buffer( + "dft_mat_seq", torch.tensor(linalg.dft(config.tpu_short_seq_length), dtype=torch.complex64) + ) + self.fourier_transform = partial( + two_dim_matmul, matrix_dim_one=self.dft_mat_seq, matrix_dim_two=self.dft_mat_hidden + ) + else: + logging.warning( + "SciPy is needed for DFT matrix calculation and is not found. Using TPU optimized fast fourier" + " transform instead." + ) + self.fourier_transform = fftn + else: + self.fourier_transform = fftn + + def forward(self, hidden_states): + # NOTE: We do not use torch.vmap as it is not integrated into PyTorch stable versions. + # Interested users can modify the code to use vmap from the nightly versions, getting the vmap from here: + # https://pytorch.org/docs/master/generated/torch.vmap.html. Note that fourier transform methods will need + # change accordingly. 
+ + outputs = self.fourier_transform(hidden_states).real + return (outputs,) + + +class FNetBasicOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.LayerNorm(input_tensor + hidden_states) + return hidden_states + + +class FNetFourierTransform(nn.Module): + def __init__(self, config): + super().__init__() + self.self = FNetBasicFourierTransform(config) + self.output = FNetBasicOutput(config) + + def forward(self, hidden_states): + self_outputs = self.self(hidden_states) + fourier_output = self.output(self_outputs[0], hidden_states) + outputs = (fourier_output,) + return outputs + + +# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->FNet +class FNetIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->FNet +class FNetOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class FNetLayer(GradientCheckpointingLayer): + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 # The dimension which has the sequence length + self.fourier = FNetFourierTransform(config) + self.intermediate = FNetIntermediate(config) + self.output = FNetOutput(config) + + def forward(self, hidden_states): + self_fourier_outputs = self.fourier(hidden_states) + fourier_output = self_fourier_outputs[0] + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, fourier_output + ) + + outputs = (layer_output,) + + return outputs + + def feed_forward_chunk(self, fourier_output): + intermediate_output = self.intermediate(fourier_output) + layer_output = self.output(intermediate_output, fourier_output) + return layer_output + + +class FNetEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList([FNetLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward(self, hidden_states, output_hidden_states=False, return_dict=True): + all_hidden_states = () if output_hidden_states else None + + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_outputs = layer_module(hidden_states) + + hidden_states = layer_outputs[0] + + if output_hidden_states: + all_hidden_states = all_hidden_states + 
(hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states] if v is not None) + + return BaseModelOutput(last_hidden_state=hidden_states, hidden_states=all_hidden_states) + + +# Copied from transformers.models.bert.modeling_bert.BertPooler with Bert->FNet +class FNetPooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +# Copied from transformers.models.bert.modeling_bert.BertPredictionHeadTransform with Bert->FNet +class FNetPredictionHeadTransform(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class FNetLMPredictionHead(nn.Module): + def __init__(self, config): + super().__init__() + self.transform = FNetPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. 
+ self.decoder = nn.Linear(config.hidden_size, config.vocab_size) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + def _tie_weights(self) -> None: + # For accelerate compatibility and to not break backward compatibility + if self.decoder.bias.device.type == "meta": + self.decoder.bias = self.bias + else: + # To tie those two weights if they get disconnected (on TPU or when the bias is resized) + self.bias = self.decoder.bias + + +class FNetOnlyMLMHead(nn.Module): + def __init__(self, config): + super().__init__() + self.predictions = FNetLMPredictionHead(config) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +# Copied from transformers.models.bert.modeling_bert.BertOnlyNSPHead with Bert->FNet +class FNetOnlyNSPHead(nn.Module): + def __init__(self, config): + super().__init__() + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, pooled_output): + seq_relationship_score = self.seq_relationship(pooled_output) + return seq_relationship_score + + +# Copied from transformers.models.bert.modeling_bert.BertPreTrainingHeads with Bert->FNet +class FNetPreTrainingHeads(nn.Module): + def __init__(self, config): + super().__init__() + self.predictions = FNetLMPredictionHead(config) + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, sequence_output, pooled_output): + prediction_scores = self.predictions(sequence_output) + seq_relationship_score = self.seq_relationship(pooled_output) + return prediction_scores, seq_relationship_score + + +@auto_docstring +class FNetPreTrainedModel(PreTrainedModel): + config: FNetConfig + base_model_prefix = "fnet" + supports_gradient_checkpointing = True + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + # NOTE: Original code uses same initialization as weights for biases as well. + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +@dataclass +@auto_docstring( + custom_intro=""" + Output type of [`FNetForPreTraining`]. + """ +) +class FNetForPreTrainingOutput(ModelOutput): + r""" + loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): + Total loss as the sum of the masked language modeling loss and the next sequence prediction + (classification) loss. + prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`): + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation + before SoftMax). 
+ """ + + loss: Optional[torch.FloatTensor] = None + prediction_logits: Optional[torch.FloatTensor] = None + seq_relationship_logits: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + + +@auto_docstring +class FNetModel(FNetPreTrainedModel): + """ + + The model can behave as an encoder, following the architecture described in [FNet: Mixing Tokens with Fourier + Transforms](https://huggingface.co/papers/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon. + + """ + + def __init__(self, config, add_pooling_layer=True): + r""" + add_pooling_layer (bool, *optional*, defaults to `True`): + Whether to add a pooling layer + """ + super().__init__(config) + self.config = config + + self.embeddings = FNetEmbeddings(config) + self.encoder = FNetEncoder(config) + + self.pooler = FNetPooler(config) if add_pooling_layer else None + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, BaseModelOutput]: + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + batch_size, seq_length = input_shape + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + batch_size, seq_length = input_shape + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if ( + self.config.use_tpu_fourier_optimizations + and seq_length <= 4096 + and self.config.tpu_short_seq_length != seq_length + ): + raise ValueError( + "The `tpu_short_seq_length` in FNetConfig should be set equal to the sequence length being passed to" + " the model when using TPU optimizations." 
+ ) + + device = input_ids.device if input_ids is not None else inputs_embeds.device + + if token_type_ids is None: + if hasattr(self.embeddings, "token_type_ids"): + buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + ) + encoder_outputs = self.encoder( + embedding_output, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + + pooler_output = self.pooler(sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooler_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=sequence_output, + pooler_output=pooler_output, + hidden_states=encoder_outputs.hidden_states, + ) + + +@auto_docstring( + custom_intro=""" + FNet Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next + sentence prediction (classification)` head. + """ +) +class FNetForPreTraining(FNetPreTrainedModel): + _tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"] + + def __init__(self, config): + super().__init__(config) + + self.fnet = FNetModel(config) + self.cls = FNetPreTrainingHeads(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = new_embeddings.bias + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + next_sentence_label: Optional[torch.Tensor] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, FNetForPreTrainingOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + next_sentence_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair + (see `input_ids` docstring) Indices should be in `[0, 1]`: + + - 0 indicates sequence B is a continuation of sequence A, + - 1 indicates sequence B is a random sequence. 
+ + Example: + + ```python + >>> from transformers import AutoTokenizer, FNetForPreTraining + >>> import torch + + >>> tokenizer = AutoTokenizer.from_pretrained("google/fnet-base") + >>> model = FNetForPreTraining.from_pretrained("google/fnet-base") + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + >>> prediction_logits = outputs.prediction_logits + >>> seq_relationship_logits = outputs.seq_relationship_logits + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.fnet( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output, pooled_output = outputs[:2] + prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) + + total_loss = None + if labels is not None and next_sentence_label is not None: + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) + total_loss = masked_lm_loss + next_sentence_loss + + if not return_dict: + output = (prediction_scores, seq_relationship_score) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return FNetForPreTrainingOutput( + loss=total_loss, + prediction_logits=prediction_scores, + seq_relationship_logits=seq_relationship_score, + hidden_states=outputs.hidden_states, + ) + + +@auto_docstring +class FNetForMaskedLM(FNetPreTrainedModel): + _tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"] + + def __init__(self, config): + super().__init__(config) + + self.fnet = FNetModel(config) + self.cls = FNetOnlyMLMHead(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = new_embeddings.bias + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, MaskedLMOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.fnet( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return MaskedLMOutput(loss=masked_lm_loss, logits=prediction_scores, hidden_states=outputs.hidden_states) + + +@auto_docstring( + custom_intro=""" + FNet Model with a `next sentence prediction (classification)` head on top. + """ +) +class FNetForNextSentencePrediction(FNetPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.fnet = FNetModel(config) + self.cls = FNetOnlyNSPHead(config) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ) -> Union[tuple, NextSentencePredictorOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair + (see `input_ids` docstring). Indices should be in `[0, 1]`: + + - 0 indicates sequence B is a continuation of sequence A, + - 1 indicates sequence B is a random sequence. + + Example: + + ```python + >>> from transformers import AutoTokenizer, FNetForNextSentencePrediction + >>> import torch + + >>> tokenizer = AutoTokenizer.from_pretrained("google/fnet-base") + >>> model = FNetForNextSentencePrediction.from_pretrained("google/fnet-base") + >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." + >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light." 
+ >>> encoding = tokenizer(prompt, next_sentence, return_tensors="pt") + >>> outputs = model(**encoding, labels=torch.LongTensor([1])) + >>> logits = outputs.logits + >>> assert logits[0, 0] < logits[0, 1] # next sentence was random + ```""" + + if "next_sentence_label" in kwargs: + warnings.warn( + "The `next_sentence_label` argument is deprecated and will be removed in a future version, use" + " `labels` instead.", + FutureWarning, + ) + labels = kwargs.pop("next_sentence_label") + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.fnet( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + seq_relationship_scores = self.cls(pooled_output) + + next_sentence_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + next_sentence_loss = loss_fct(seq_relationship_scores.view(-1, 2), labels.view(-1)) + + if not return_dict: + output = (seq_relationship_scores,) + outputs[2:] + return ((next_sentence_loss,) + output) if next_sentence_loss is not None else output + + return NextSentencePredictorOutput( + loss=next_sentence_loss, + logits=seq_relationship_scores, + hidden_states=outputs.hidden_states, + ) + + +@auto_docstring( + custom_intro=""" + FNet Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled + output) e.g. for GLUE tasks. + """ +) +class FNetForSequenceClassification(FNetPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.fnet = FNetModel(config) + + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, SequenceClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.fnet( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput(loss=loss, logits=logits, hidden_states=outputs.hidden_states) + + +@auto_docstring +class FNetForMultipleChoice(FNetPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.fnet = FNetModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, 1) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, MultipleChoiceModelOutput]: + r""" + input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. 
This + is useful if you want more control over how to convert *input_ids* indices into associated vectors than the + model's internal embedding lookup matrix. + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., + num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See + `input_ids` above) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] + + input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None + token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None + inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) + if inputs_embeds is not None + else None + ) + + outputs = self.fnet( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.view(-1, num_choices) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + + if not return_dict: + output = (reshaped_logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return MultipleChoiceModelOutput(loss=loss, logits=reshaped_logits, hidden_states=outputs.hidden_states) + + +@auto_docstring +class FNetForTokenClassification(FNetPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.fnet = FNetModel(config) + + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, TokenClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.fnet( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + # Only keep active parts of the loss + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput(loss=loss, logits=logits, hidden_states=outputs.hidden_states) + + +@auto_docstring +class FNetForQuestionAnswering(FNetPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.num_labels = config.num_labels + + self.fnet = FNetModel(config) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + start_positions: Optional[torch.Tensor] = None, + end_positions: Optional[torch.Tensor] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, QuestionAnsweringModelOutput]: + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.fnet( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, start_logits=start_logits, end_logits=end_logits, hidden_states=outputs.hidden_states + ) + + +__all__ = [ + "FNetForMaskedLM", + "FNetForMultipleChoice", + "FNetForNextSentencePrediction", + "FNetForPreTraining", + "FNetForQuestionAnswering", + "FNetForSequenceClassification", + "FNetForTokenClassification", + "FNetLayer", + "FNetModel", + "FNetPreTrainedModel", +] diff --git 
a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/fnet/tokenization_fnet.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/fnet/tokenization_fnet.py new file mode 100644 index 0000000000000000000000000000000000000000..72aa202612e0bd451c413d071e84443c8c384e6e --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/fnet/tokenization_fnet.py @@ -0,0 +1,314 @@ +# coding=utf-8 +# Copyright 2021 Google Research, Google AI, Google Brain and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for FNet model.""" + +import os +import unicodedata +from shutil import copyfile +from typing import Any, Optional + +import sentencepiece as spm + +from ...tokenization_utils import AddedToken, PreTrainedTokenizer +from ...utils import logging +from ...utils.import_utils import requires + + +logger = logging.get_logger(__name__) +VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"} + + +SPIECE_UNDERLINE = "▁" + + +@requires(backends=("sentencepiece",)) +class FNetTokenizer(PreTrainedTokenizer): + """ + Construct an FNet tokenizer. Adapted from [`AlbertTokenizer`]. Based on + [SentencePiece](https://github.com/google/sentencepiece). This tokenizer inherits from [`PreTrainedTokenizer`] + which contains most of the main methods. Users should refer to this superclass for more information regarding those + methods. + + Args: + vocab_file (`str`): + [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that + contains the vocabulary necessary to instantiate a tokenizer. + do_lower_case (`bool`, *optional*, defaults to `False`): + Whether or not to lowercase the input when tokenizing. + remove_space (`bool`, *optional*, defaults to `True`): + Whether or not to strip the text when tokenizing (removing excess spaces before and after the string). + keep_accents (`bool`, *optional*, defaults to `True`): + Whether or not to keep accents when tokenizing. + unk_token (`str`, *optional*, defaults to `"<unk>"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + sep_token (`str`, *optional*, defaults to `"[SEP]"`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + pad_token (`str`, *optional*, defaults to `"<pad>"`): + The token used for padding, for example when batching sequences of different lengths. + cls_token (`str`, *optional*, defaults to `"[CLS]"`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + mask_token (`str`, *optional*, defaults to `"[MASK]"`): + The token used for masking values.
This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + sp_model_kwargs (`dict`, *optional*): + Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for + SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, + to set: + + - `enable_sampling`: Enable subword regularization. + - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - `nbest_size = {0,1}`: No sampling is performed. + - `nbest_size > 1`: samples from the nbest_size results. + - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. + + Attributes: + sp_model (`SentencePieceProcessor`): + The *SentencePiece* processor that is used for every conversion (string, tokens and IDs). + """ + + vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "token_type_ids"] + + def __init__( + self, + vocab_file, + do_lower_case=False, + remove_space=True, + keep_accents=True, + unk_token="<unk>", + sep_token="[SEP]", + pad_token="<pad>", + cls_token="[CLS]", + mask_token="[MASK]", + sp_model_kwargs: Optional[dict[str, Any]] = None, + **kwargs, + ) -> None: + # Mask token behave like a normal word, i.e. include the space before it and + # is included in the raw text, there should be a match in a non-normalized sentence. + mask_token = AddedToken(mask_token, lstrip=True, special=True) if isinstance(mask_token, str) else mask_token + cls_token = AddedToken(cls_token, special=True) if isinstance(cls_token, str) else cls_token + sep_token = AddedToken(sep_token, special=True) if isinstance(sep_token, str) else sep_token + mask_token = AddedToken(mask_token, special=True) if isinstance(mask_token, str) else mask_token + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + + self.do_lower_case = do_lower_case + self.remove_space = remove_space + self.keep_accents = keep_accents + self.vocab_file = vocab_file + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(vocab_file) + + super().__init__( + do_lower_case=do_lower_case, + remove_space=remove_space, + keep_accents=keep_accents, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + sp_model_kwargs=self.sp_model_kwargs, + **kwargs, + ) + + @property + def vocab_size(self): + return len(self.sp_model) + + def get_vocab(self): + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def __getstate__(self): + state = self.__dict__.copy() + state["sp_model"] = None + return state + + def __setstate__(self, d): + self.__dict__ = d + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(self.vocab_file) + + def preprocess_text(self, inputs): + if self.remove_space: + outputs = " ".join(inputs.strip().split()) + else: + outputs = inputs + outputs = outputs.replace("``", '"').replace("''", '"') + + if not self.keep_accents: + outputs = unicodedata.normalize("NFKD", outputs) + outputs = "".join([c for c in outputs if not unicodedata.combining(c)]) +
if self.do_lower_case: + outputs = outputs.lower() + + return outputs + + def _tokenize(self, text: str) -> list[str]: + """Tokenize a string.""" + text = self.preprocess_text(text) + pieces = self.sp_model.encode(text, out_type=str) + new_pieces = [] + for piece in pieces: + if len(piece) > 1 and piece[-1] == "," and piece[-2].isdigit(): + cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, "")) + if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE: + if len(cur_pieces[0]) == 1: + cur_pieces = cur_pieces[1:] + else: + cur_pieces[0] = cur_pieces[0][1:] + cur_pieces.append(piece[-1]) + new_pieces.extend(cur_pieces) + else: + new_pieces.append(piece) + + return new_pieces + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.sp_model.PieceToId(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.sp_model.IdToPiece(index) + + # Copied from transformers.models.albert.tokenization_albert.AlbertTokenizer.convert_tokens_to_string + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + current_sub_tokens = [] + out_string = "" + prev_is_special = False + for token in tokens: + # make sure that special tokens are not decoded using sentencepiece model + if token in self.all_special_tokens: + if not prev_is_special: + out_string += " " + out_string += self.sp_model.decode(current_sub_tokens) + token + prev_is_special = True + current_sub_tokens = [] + else: + current_sub_tokens.append(token) + prev_is_special = False + out_string += self.sp_model.decode(current_sub_tokens) + return out_string.strip() + + def _decode( + self, + token_ids: list[int], + skip_special_tokens: bool = False, + clean_up_tokenization_spaces: Optional[bool] = None, + spaces_between_special_tokens: bool = False, + **kwargs, + ) -> str: + text = super()._decode( + token_ids=token_ids, + skip_special_tokens=skip_special_tokens, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + spaces_between_special_tokens=spaces_between_special_tokens, + **kwargs, + ) + # Mimic the behavior of the Rust tokenizer: + # No space after <unk> + if not spaces_between_special_tokens: + text = text.replace("<unk> ", "<unk>") + return text + + def build_inputs_with_special_tokens( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. An FNet sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + if token_ids_1 is None: + return cls + token_ids_0 + sep + return cls + token_ids_0 + sep + token_ids_1 + sep + + def get_special_tokens_mask( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False + ) -> list[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added.
This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file): + copyfile(self.vocab_file, out_vocab_file) + elif not os.path.isfile(self.vocab_file): + with open(out_vocab_file, "wb") as fi: + content_spiece_model = self.sp_model.serialized_model_proto() + fi.write(content_spiece_model) + + return (out_vocab_file,) + + +__all__ = ["FNetTokenizer"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/fnet/tokenization_fnet_fast.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/fnet/tokenization_fnet_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..4aab7997650f42bc0a7c185c5fa876a85c157a41 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/fnet/tokenization_fnet_fast.py @@ -0,0 +1,155 @@ +# coding=utf-8 +# Copyright 2021 Google AI, Google Brain and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for FNet model.""" + +import os +from shutil import copyfile +from typing import Optional + +from ...tokenization_utils import AddedToken +from ...tokenization_utils_fast import PreTrainedTokenizerFast +from ...utils import is_sentencepiece_available, logging + + +if is_sentencepiece_available(): + from .tokenization_fnet import FNetTokenizer +else: + FNetTokenizer = None + +logger = logging.get_logger(__name__) +VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"} + + +SPIECE_UNDERLINE = "▁" + + +class FNetTokenizerFast(PreTrainedTokenizerFast): + """ + Construct a "fast" FNetTokenizer (backed by HuggingFace's *tokenizers* library). Adapted from + [`AlbertTokenizerFast`]. 
Based on + [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models). This + tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods + + Args: + vocab_file (`str`): + [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that + contains the vocabulary necessary to instantiate a tokenizer. + do_lower_case (`bool`, *optional*, defaults to `False`): + Whether or not to lowercase the input when tokenizing. + remove_space (`bool`, *optional*, defaults to `True`): + Whether or not to strip the text when tokenizing (removing excess spaces before and after the string). + keep_accents (`bool`, *optional*, defaults to `True`): + Whether or not to keep accents when tokenizing. + unk_token (`str`, *optional*, defaults to `"<unk>"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + sep_token (`str`, *optional*, defaults to `"[SEP]"`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + pad_token (`str`, *optional*, defaults to `"<pad>"`): + The token used for padding, for example when batching sequences of different lengths. + cls_token (`str`, *optional*, defaults to `"[CLS]"`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + mask_token (`str`, *optional*, defaults to `"[MASK]"`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + """ + + vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "token_type_ids"] + slow_tokenizer_class = FNetTokenizer + + def __init__( + self, + vocab_file=None, + tokenizer_file=None, + do_lower_case=False, + remove_space=True, + keep_accents=True, + unk_token="<unk>", + sep_token="[SEP]", + pad_token="<pad>", + cls_token="[CLS]", + mask_token="[MASK]", + **kwargs, + ): + # Mask token behave like a normal word, i.e. include the space before it and + # is included in the raw text, there should be a match in a non-normalized sentence.
+ mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token + sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token + + super().__init__( + vocab_file, + tokenizer_file=tokenizer_file, + do_lower_case=do_lower_case, + remove_space=remove_space, + keep_accents=keep_accents, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + **kwargs, + ) + + self.do_lower_case = do_lower_case + self.remove_space = remove_space + self.keep_accents = keep_accents + self.vocab_file = vocab_file + + def build_inputs_with_special_tokens( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. An FNet sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + if token_ids_1 is None: + return cls + token_ids_0 + sep + return cls + token_ids_0 + sep + token_ids_1 + sep + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): + copyfile(self.vocab_file, out_vocab_file) + + return (out_vocab_file,) + + +__all__ = ["FNetTokenizerFast"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/focalnet/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/focalnet/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5dec8135f3b3030b20691e761483e5994ba441f0 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/focalnet/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_focalnet import * + from .modeling_focalnet import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/focalnet/configuration_focalnet.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/focalnet/configuration_focalnet.py new file mode 100644 index 0000000000000000000000000000000000000000..e412e3824e163379abbca82e31f5af01cb18c7fc --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/focalnet/configuration_focalnet.py @@ -0,0 +1,164 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""FocalNet model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging +from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices + + +logger = logging.get_logger(__name__) + + +class FocalNetConfig(BackboneConfigMixin, PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`FocalNetModel`]. It is used to instantiate a + FocalNet model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the FocalNet + [microsoft/focalnet-tiny](https://huggingface.co/microsoft/focalnet-tiny) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 4): + The size (resolution) of each patch in the embeddings layer. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + embed_dim (`int`, *optional*, defaults to 96): + Dimensionality of patch embedding. + use_conv_embed (`bool`, *optional*, defaults to `False`): + Whether to use convolutional embedding. The authors noted that using convolutional embedding usually + improve the performance, but it's not used by default. + hidden_sizes (`list[int]`, *optional*, defaults to `[192, 384, 768, 768]`): + Dimensionality (hidden size) at each stage. + depths (`list(int)`, *optional*, defaults to `[2, 2, 6, 2]`): + Depth (number of layers) of each stage in the encoder. + focal_levels (`list(int)`, *optional*, defaults to `[2, 2, 2, 2]`): + Number of focal levels in each layer of the respective stages in the encoder. + focal_windows (`list(int)`, *optional*, defaults to `[3, 3, 3, 3]`): + Focal window size in each layer of the respective stages in the encoder. 
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder. If string, `"gelu"`, `"relu"`, + `"selu"` and `"gelu_new"` are supported. + mlp_ratio (`float`, *optional*, defaults to 4.0): + Ratio of MLP hidden dimensionality to embedding dimensionality. + hidden_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the embeddings and encoder. + drop_path_rate (`float`, *optional*, defaults to 0.1): + Stochastic depth rate. + use_layerscale (`bool`, *optional*, defaults to `False`): + Whether to use layer scale in the encoder. + layerscale_value (`float`, *optional*, defaults to 0.0001): + The initial value of the layer scale. + use_post_layernorm (`bool`, *optional*, defaults to `False`): + Whether to use post layer normalization in the encoder. + use_post_layernorm_in_modulation (`bool`, *optional*, defaults to `False`): + Whether to use post layer normalization in the modulation layer. + normalize_modulator (`bool`, *optional*, defaults to `False`): + Whether to normalize the modulator. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the layer normalization layers. + encoder_stride (`int`, *optional*, defaults to 32): + Factor to increase the spatial resolution by in the decoder head for masked image modeling. + out_features (`list[str]`, *optional*): + If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc. + (depending on how many stages the model has). If unset and `out_indices` is set, will default to the + corresponding stages. If unset and `out_indices` is unset, will default to the last stage. Must be in the + same order as defined in the `stage_names` attribute. + out_indices (`list[int]`, *optional*): + If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how + many stages the model has). If unset and `out_features` is set, will default to the corresponding stages. + If unset and `out_features` is unset, will default to the last stage. Must be in the + same order as defined in the `stage_names` attribute. 
+ + Example: + + ```python + >>> from transformers import FocalNetConfig, FocalNetModel + + >>> # Initializing a FocalNet microsoft/focalnet-tiny style configuration + >>> configuration = FocalNetConfig() + + >>> # Initializing a model (with random weights) from the microsoft/focalnet-tiny style configuration + >>> model = FocalNetModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "focalnet" + + def __init__( + self, + image_size=224, + patch_size=4, + num_channels=3, + embed_dim=96, + use_conv_embed=False, + hidden_sizes=[192, 384, 768, 768], + depths=[2, 2, 6, 2], + focal_levels=[2, 2, 2, 2], + focal_windows=[3, 3, 3, 3], + hidden_act="gelu", + mlp_ratio=4.0, + hidden_dropout_prob=0.0, + drop_path_rate=0.1, + use_layerscale=False, + layerscale_value=1e-4, + use_post_layernorm=False, + use_post_layernorm_in_modulation=False, + normalize_modulator=False, + initializer_range=0.02, + layer_norm_eps=1e-5, + encoder_stride=32, + out_features=None, + out_indices=None, + **kwargs, + ): + super().__init__(**kwargs) + + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.embed_dim = embed_dim + self.use_conv_embed = use_conv_embed + self.hidden_sizes = hidden_sizes + self.depths = depths + self.focal_levels = focal_levels + self.focal_windows = focal_windows + self.hidden_act = hidden_act + self.mlp_ratio = mlp_ratio + self.hidden_dropout_prob = hidden_dropout_prob + self.drop_path_rate = drop_path_rate + self.use_layerscale = use_layerscale + self.layerscale_value = layerscale_value + self.use_post_layernorm = use_post_layernorm + self.use_post_layernorm_in_modulation = use_post_layernorm_in_modulation + self.normalize_modulator = normalize_modulator + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.encoder_stride = encoder_stride + self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(self.depths) + 1)] + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + out_features=out_features, out_indices=out_indices, stage_names=self.stage_names + ) + + +__all__ = ["FocalNetConfig"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/gemma2/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/gemma2/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..18905bac42cc6b19f21e069355504e46d070d814 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/gemma2/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_gemma2 import * + from .modeling_gemma2 import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/gemma2/configuration_gemma2.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/gemma2/configuration_gemma2.py new file mode 100644 index 0000000000000000000000000000000000000000..d43ec4c47371411c0b15b34afe4d82456d4f0cf8 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/gemma2/configuration_gemma2.py @@ -0,0 +1,182 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/gemma2/modular_gemma2.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_gemma2.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2024 Google Inc. HuggingFace Inc. team. All rights reserved. +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from ...configuration_utils import PretrainedConfig, layer_type_validation + + +class Gemma2Config(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Gemma2Model`]. It is used to instantiate an Gemma2 + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the Gemma2-7B. + e.g. [google/gemma2-7b](https://huggingface.co/google/gemma2-7b) + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + Args: + vocab_size (`int`, *optional*, defaults to 256000): + Vocabulary size of the Gemma2 model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`Gemma2Model`] + hidden_size (`int`, *optional*, defaults to 2304): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 9216): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 26): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer decoder. + num_key_value_heads (`int`, *optional*, defaults to 4): + This is the number of key_value heads that should be used to implement Grouped Query Attention. 
If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details, check out [this + paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to + `num_attention_heads`. + head_dim (`int`, *optional*, defaults to 256): + The attention head dimension. + hidden_activation (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`): + The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"` + if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function. + max_position_embeddings (`int`, *optional*, defaults to 8192): + The maximum sequence length that this model might ever be used with. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + pad_token_id (`int`, *optional*, defaults to 0): + Padding token id. + eos_token_id (`int`, *optional*, defaults to 1): + End of stream token id. + bos_token_id (`int`, *optional*, defaults to 2): + Beginning of stream token id. + tie_word_embeddings (`bool`, *optional*, defaults to `True`): + Whether to tie weight embeddings + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): + Whether to use a bias in the query, key, value and output projection layers during self-attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + query_pre_attn_scalar (`float`, *optional*, defaults to 256): + scaling factor used on the attention scores + sliding_window (`int`, *optional*, defaults to 4096): + in Gemma2, every other layer uses sliding window attention. This is the size of the sliding window. + layer_types (`list`, *optional*): + Attention pattern for each layer. + final_logit_softcapping (`float`, *optional*, defaults to 30.0): + scaling factor when applying tanh softcapping on the logits. + attn_logit_softcapping (`float`, *optional*, defaults to 50.0): + scaling factor when applying tanh softcapping on the attention scores. 
+ + ```python + >>> from transformers import Gemma2Model, Gemma2Config + >>> # Initializing a Gemma2 gemma2-7b style configuration + >>> configuration = Gemma2Config() + >>> # Initializing a model from the gemma2-7b style configuration + >>> model = Gemma2Model(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "gemma2" + keys_to_ignore_at_inference = ["past_key_values"] + base_model_tp_plan = { + "layers.*.self_attn.q_proj": "colwise", + "layers.*.self_attn.k_proj": "colwise", + "layers.*.self_attn.v_proj": "colwise", + "layers.*.self_attn.o_proj": "rowwise", + "layers.*.mlp.gate_proj": "colwise", + "layers.*.mlp.up_proj": "colwise", + "layers.*.mlp.down_proj": "rowwise", + } + base_model_pp_plan = { + "embed_tokens": (["input_ids"], ["inputs_embeds"]), + "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), + "norm": (["hidden_states"], ["hidden_states"]), + } + + def __init__( + self, + vocab_size=256000, + hidden_size=2304, + intermediate_size=9216, + num_hidden_layers=26, + num_attention_heads=8, + num_key_value_heads=4, + head_dim=256, + hidden_activation="gelu_pytorch_tanh", + max_position_embeddings=8192, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=0, + eos_token_id=1, + bos_token_id=2, + tie_word_embeddings=True, + rope_theta=10000.0, + attention_bias=False, + attention_dropout=0.0, + query_pre_attn_scalar=256, + sliding_window=4096, + layer_types=None, + final_logit_softcapping=30.0, + attn_logit_softcapping=50.0, + **kwargs, + ): + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.head_dim = head_dim + self.num_key_value_heads = num_key_value_heads + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + self.hidden_activation = hidden_activation + self.query_pre_attn_scalar = query_pre_attn_scalar + self.sliding_window = sliding_window + self.final_logit_softcapping = final_logit_softcapping + self.attn_logit_softcapping = attn_logit_softcapping + self.layer_types = layer_types + + if self.layer_types is None: + self.layer_types = [ + "sliding_attention" if bool((i + 1) % 2) else "full_attention" for i in range(self.num_hidden_layers) + ] + layer_type_validation(self.layer_types, self.num_hidden_layers) + + +__all__ = ["Gemma2Config"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/gemma2/modeling_gemma2.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/gemma2/modeling_gemma2.py new file mode 100644 index 0000000000000000000000000000000000000000..ec2f1521ef859c53673d7e71e751a3215e7a62d2 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/gemma2/modeling_gemma2.py @@ -0,0 +1,598 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/gemma2/modular_gemma2.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. 
If any change should be done, please apply the change to the +# modular_gemma2.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2024 Google Inc. HuggingFace Inc. team. All rights reserved. +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Callable, Optional, Union + +import torch +import torch.nn as nn + +from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache +from ...generation import GenerationMixin +from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask +from ...modeling_flash_attention_utils import FlashAttentionKwargs +from ...modeling_layers import ( + GenericForSequenceClassification, + GenericForTokenClassification, + GradientCheckpointingLayer, +) +from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast +from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...processing_utils import Unpack +from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging +from ...utils.deprecation import deprecate_kwarg +from ...utils.generic import check_model_inputs +from .configuration_gemma2 import Gemma2Config + + +logger = logging.get_logger(__name__) + + +class Gemma2RMSNorm(nn.Module): + def __init__(self, dim: int, eps: float = 1e-6): + super().__init__() + self.eps = eps + self.weight = nn.Parameter(torch.zeros(dim)) + + def _norm(self, x): + return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) + + def forward(self, x): + output = self._norm(x.float()) + # Llama does x.to(float16) * w whilst Gemma2 is (x * w).to(float16) + # See https://github.com/huggingface/transformers/pull/29402 + output = output * (1.0 + self.weight.float()) + return output.type_as(x) + + def extra_repr(self): + return f"{tuple(self.weight.shape)}, eps={self.eps}" + + +class Gemma2MLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.act_fn = ACT2FN[config.hidden_activation] + + def forward(self, x): + down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + return down_proj + + +class Gemma2RotaryEmbedding(nn.Module): + inv_freq: torch.Tensor # fix linting for `register_buffer` + + def __init__(self, config: Gemma2Config, device=None): + super().__init__() + # BC: "rope_type" was originally "type" + if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): + self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) + else: + self.rope_type = "default" + 
self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + + inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = self.inv_freq + + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) + def forward(self, x, position_ids): + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) + position_ids_expanded = position_ids[:, None, :].float() + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() * self.attention_scaling + sin = emb.sin() * self.attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). 
The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + dropout: float = 0.0, + scaling: Optional[float] = None, + softcap: Optional[float] = None, + **kwargs, +) -> tuple[torch.Tensor, torch.Tensor]: + if scaling is None: + scaling = module.head_dim**-0.5 + + key_states = repeat_kv(key, module.num_key_value_groups) + value_states = repeat_kv(value, module.num_key_value_groups) + + attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling + + if softcap is not None: + attn_weights = attn_weights / softcap + attn_weights = torch.tanh(attn_weights) + attn_weights = attn_weights * softcap + if attention_mask is not None: # no matter the length, we just slice it + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + attn_output = torch.matmul(attn_weights, value_states) + attn_output = attn_output.transpose(1, 2).contiguous() + return attn_output, attn_weights + + +class Gemma2Attention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: Gemma2Config, layer_idx: int): + super().__init__() + self.config = config + self.layer_idx = layer_idx + self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) + self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads + self.scaling = config.query_pre_attn_scalar**-0.5 + self.attention_dropout = self.config.attention_dropout + self.is_causal = True + + self.q_proj = nn.Linear( + config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias + ) + self.k_proj = nn.Linear( + config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias + ) + self.v_proj = nn.Linear( + config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias + ) + self.o_proj = nn.Linear( + config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias + ) + self.attn_logit_softcapping = self.config.attn_logit_softcapping + self.sliding_window = config.sliding_window if config.layer_types[layer_idx] == "sliding_attention" else None + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + attention_mask: Optional[torch.Tensor], + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[FlashAttentionKwargs], + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, self.head_dim) + + query_states = 
self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2) + key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2) + value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_values is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=self.attention_dropout if self.training else 0.0, + scaling=self.scaling, + sliding_window=self.sliding_window, + softcap=self.attn_logit_softcapping, + **kwargs, + ) + + attn_output = attn_output.reshape(*input_shape, -1).contiguous() + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights + + +class Gemma2DecoderLayer(GradientCheckpointingLayer): + def __init__(self, config: Gemma2Config, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + self.config = config + self.attention_type = config.layer_types[layer_idx] + self.self_attn = Gemma2Attention(config=config, layer_idx=layer_idx) + self.mlp = Gemma2MLP(config) + self.input_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.pre_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, + ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights = self.self_attn( + hidden_states=hidden_states, + position_embeddings=position_embeddings, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.pre_feedforward_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = self.post_feedforward_layernorm(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + return outputs + + +@auto_docstring +class Gemma2PreTrainedModel(PreTrainedModel): + 
config: Gemma2Config + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["Gemma2DecoderLayer"] + _skip_keys_device_placement = ["past_key_values"] + _supports_flash_attn = True + _supports_sdpa = True + _supports_flex_attn = True + + _can_compile_fullgraph = True + _supports_attention_backend = True + _can_record_outputs = { + "hidden_states": Gemma2DecoderLayer, + "attentions": Gemma2Attention, + } + + def _init_weights(self, module): + super()._init_weights(module) + + # We initialize with 0s to be 1 centered as the RMSNorm here does (1 + weight) + if "RMSNorm" in module.__class__.__name__: + module.weight.data.zero_() + + +@auto_docstring +class Gemma2Model(Gemma2PreTrainedModel): + def __init__(self, config: Gemma2Config): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList( + [Gemma2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.norm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.rotary_emb = Gemma2RotaryEmbedding(config=config) + self.gradient_checkpointing = False + + # Initialize weights and apply final processing + self.post_init() + + @check_model_inputs + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> BaseModelOutputWithPast: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if self.gradient_checkpointing and self.training and use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`." + ) + use_cache = False + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if use_cache and past_key_values is None and not self.training: + past_key_values = DynamicCache(config=self.config) + + if cache_position is None: + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + cache_position = torch.arange( + past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device + ) + + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + # It may already have been prepared by e.g. 
`generate` + if not isinstance(causal_mask_mapping := attention_mask, dict): + # Prepare mask arguments + mask_kwargs = { + "config": self.config, + "input_embeds": inputs_embeds, + "attention_mask": attention_mask, + "cache_position": cache_position, + "past_key_values": past_key_values, + "position_ids": position_ids, + } + # Create the masks + causal_mask_mapping = { + "full_attention": create_causal_mask(**mask_kwargs), + "sliding_attention": create_sliding_window_causal_mask(**mask_kwargs), + } + + # embed positions + hidden_states = inputs_embeds + + # create position embeddings to be shared across the decoder layers + position_embeddings = self.rotary_emb(hidden_states, position_ids) + + # normalized + # Gemma2 downcasts the below to float16, causing sqrt(3072)=55.4256 to become 55.5 + # See https://github.com/huggingface/transformers/pull/29402 + normalizer = torch.tensor(self.config.hidden_size**0.5, dtype=hidden_states.dtype) + hidden_states = hidden_states * normalizer + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + + for decoder_layer in self.layers[: self.config.num_hidden_layers]: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + layer_outputs = decoder_layer( + hidden_states, + position_embeddings=position_embeddings, + attention_mask=causal_mask_mapping[decoder_layer.attention_type], + position_ids=position_ids, + past_key_values=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + if output_hidden_states: + all_hidden_states += (hidden_states,) + + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=past_key_values, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +@auto_docstring +class Gemma2ForCausalLM(Gemma2PreTrainedModel, GenerationMixin): + _tied_weights_keys = ["lm_head.weight"] + _tp_plan = {"lm_head": "colwise_rep"} + _pp_plan = {"lm_head": (["hidden_states"], ["logits"])} + + def __init__(self, config): + super().__init__(config) + self.model = Gemma2Model(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **kwargs, + ) -> CausalLMOutputWithPast: + r""" + Example: + + ```python + >>> from transformers import AutoTokenizer, Gemma2ForCausalLM + + >>> model = Gemma2ForCausalLM.from_pretrained("google/gemma-2-9b") + >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b") + + >>> prompt = "What is your favorite condiment?" 
+ >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "What is your favorite condiment?" + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs: BaseModelOutputWithPast = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs.last_hidden_state + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) + if self.config.final_logit_softcapping is not None: + logits = logits / self.config.final_logit_softcapping + logits = torch.tanh(logits) + logits = logits * self.config.final_logit_softcapping + + loss = None + if labels is not None: + loss = self.loss_function(logits, labels, self.vocab_size, **kwargs) + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class Gemma2ForSequenceClassification(GenericForSequenceClassification, Gemma2PreTrainedModel): + pass + + +class Gemma2ForTokenClassification(GenericForTokenClassification, Gemma2PreTrainedModel): + pass + + +__all__ = [ + "Gemma2ForCausalLM", + "Gemma2Model", + "Gemma2PreTrainedModel", + "Gemma2ForSequenceClassification", + "Gemma2ForTokenClassification", +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/gemma2/modular_gemma2.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/gemma2/modular_gemma2.py new file mode 100644 index 0000000000000000000000000000000000000000..e54795019c7f0a9cf356f3c186a5fd2735eedf2c --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/gemma2/modular_gemma2.py @@ -0,0 +1,587 @@ +# coding=utf-8 +# Copyright 2024 Google Inc. HuggingFace Inc. team. All rights reserved. +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
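
Both `eager_attention_forward` and `Gemma2ForCausalLM.forward` in the modeling file above apply the same tanh softcapping (divide by the cap, tanh, multiply back). A minimal standalone sketch of that operation follows, using the `attn_logit_softcapping` default of 50.0 from the config; the `softcap` helper name and tensor shapes are illustrative only, not part of transformers:

```python
import torch

def softcap(scores: torch.Tensor, cap: float) -> torch.Tensor:
    # Same three steps used for attn_logit_softcapping and final_logit_softcapping
    # in the Gemma2 modeling code above: scale down, squash with tanh, scale back up.
    # The result is smoothly bounded to the open interval (-cap, cap).
    return torch.tanh(scores / cap) * cap

scores = torch.randn(2, 8, 16, 16) * 100.0  # illustrative attention logits
capped = softcap(scores, cap=50.0)          # attn_logit_softcapping default
assert capped.abs().max().item() < 50.0
```
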
+from typing import Callable, Optional, Union + +import torch +import torch.nn as nn + +from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache +from ...configuration_utils import PretrainedConfig, layer_type_validation +from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask +from ...modeling_flash_attention_utils import FlashAttentionKwargs +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS +from ...processing_utils import Unpack +from ...utils import TransformersKwargs, logging +from ...utils.deprecation import deprecate_kwarg +from ..gemma.modeling_gemma import ( + GemmaAttention, + GemmaForCausalLM, + GemmaForSequenceClassification, + GemmaForTokenClassification, + GemmaMLP, + GemmaModel, + GemmaPreTrainedModel, + GemmaRMSNorm, + GemmaRotaryEmbedding, + apply_rotary_pos_emb, + repeat_kv, +) + + +logger = logging.get_logger(__name__) + + +class Gemma2Config(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Gemma2Model`]. It is used to instantiate an Gemma2 + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the Gemma2-7B. + e.g. [google/gemma2-7b](https://huggingface.co/google/gemma2-7b) + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + Args: + vocab_size (`int`, *optional*, defaults to 256000): + Vocabulary size of the Gemma2 model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`Gemma2Model`] + hidden_size (`int`, *optional*, defaults to 2304): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 9216): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 26): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer decoder. + num_key_value_heads (`int`, *optional*, defaults to 4): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details, check out [this + paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to + `num_attention_heads`. + head_dim (`int`, *optional*, defaults to 256): + The attention head dimension. + hidden_activation (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`): + The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"` + if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function. + max_position_embeddings (`int`, *optional*, defaults to 8192): + The maximum sequence length that this model might ever be used with. 
+ initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + pad_token_id (`int`, *optional*, defaults to 0): + Padding token id. + eos_token_id (`int`, *optional*, defaults to 1): + End of stream token id. + bos_token_id (`int`, *optional*, defaults to 2): + Beginning of stream token id. + tie_word_embeddings (`bool`, *optional*, defaults to `True`): + Whether to tie weight embeddings + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): + Whether to use a bias in the query, key, value and output projection layers during self-attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + query_pre_attn_scalar (`float`, *optional*, defaults to 256): + scaling factor used on the attention scores + sliding_window (`int`, *optional*, defaults to 4096): + in Gemma2, every other layer uses sliding window attention. This is the size of the sliding window. + layer_types (`list`, *optional*): + Attention pattern for each layer. + final_logit_softcapping (`float`, *optional*, defaults to 30.0): + scaling factor when applying tanh softcapping on the logits. + attn_logit_softcapping (`float`, *optional*, defaults to 50.0): + scaling factor when applying tanh softcapping on the attention scores. 
+ + ```python + >>> from transformers import Gemma2Model, Gemma2Config + >>> # Initializing a Gemma2 gemma2-7b style configuration + >>> configuration = Gemma2Config() + >>> # Initializing a model from the gemma2-7b style configuration + >>> model = Gemma2Model(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "gemma2" + keys_to_ignore_at_inference = ["past_key_values"] + base_model_tp_plan = { + "layers.*.self_attn.q_proj": "colwise", + "layers.*.self_attn.k_proj": "colwise", + "layers.*.self_attn.v_proj": "colwise", + "layers.*.self_attn.o_proj": "rowwise", + "layers.*.mlp.gate_proj": "colwise", + "layers.*.mlp.up_proj": "colwise", + "layers.*.mlp.down_proj": "rowwise", + } + base_model_pp_plan = { + "embed_tokens": (["input_ids"], ["inputs_embeds"]), + "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), + "norm": (["hidden_states"], ["hidden_states"]), + } + + def __init__( + self, + vocab_size=256000, + hidden_size=2304, + intermediate_size=9216, + num_hidden_layers=26, + num_attention_heads=8, + num_key_value_heads=4, + head_dim=256, + hidden_activation="gelu_pytorch_tanh", + max_position_embeddings=8192, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=0, + eos_token_id=1, + bos_token_id=2, + tie_word_embeddings=True, + rope_theta=10000.0, + attention_bias=False, + attention_dropout=0.0, + query_pre_attn_scalar=256, + sliding_window=4096, + layer_types=None, + final_logit_softcapping=30.0, + attn_logit_softcapping=50.0, + **kwargs, + ): + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.head_dim = head_dim + self.num_key_value_heads = num_key_value_heads + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + self.hidden_activation = hidden_activation + self.query_pre_attn_scalar = query_pre_attn_scalar + self.sliding_window = sliding_window + self.final_logit_softcapping = final_logit_softcapping + self.attn_logit_softcapping = attn_logit_softcapping + self.layer_types = layer_types + + if self.layer_types is None: + self.layer_types = [ + "sliding_attention" if bool((i + 1) % 2) else "full_attention" for i in range(self.num_hidden_layers) + ] + layer_type_validation(self.layer_types, self.num_hidden_layers) + + +class Gemma2RMSNorm(GemmaRMSNorm): + pass + + +class Gemma2MLP(GemmaMLP): + def __init__(self, config): + super().__init__(config) + self.act_fn = ACT2FN[config.hidden_activation] + + +class Gemma2RotaryEmbedding(GemmaRotaryEmbedding): + pass + + +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + dropout: float = 0.0, + scaling: Optional[float] = None, + softcap: Optional[float] = None, + **kwargs, +) -> tuple[torch.Tensor, torch.Tensor]: + if scaling is None: + scaling = module.head_dim**-0.5 + + key_states = repeat_kv(key, module.num_key_value_groups) + value_states = repeat_kv(value, 
module.num_key_value_groups) + + attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling + + if softcap is not None: + attn_weights = attn_weights / softcap + attn_weights = torch.tanh(attn_weights) + attn_weights = attn_weights * softcap + if attention_mask is not None: # no matter the length, we just slice it + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + attn_output = torch.matmul(attn_weights, value_states) + attn_output = attn_output.transpose(1, 2).contiguous() + return attn_output, attn_weights + + +class Gemma2Attention(GemmaAttention): + def __init__(self, config: Gemma2Config, layer_idx: int): + super().__init__(config, layer_idx) + self.attn_logit_softcapping = self.config.attn_logit_softcapping + self.attention_dropout = self.config.attention_dropout + self.is_causal = True + self.scaling = config.query_pre_attn_scalar**-0.5 + self.sliding_window = config.sliding_window if config.layer_types[layer_idx] == "sliding_attention" else None + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + attention_mask: Optional[torch.Tensor], + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[FlashAttentionKwargs], + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, self.head_dim) + + query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2) + key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2) + value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_values is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=self.attention_dropout if self.training else 0.0, + scaling=self.scaling, + sliding_window=self.sliding_window, + softcap=self.attn_logit_softcapping, + **kwargs, + ) + + attn_output = attn_output.reshape(*input_shape, -1).contiguous() + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights + + +class Gemma2DecoderLayer(GradientCheckpointingLayer): + def __init__(self, config: Gemma2Config, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + self.config = config + self.attention_type = config.layer_types[layer_idx] + self.self_attn = Gemma2Attention(config=config, layer_idx=layer_idx) + self.mlp = Gemma2MLP(config) + self.input_layernorm = Gemma2RMSNorm(config.hidden_size, 
eps=config.rms_norm_eps) + self.post_attention_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.pre_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, + ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights = self.self_attn( + hidden_states=hidden_states, + position_embeddings=position_embeddings, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.pre_feedforward_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = self.post_feedforward_layernorm(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + return outputs + + +class Gemma2PreTrainedModel(GemmaPreTrainedModel): + pass + + +class Gemma2Model(GemmaModel): + def __init__(self, config: Gemma2Config): + super().__init__(config) + self.layers = nn.ModuleList( + [Gemma2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> BaseModelOutputWithPast: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if self.gradient_checkpointing and self.training and use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`." 
+ ) + use_cache = False + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if use_cache and past_key_values is None and not self.training: + past_key_values = DynamicCache(config=self.config) + + if cache_position is None: + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + cache_position = torch.arange( + past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device + ) + + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + # It may already have been prepared by e.g. `generate` + if not isinstance(causal_mask_mapping := attention_mask, dict): + # Prepare mask arguments + mask_kwargs = { + "config": self.config, + "input_embeds": inputs_embeds, + "attention_mask": attention_mask, + "cache_position": cache_position, + "past_key_values": past_key_values, + "position_ids": position_ids, + } + # Create the masks + causal_mask_mapping = { + "full_attention": create_causal_mask(**mask_kwargs), + "sliding_attention": create_sliding_window_causal_mask(**mask_kwargs), + } + + # embed positions + hidden_states = inputs_embeds + + # create position embeddings to be shared across the decoder layers + position_embeddings = self.rotary_emb(hidden_states, position_ids) + + # normalized + # Gemma2 downcasts the below to float16, causing sqrt(3072)=55.4256 to become 55.5 + # See https://github.com/huggingface/transformers/pull/29402 + normalizer = torch.tensor(self.config.hidden_size**0.5, dtype=hidden_states.dtype) + hidden_states = hidden_states * normalizer + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + + for decoder_layer in self.layers[: self.config.num_hidden_layers]: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + layer_outputs = decoder_layer( + hidden_states, + position_embeddings=position_embeddings, + attention_mask=causal_mask_mapping[decoder_layer.attention_type], + position_ids=position_ids, + past_key_values=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + if output_hidden_states: + all_hidden_states += (hidden_states,) + + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=past_key_values, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +class Gemma2ForCausalLM(GemmaForCausalLM): + def __init__(self, config): + super().__init__(config) + self.model = Gemma2Model(config) + self.post_init() + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **kwargs, + ) -> CausalLMOutputWithPast: + r""" + Example: + + ```python + >>> from transformers import AutoTokenizer, Gemma2ForCausalLM + + >>> model = Gemma2ForCausalLM.from_pretrained("google/gemma-2-9b") + >>> tokenizer = 
AutoTokenizer.from_pretrained("google/gemma-2-9b") + + >>> prompt = "What is your favorite condiment?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "What is your favorite condiment?" + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs: BaseModelOutputWithPast = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs.last_hidden_state + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) + if self.config.final_logit_softcapping is not None: + logits = logits / self.config.final_logit_softcapping + logits = torch.tanh(logits) + logits = logits * self.config.final_logit_softcapping + + loss = None + if labels is not None: + loss = self.loss_function(logits, labels, self.vocab_size, **kwargs) + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class Gemma2ForSequenceClassification(GemmaForSequenceClassification): + pass + + +class Gemma2ForTokenClassification(GemmaForTokenClassification): + pass + + +__all__ = [ + "Gemma2Config", + "Gemma2ForCausalLM", + "Gemma2Model", + "Gemma2PreTrainedModel", + "Gemma2ForSequenceClassification", + "Gemma2ForTokenClassification", +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/gemma3n/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/gemma3n/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..229e91827036d0830593ea9294e232cffefbac7b --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/gemma3n/__init__.py @@ -0,0 +1,29 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
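
When `layer_types` is not passed, the Gemma2 config code above derives an alternating sliding-window / full-attention pattern, and each attention layer keeps a window only when its type is `"sliding_attention"`. A small sketch reproducing just that default outside the config class (variable names are illustrative):

```python
# Default Gemma2 attention pattern: the 1st, 3rd, 5th, ... layers use sliding-window
# attention and the remaining layers use full attention, matching the config above.
num_hidden_layers = 26
sliding_window = 4096

layer_types = [
    "sliding_attention" if (i + 1) % 2 else "full_attention"
    for i in range(num_hidden_layers)
]

# Per-layer window as used by Gemma2Attention: the sliding size for
# "sliding_attention" layers, None (unbounded) for "full_attention" layers.
windows = [sliding_window if t == "sliding_attention" else None for t in layer_types]
print(layer_types[:4])  # ['sliding_attention', 'full_attention', 'sliding_attention', 'full_attention']
```
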
+from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_gemma3n import * + from .feature_extraction_gemma3n import * + from .modeling_gemma3n import * + from .processing_gemma3n import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/gemma3n/configuration_gemma3n.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/gemma3n/configuration_gemma3n.py new file mode 100644 index 0000000000000000000000000000000000000000..47b5b47d363034d4daf39065f2c778cb453799be --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/gemma3n/configuration_gemma3n.py @@ -0,0 +1,682 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/gemma3n/modular_gemma3n.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_gemma3n.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2025 Google Inc. HuggingFace Inc. team. All rights reserved. +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from collections.abc import Sequence +from typing import Any, Optional, Union + +from ...configuration_utils import PretrainedConfig, layer_type_validation +from ...modeling_rope_utils import rope_config_validation +from ...utils import is_timm_available, logging, requires_backends + + +if is_timm_available(): + from timm.data import ImageNetInfo, infer_imagenet_subset + + +logger = logging.get_logger(__name__) + + +class Gemma3nTextConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Gemma3nTextModel`]. It is used to instantiate an + Gemma3nTextModel model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the Gemma 3n E4B, e.g. + [google/gemma-3n-E4B](https://huggingface.co/google/gemma-3n-E4B). + + Configuration objects that inherit from [`Gemma3nTextConfig`] and can be used to control the model outputs. Read + the documentation from [`Gemma3nTextConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 262400): + Vocabulary size of the Gemma3nText model. Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling [`Gemma3nTextModel`] + vocab_size_per_layer_input (`int`, *optional*, defaults to 262144): + Vocabulary size of the per-layer text embeddings that augment the standard embeddings. 
+ hidden_size (`int`, *optional*, defaults to 2048): + Dimension of the hidden representations. + hidden_size_per_layer_input (`int`, *optional*, defaults to 256): + Dimension of the hidden representations for per-layer emebeddings. + intermediate_size (`int` or `Sequence[int]`, *optional*, defaults to 16384): + Dimension of the MLP representations. MatFormer configurations may wish to provide a sequence of integers + to account for variable intermediate_size values across layers. In such cases, + `len(intermediate_size) == num_hidden_layers`. + num_hidden_layers (`int`, *optional*, defaults to 35): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer decoder. + num_key_value_heads (`int`, *optional*, defaults to 2): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout this + [paper](https://huggingface.co/papers/2305.13245). If not specified, will default to `num_attention_heads`. + head_dim (`int`, *optional*, defaults to 256): + The attention head dimension. + hidden_activation (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`): + The non-linear activation function (function or string) in the decoder. Will default to + `"gelu_pytorch_tanh"` if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` + activation function. + max_position_embeddings (`int`, *optional*, defaults to 32768): + The maximum sequence length that this model might ever be used with. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + pad_token_id (`int`, *optional*, defaults to 0): + Padding token id. + eos_token_id (`int`, *optional*, defaults to 1): + End of stream token id. + bos_token_id (`int`, *optional*, defaults to 2): + Beginning of stream token id. + rope_theta (`float`, *optional*, defaults to 1000000.0): + The base period of the RoPE embeddings. + rope_scaling (`Dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE embeddings used in global attention. + NOTE: if you apply new rope type and you expect the model to work on longer `max_position_embeddings`, we + recommend you to update this value accordingly. + Expected contents: + `rope_type` (`str`): + The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', + 'llama3'], with 'default' being the original RoPE implementation. + `factor` (`float`, *optional*): + Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. 
In + most scaling types, a `factor` of x will enable the model to handle sequences of length x * + original maximum pre-trained length. + `original_max_position_embeddings` (`int`, *optional*): + Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during + pretraining. + `attention_factor` (`float`, *optional*): + Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention + computation. If unspecified, it defaults to value recommended by the implementation, using the + `factor` field to infer the suggested value. + `beta_fast` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear + ramp function. If unspecified, it defaults to 32. + `beta_slow` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear + ramp function. If unspecified, it defaults to 1. + `short_factor` (`List[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to short contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `long_factor` (`List[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to long contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `low_freq_factor` (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE + `high_freq_factor` (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + rope_local_base_freq (float, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings for local attention. + attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): + Whether to use a bias in the query, key, value and output projection layers during self-attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + sliding_window (`int`, *optional*, defaults to 512): + This is the size of the sliding window used by local attention layers. + layer_types (`Optional`, *optional*): + A sequence of strings defining the attention type for that layer as either "sliding_attention" or + "full_attention". If not provided, `layer_types` will de inferred from `num_hidden_layers` using a pattern + of four "sliding_attention" layers followed one "full_attention". The last layer in the model should always + be a "full_attention" layer. + final_logit_softcapping (`float`, *optional*, defaults to 30.0): + Scaling factor when applying tanh softcapping on the logits. + altup_active_idx (`int`, *optional*, defaults to 0): + The index of the prediction from which AltUp will compute additional predictions or correct + altup_coef_clip (`float`, *optional*, defaults to 120.0): + The maximum amplitude of an AltUp prediction or correction coefficient weight. + altup_correct_scale (`bool`, *optional*, defaults to `True`): + If True, apply the `AltUp.correct_output_scale` to the corrected prediction at `altup_active_idx`. + altup_num_inputs (`int`, *optional*, defaults to 4): + The number of predictions that AltUp should be make given the input sequence. 
+ num_kv_shared_layers (`int`, *optional*, defaults to 15): + The number of layer that share KV cache values. During the forward pass, the last `num_kv_shared_layers` + layers in the model "share" the KV values in that each local and global layer in this range uses the KV + cache values computed for the last local or global layer, respectively, before entering this range. The + value should be a multiple of the attention pattern size (see `layer_types` parameter). + laurel_rank (int, *optional*, defaults to 64): + The intermediate size for the linear projections in the Learned Augmented Residual Layer. + activation_sparsity_pattern (Sequence[float], *optional*): + The sparsity factor used to extract the top-k activations for a given layer. The provided Sequence must + explicitly provide a sparsity value for each layer in the model. By default, the first 10 layers are + sparse with a sparsity factor of 0.95 and the rest are dense. + + ```python + >>> from transformers import Gemma3nTextModel, Gemma3nTextConfig + + >>> # Initializing a Gemma3nText gemma3n_text-E4B style configuration + >>> configuration = Gemma3nTextConfig() + + >>> # Initializing a model from the gemma3n_text-E4B style configuration + >>> model = Gemma3nTextModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ``` + """ + + model_type = "gemma3n_text" + keys_to_ignore_at_inference = ["past_key_values"] + base_model_tp_plan = { + "layers.*.self_attn.q_proj": "colwise", + "layers.*.self_attn.k_proj": "colwise", + "layers.*.self_attn.v_proj": "colwise", + "layers.*.self_attn.o_proj": "rowwise", + "layers.*.mlp.gate_proj": "colwise", + "layers.*.mlp.up_proj": "colwise", + "layers.*.mlp.down_proj": "rowwise", + } + base_model_pp_plan = { + "embed_tokens": (["input_ids"], ["inputs_embeds"]), + "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), + "norm": (["hidden_states"], ["hidden_states"]), + } + + def __init__( + self, + vocab_size: int = 262_400, + vocab_size_per_layer_input: int = 262_144, + hidden_size: int = 2048, + hidden_size_per_layer_input: int = 256, + intermediate_size: Union[int, Sequence[int]] = 16_384, + num_hidden_layers: int = 35, + num_attention_heads: int = 8, + num_key_value_heads: int = 2, + head_dim: int = 256, + hidden_activation: str = "gelu_pytorch_tanh", + max_position_embeddings: int = 32_768, + initializer_range: float = 0.02, + rms_norm_eps: float = 1e-6, + use_cache: bool = True, + pad_token_id: int = 0, + eos_token_id: int = 1, + bos_token_id: int = 2, + rope_theta: float = 1_000_000.0, + rope_scaling: Optional[dict[str, Any]] = None, + rope_local_base_freq: float = 10_000.0, + attention_bias: bool = False, + attention_dropout: float = 0.0, + sliding_window: int = 512, + layer_types: Optional[Sequence[str]] = None, + final_logit_softcapping: float = 30.0, + altup_active_idx: int = 0, + altup_coef_clip: float = 120.0, + altup_correct_scale: bool = True, + altup_num_inputs: int = 4, + num_kv_shared_layers: int = 15, + laurel_rank: int = 64, + activation_sparsity_pattern: Optional[Union[float, Sequence[float]]] = None, + **kwargs, + ): + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + **kwargs, + ) + + if isinstance(intermediate_size, Sequence) and (intsize_len := len(intermediate_size)) != num_hidden_layers: + raise ValueError( + "intermediate_size must have an explicit intermediate size for every layer or one for all layers. 
" + f"Expected {num_hidden_layers} values but got {intsize_len}." + ) + elif not isinstance(intermediate_size, Sequence): + intermediate_size = [intermediate_size] * num_hidden_layers + + self.vocab_size = vocab_size + self.vocab_size_per_layer_input = vocab_size_per_layer_input + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.head_dim = head_dim + self.num_key_value_heads = num_key_value_heads + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + self.hidden_activation = hidden_activation + self.sliding_window = sliding_window + self.final_logit_softcapping = final_logit_softcapping + self.layer_types = layer_types + + self.rope_local_base_freq = rope_local_base_freq + self.rope_scaling = rope_scaling + rope_config_validation(self) + + if layer_types is None: + self.layer_types = [ + "full_attention" if (i + 1) % 5 == 0 else "sliding_attention" for i in range(self.num_hidden_layers) + ] + else: + self.layer_types = layer_types + + layer_type_validation(self.layer_types, self.num_hidden_layers) + + self.hidden_size_per_layer_input = hidden_size_per_layer_input + self.num_kv_shared_layers = num_kv_shared_layers + + self.altup_active_idx = altup_active_idx + self.altup_coef_clip = altup_coef_clip + self.altup_correct_scale = altup_correct_scale + self.altup_num_inputs = altup_num_inputs + + self.laurel_rank = laurel_rank + + if activation_sparsity_pattern is None: + num_sparse_layers = 10 if num_hidden_layers > 10 else 0 + activation_sparsity_pattern = [0.95] * num_sparse_layers + [0.0] * (num_hidden_layers - num_sparse_layers) + + if (len_asp := len(activation_sparsity_pattern)) != num_hidden_layers: + raise ValueError( + "activation_sparsity_pattern must have an explicit activation sparsity value for every layer." + f"Expected {num_hidden_layers} values but got {len_asp}." + ) + self.activation_sparsity_pattern = activation_sparsity_pattern + + +class Gemma3nAudioConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Gemma3nAudioEncoder`]. It is used to instantiate + an `Gemma3nAudioEncoder` model according to the specified arguments, defining the model architecture. Instantiating + a configuration with the defaults will yield a similar configuration to that of the Gemma 3n E4B, e.g., + [google/gemma-3n-E4B](https://huggingface.co/google/gemma-3n-E4B). + + Configuration objects that inherit from [`Gemma3nAudioConfig`] and can be used to control the model outputs. Read + the documentation from [`Gemma3nAudioConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 128): + Vocabulary size of the additional hard-token embeddings for audio model. These augment the embeddings + included in the `Gemma3nTextModel` to provide, e.g., the end of audio and audio soft token placeholder + tokens when converting `input_ids` to embeddings in the `Gemma3nForConditionalGeneration` model. + vocab_offset (`int`, *optional*, defaults to 262272): + Offset between the tokenizer vocab index for the token ids embedded by `Gemma3nMultimodalEmbedder` and the + 0-indexed `Gemma3nMultimodalEmbedder.embedding` table. 
+ input_feat_size (`int`, *optional*, defaults to 128): + The number of channels in each mel-spectrogram frame. + hidden_size (`int`, *optional*, defaults to 1536): + Dimension of the hidden representations. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. + gradient_clipping (`float`, *optional*, defaults to 10000000000.0): + Clipping value used to stabilize extremely large gradient values. + conf_attention_chunk_size (`int`, *optional*, defaults to 12): + The sub-sequence size for local attention processing inside the Conformer ("conf") section of the + Universal Speech Model. + conf_attention_context_left (`int`, *optional*, defaults to 13): + The left context size of the local attention inside the Conformer ("conf") section of the + Universal Speech Model. + conf_attention_context_right (`int`, *optional*, defaults to 0): + The right context size of the local attention inside the Conformer ("conf") section of the + Universal Speech Model. + conf_attention_logit_cap (`float`, *optional*, defaults to 50.0): + Logit cap applied during local attention inside the Conformer ("conf") section of the + Universal Speech Model. + conf_num_attention_heads (`int`, *optional*, defaults to 8): + The number of attention heads in local attention inside the Conformer ("conf") section of the + Universal Speech Model. + conf_num_hidden_layers (`int`, *optional*, defaults to 12): + The number of layers that use local attention inside the Conformer ("conf") section of the + Universal Speech Model. + conf_conv_kernel_size (`int`, *optional*, defaults to 5): + Convolution kernel size for the conformer block inside the Conformer ("conf") section of the + Universal Speech Model. + conf_reduction_factor (`int`, *optional*, defaults to 4): + Reduction factor used in the conformer block inside the Conformer ("conf") section of the + Universal Speech Model. + conf_residual_weight (`float`, *optional*, defaults to 0.5): + Residual connection weight inside the Conformer ("conf") section of the + Universal Speech Model. + sscp_conv_channel_size (`tuple(int, int)`, *optional*, defaults to `(128, 32)`): + The channel sizes for the first and second convolutional layers in the Sub-sample Convolution Projection + ("sscp") section of the Universal Speech Model. + sscp_conv_group_norm_eps (`float`, *optional*, defaults to 0.001): + Epsilon used in group normalization in the subsample convolution projection in the Sub-sample Convolution + Projection ("sscp") section of the Universal Speech Model. + sscp_conv_kernel_size (`tuple(tuple(int, int), tuple(int, int))`, *optional*, defaults to `((3, 3), (3, 3))`): + Kernel sizes of the two convolutional layers in the subsample convolution projection in the Sub-sample + Convolution Projection ("sscp") section of the Universal Speech Model. The kernel sizes are specified as a + tuple of height and width for each layer, where the height corresponds to the time dimension and the width + corresponds to the frequency dimension. + sscp_conv_stride_size (`tuple(tuple(int, int), tuple(int, int))`, *optional*, defaults to `((2, 2), (2, 2))`): + Stride sizes of the two convolutional layers in the subsample convolution projection in the Sub-sample + Convolution Projection ("sscp") section of the Universal Speech Model. The stride sizes are specified as a + tuple of height and width for each layer, where the height corresponds to the time dimension and the width + corresponds to the frequency dimension. 
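+        Note: with the default stride of (2, 2) in each of the two SSCP convolutions, the time and frequency axes
+        of the mel-spectrogram input are each reduced by a combined factor of 4 in this section.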
+ + Example: + + ```python + >>> from transformers import Gemma3nAudioConfig, Gemma3nAudioEncoder + + >>> # Initializing a Gemma3nAudioEncoder gemma3n_audio-E4B-style configuration + >>> configuration = Gemma3nAudioConfig() + + >>> # Initializing a model from the gemma3n_audio-E4B style configuration + >>> model = Gemma3nAudioEncoder(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ``` + """ + + model_type = "gemma3n_audio" + + def __init__( + self, + vocab_size: int = 128, + vocab_offset: int = 262_144 + 128, # text vocab size + vision vocab size + input_feat_size: int = 128, + hidden_size: int = 1536, + rms_norm_eps: float = 1e-6, + gradient_clipping: float = 10_000_000_000.0, + conf_attention_chunk_size: int = 12, + conf_attention_context_left: int = 13, + conf_attention_context_right: int = 0, + conf_attention_logit_cap: float = 50.0, + conf_num_attention_heads: int = 8, + conf_num_hidden_layers: int = 12, + conf_conv_kernel_size: int = 5, + conf_reduction_factor: int = 4, + conf_residual_weight: float = 0.5, + sscp_conv_channel_size: tuple[int, int] = (128, 32), + sscp_conv_group_norm_eps: float = 1e-3, + sscp_conv_kernel_size: tuple[tuple[int, int], tuple[int, int]] = ( + (3, 3), + (3, 3), + ), + sscp_conv_stride_size: tuple[tuple[int, int], tuple[int, int]] = ( + (2, 2), + (2, 2), + ), + **kwargs, + ): + super().__init__(**kwargs) + self.input_feat_size = input_feat_size + self.hidden_size = hidden_size + self.rms_norm_eps = rms_norm_eps + self.vocab_size = vocab_size + self.vocab_offset = vocab_offset + self.gradient_clipping = gradient_clipping + self.conf_attention_chunk_size = conf_attention_chunk_size + self.conf_attention_context_left = conf_attention_context_left + self.conf_attention_context_right = conf_attention_context_right + self.conf_attention_logit_cap = conf_attention_logit_cap + self.conf_num_attention_heads = conf_num_attention_heads + self.conf_num_hidden_layers = conf_num_hidden_layers + self.conf_conv_kernel_size = conf_conv_kernel_size + self.conf_reduction_factor = conf_reduction_factor + self.conf_residual_weight = conf_residual_weight + self.sscp_conv_channel_size = sscp_conv_channel_size + self.sscp_conv_group_norm_eps = sscp_conv_group_norm_eps + self.sscp_conv_kernel_size = sscp_conv_kernel_size + self.sscp_conv_stride_size = sscp_conv_stride_size + + +class Gemma3nVisionConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration for a timm backbone [`TimmWrapper`]. It is used to + instantiate an timm model model according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar configuration to that of the Gemma 3n E4B + vision tower, e.g. [google/gemma-3n-E4B](https://huggingface.co/google/gemma-3n-E4B). + + Configuration objects inherit from [`Gemma3nVisionConfig`] and can be used to control the model outputs. Read the + documentation from [`Gemma3nVisionConfig`] for more information. + + Config loads imagenet label descriptions and stores them in `id2label` attribute, `label2id` attribute for default + imagenet models is set to `None` due to occlusions in the label descriptions. + + Args: + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + do_pooling (`bool`, *optional*, defaults to `False`): + Whether to do pooling for the last_hidden_state in `TimmWrapper` or not. 
+ architecture (`str`, *optional*, defaults to `"mobilenetv5_300m_enc"`): + Determines vision architecture for TimmWrapper. + hidden_size (`int`, *optional*, defaults to 2048): + Dimension of the hidden representations. + vocab_size (`int`, *optional*, defaults to 128): + Vocabulary size of the additional hard-token embeddings for vision model. + vocab_offset (`int`, *optional*, defaults to 262144): + Offset between the tokenizer vocab index for the token ids embedded by `Gemma3nMultimodalEmbedder` and the + 0-indexed `Gemma3nMultimodalEmbedder.embedding` table. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. + + Example: + ```python + >>> from transformers import Gemma3nVisionConfig, TimmWrapper + + >>> # Initializing a TimmWrapper gemma3n_vision-E4B-style configuration + >>> configuration = Gemma3nVisionConfig() + + >>> # Initializing a gemma3n_vision-E4B-style TimmWrapper from the configuration + >>> model = TimmWrapper(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ``` + """ + + model_type = "gemma3n_vision" + + def __init__( + self, + initializer_range: float = 0.02, + do_pooling: bool = False, + architecture: str = "mobilenetv5_300m_enc", + hidden_size: int = 2048, + vocab_size: int = 128, + vocab_offset: int = 262_144, + rms_norm_eps: float = 1e-06, + model_args: Optional[dict] = None, + **kwargs, + ): + super().__init__(**kwargs) + self.architecture = architecture + self.initializer_range = initializer_range + self.do_pooling = do_pooling + self.model_args = model_args # named "model_args" for BC with timm + self.hidden_size = hidden_size + self.vocab_size = vocab_size + self.vocab_offset = vocab_offset + self.rms_norm_eps = rms_norm_eps + + @classmethod + def from_dict(cls, config_dict: dict[str, Any], **kwargs): + label_names = config_dict.get("label_names") + is_custom_model = "num_labels" in kwargs or "id2label" in kwargs + + # if no labels added to config, use imagenet labeller in timm + if label_names is None and not is_custom_model: + requires_backends(cls, ["timm"]) + imagenet_subset = infer_imagenet_subset(config_dict) + if imagenet_subset: + dataset_info = ImageNetInfo(imagenet_subset) + synsets = dataset_info.label_names() + label_descriptions = dataset_info.label_descriptions(as_dict=True) + label_names = [label_descriptions[synset] for synset in synsets] + + if label_names is not None and not is_custom_model: + kwargs["id2label"] = dict(enumerate(label_names)) + + # if all label names are unique, create label2id mapping as well + if len(set(label_names)) == len(label_names): + kwargs["label2id"] = {name: i for i, name in enumerate(label_names)} + else: + kwargs["label2id"] = None + + # timm config stores the `num_classes` attribute in both the root of config and in the "pretrained_cfg" dict. 
+ # We are removing these attributes in order to have the native `transformers` num_labels attribute in config + # and to avoid duplicate attributes + num_labels_in_kwargs = kwargs.pop("num_labels", None) + num_labels_in_dict = config_dict.pop("num_classes", None) + + # passed num_labels has priority over num_classes in config_dict + kwargs["num_labels"] = num_labels_in_kwargs or num_labels_in_dict + + # pop num_classes from "pretrained_cfg", + # it is not necessary to have it, only root one is used in timm + if "pretrained_cfg" in config_dict and "num_classes" in config_dict["pretrained_cfg"]: + config_dict["pretrained_cfg"].pop("num_classes", None) + + return super().from_dict(config_dict, **kwargs) + + def to_dict(self) -> dict[str, Any]: + output = super().to_dict() + output.setdefault("num_classes", self.num_labels) + output.setdefault("label_names", list(self.id2label.values())) + output.pop("id2label", None) + output.pop("label2id", None) + return output + + +class Gemma3nConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Gemma3nForConditionalGeneration`]. It is used to + instantiate a Gemma3nForConditionalGeneration according to the specified arguments, defining the model + architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of + Gemma3n-E4B. + + e.g. [google/gemma-3n-E4B](https://huggingface.co/google/gemma-3n-E4B) + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + text_config (`Union[Gemma3nTextConfig, dict]`, *optional*): + The config object of the text backbone. + vision_config (`Union[AutoConfig, dict]`, *optional*): + Custom vision config or dict. + audio_config (`Union[AutoConfig, dict]`, *optional*): + Custom audio config or dict. + audio_soft_tokens_per_image (`int`, *optional*, defaults to 188): + The number of soft tokens per audio clip. + vision_soft_tokens_per_image (`int`, *optional*, defaults to 256): + The number of soft tokens per image. + boi_token_id (`int`, *optional*, defaults to 255999): + The begin-of-image token index to wrap the image prompt. + eoi_token_id (`int`, *optional*, defaults to 262144): + The end-of-image token index to wrap the image prompt. + image_token_id (`int`, *optional*, defaults to 262145): + The image token index to encode the image prompt. + boa_token_id (`int`, *optional*, defaults to 256000): + The begin-of-audio token index to wrap the audio prompt. + eoa_token_id (`int`, *optional*, defaults to 262272): + The end-of-audio token index to wrap the audio prompt. + audio_token_id (`int`, *optional*, defaults to 262273): + The audio token index to encode the audio prompt. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 
+ + + Example: + + ```python + >>> from transformers import Gemma3nForConditionalGeneration, Gemma3nConfig, Gemma3nTextConfig + + >>> # Initializing a MobileNet vision config, which is loaded from TIMM + >>> vision_config = Gemma3nVisionConfig() + + >>> # Initializing a Gemma3n Audio config + >>> audio_config = Gemma3nAudioConfig() + + >>> # Initializing a Gemma3n Text config + >>> text_config = Gemma3nTextConfig() + + >>> # Initializing a Gemma3n gemma-3-4b style configuration + >>> configuration = Gemma3nConfig(text_config, vision_config, audio_config) + + >>> # Initializing a model from the gemma-3-4b style configuration + >>> model = Gemma3nTextConfig(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "gemma3n" + sub_configs = { + "text_config": Gemma3nTextConfig, + "vision_config": Gemma3nVisionConfig, + "audio_config": Gemma3nAudioConfig, + } + + def __init__( + self, + text_config: Optional[Union[Gemma3nTextConfig, dict[str, Any]]] = None, + vision_config: Optional[Union[Gemma3nVisionConfig, dict[str, Any]]] = None, + audio_config: Optional[Union[Gemma3nAudioConfig, dict[str, Any]]] = None, + audio_soft_tokens_per_image: int = 188, + vision_soft_tokens_per_image: int = 256, + boi_token_id: int = 255_999, + eoi_token_id: int = 262_144, + image_token_id: int = 262_145, + boa_token_id: int = 256_000, + eoa_token_id: int = 262_272, + audio_token_id: int = 262_273, + initializer_range: float = 0.02, + **kwargs, + ): + super().__init__(**kwargs) + + if isinstance(text_config, dict): + text_config = Gemma3nTextConfig(**text_config) + elif text_config is None: + text_config = Gemma3nTextConfig() + logger.info("text_config is None. Using default Gemma3nTextConfig.") + + if isinstance(vision_config, dict): + vision_config = Gemma3nVisionConfig(**vision_config) + elif vision_config is None: + vision_config = Gemma3nVisionConfig() + logger.info("vision_config is None. Using default Gemma3nVisionConfig.") + + if isinstance(audio_config, dict): + audio_config = Gemma3nAudioConfig(**audio_config) + elif audio_config is None: + audio_config = Gemma3nAudioConfig() + logger.info("audio_config is None. Using default Gemma3nAudioConfig.") + + self.text_config = text_config + self.vision_config = vision_config + self.audio_config = audio_config + + self.audio_soft_tokens_per_image = audio_soft_tokens_per_image + self.vision_soft_tokens_per_image = vision_soft_tokens_per_image + self.boi_token_id = boi_token_id + self.eoi_token_id = eoi_token_id + self.image_token_id = image_token_id + self.boa_token_id = boa_token_id + self.eoa_token_id = eoa_token_id + self.audio_token_id = audio_token_id + self.initializer_range = initializer_range + + +__all__ = ["Gemma3nAudioConfig", "Gemma3nConfig", "Gemma3nTextConfig", "Gemma3nVisionConfig"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/gemma3n/feature_extraction_gemma3n.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/gemma3n/feature_extraction_gemma3n.py new file mode 100644 index 0000000000000000000000000000000000000000..62e3fb3878f73fc7c794f544ddd94232f9ae1b4d --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/gemma3n/feature_extraction_gemma3n.py @@ -0,0 +1,338 @@ +# coding=utf-8 +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from collections.abc import Sequence +from typing import Optional, Union + +import numpy as np + +from ...feature_extraction_sequence_utils import SequenceFeatureExtractor +from ...feature_extraction_utils import BatchFeature +from ...utils import PaddingStrategy, TensorType, logging + + +logger = logging.get_logger(__name__) + + +def create_fb_matrix( + n_freqs: int, + f_min: float, + f_max: float, + n_mels: int, + sample_rate: int, + fft_length: int, + norm: Optional[str] = None, +) -> np.ndarray: + r"""Create a frequency bin conversion matrix (NumPy version). + + Args: + n_freqs (int): Number of frequencies to highlight/apply + f_min (float): Minimum frequency (Hz) + f_max (float): Maximum frequency (Hz) + n_mels (int): Number of mel filterbanks + sample_rate (int): Sample rate of the audio waveform + fft_length (int): FFT length + norm (Optional[str]): If 'slaney', divide the triangular mel weights by + the width of the mel band (area normalization). (Default: ``None``) + + Returns: + np.ndarray: Triangular filter banks (fb matrix) of size (``n_freqs``, + ``n_mels``) + meaning number of frequencies to highlight/apply to x the number of + filterbanks. + Each column is a filterbank so that assuming there is a matrix A of + size (..., ``n_freqs``), the applied result would be + ``A @ create_fb_matrix_numpy(A.shape[-1], ...)``. + """ + + if norm is not None and norm != "slaney": + raise ValueError("norm must be one of None or 'slaney'") + + # freq bins + all_freqs = np.arange(n_freqs, dtype=np.float32) * (sample_rate / fft_length) + + # calculate mel freq bins + # hertz to mel(f) is 2595. * math.log10(1. + (f / 700.)) + m_min = 2595.0 * math.log10(1.0 + (f_min / 700.0)) + m_max = 2595.0 * math.log10(1.0 + (f_max / 700.0)) + m_pts = np.linspace(m_min, m_max, n_mels + 2) + # mel to hertz(mel) is 700. * (10**(mel / 2595.) - 1.) 
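+    # For intuition: 2595 * log10(1 + 700/700) ≈ 781 mel at 700 Hz, and 1000 Hz lands at roughly 1000 mel,
+    # the usual anchoring of the mel scale. The next line inverts this mapping to convert the
+    # n_mels + 2 equally spaced mel points back to Hz.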
+ f_pts = 700.0 * (10 ** (m_pts / 2595.0) - 1.0) + # calculate difference between each mel point and each stft freq point in Hz + f_diff = f_pts[1:] - f_pts[:-1] # (n_mels + 1) + slopes = np.expand_dims(f_pts, 0) - np.expand_dims(all_freqs, 1) # (n_freqs, n_mels + 2) + # create overlapping triangles + zero = np.zeros(1, dtype=np.float32) + down_slopes = (-1.0 * slopes[:, :-2]) / f_diff[:-1] # (n_freqs, n_mels) + up_slopes = slopes[:, 2:] / f_diff[1:] # (n_freqs, n_mels) + fb = np.maximum(zero, np.minimum(down_slopes, up_slopes)) + + if norm is not None and norm == "slaney": + # Slaney-style mel is scaled to be approx constant energy per channel + enorm = 2.0 / (f_pts[2 : n_mels + 2] - f_pts[:n_mels]) + fb *= np.expand_dims(enorm, 0) + + return fb + + +def _unfold(array: np.ndarray, dimension: int, size: int, step: int) -> np.ndarray: + """A basic NumPy equivalent of PyTorch's unfold for 2D arrays along the last dim.""" + if array.ndim != 2: + raise ValueError("This unfold implementation currently supports 2D arrays (batch, time).") + if dimension != -1 and dimension != array.ndim - 1: + raise ValueError("This unfold implementation only supports unfolding the last dimension.") + + batch_size, original_length = array.shape + num_frames = (original_length - size) // step + 1 + + if num_frames <= 0: + return np.zeros((batch_size, 0, size), dtype=array.dtype) + + output_shape = (batch_size, num_frames, size) + output_strides = (array.strides[0], array.strides[1] * step, array.strides[1]) + + return np.lib.stride_tricks.as_strided(array, shape=output_shape, strides=output_strides) + + +class Gemma3nAudioFeatureExtractor(SequenceFeatureExtractor): + """An audio feature extractor Universal Speech Models https://huggingface.co/papers/2303.01037. + + Args: + feature_size (`int`, *optional*, defaults to 128): + The feature dimension of the extracted features. + sampling_rate (`int`, *optional*, defaults to 16000): + The sampling rate at which the audio files should be digitalized expressed in hertz (Hz). + padding_value (`float`, *optional*, defaults to 0.0): + Padding value used to pad the audio. Should correspond to silences. + return_attention_mask (`bool`, *optional*, defaults to `True`): + Whether to return the attention mask for the generated MEL spectrograms. + frame_length_ms (`float`, *optional*, defaults to 32.0): + The length of a frame in milliseconds. + hop_length_ms (`float`, *optional*, defaults to 10.0): + Length of the overlapping windows for the STFT used to obtain the Mel Frequency coefficients. + min_frequency (`float`, *optional*, defaults to 125.0): + The minimum frequency (in Hz) for the Mel filterbank. + max_frequency (`float`, *optional*, defaults to 7600.0): + The maximum frequency (in Hz) for the Mel filterbank. + preemphasis (`float`, *optional*, defaults to 0.97): + The preemphasis coefficient. + preemphasis_htk_flavor (`bool`, *optional*, defaults to `True`): + Whether to use HTK-style preemphasis. + fft_overdrive (`bool`, *optional*, defaults to `True`): + Whether to use FFT overdrive. + dither (`float`, *optional*, defaults to 0.0): + Adds dithering. In other words, adds a small Gaussian noise to each frame. + E.g. use 0.0001 to add dithering with a normal distribution centered + around 0.0 with standard deviation 0.0001 (assuming [-1,+1] range of raw_speech). + The value 0.0 means no dithering. + Dithering has similar effect as `spectrogram(mel_floor=...)`. 
It reduces + the high log_mel_fbank values for signals with hard-zero sections, + when VAD cutoff is present in the signal. + input_scale_factor (`float`, *optional*, defaults to 1.0): + Scaling factor applied to the input waveform. + mel_floor (`float`, *optional*, defaults to 1e-05): + Minimum value for Mel spectrograms to avoid log(0). + per_bin_mean (`Optional[Sequence[float]]`, *optional*): + Mean values for per-bin normalization. + per_bin_stddev (`Optional[Sequence[float]]`, *optional*): + Standard deviation values for per-bin normalization. + """ + + model_input_names = ["input_features", "input_features_mask"] + + def __init__( + self, + feature_size: int = 128, + sampling_rate: int = 16_000, + padding_value: float = 0.0, + return_attention_mask: bool = True, + frame_length_ms: float = 32.0, + hop_length_ms: float = 10.0, + min_frequency: float = 125.0, + max_frequency: float = 7600.0, + preemphasis: float = 0.97, + preemphasis_htk_flavor: bool = True, + fft_overdrive: bool = True, + dither: float = 0.0, + input_scale_factor: float = 1.0, + mel_floor: float = 1e-5, + per_bin_mean: Optional[Sequence[float]] = None, + per_bin_stddev: Optional[Sequence[float]] = None, + **kwargs, + ): + super().__init__( + feature_size=feature_size, + sampling_rate=sampling_rate, + padding_value=padding_value, + return_attention_mask=return_attention_mask, + **kwargs, + ) + + self.min_frequency = min_frequency + self.max_frequency = max_frequency + self.preemphasis = preemphasis + self.preemphasis_htk_flavor = preemphasis_htk_flavor + self.fft_overdrive = fft_overdrive + self.dither = dither + self.input_scale_factor = input_scale_factor + self.frame_length = int(round(sampling_rate * frame_length_ms / 1000.0)) + self.hop_length = int(round(sampling_rate * hop_length_ms / 1000.0)) + self.mel_floor = np.array(mel_floor, dtype=np.float64) + + fft_length = 2 ** math.ceil(math.log2(self.frame_length)) + if self.fft_overdrive: + fft_length *= 2 + self.fft_length = fft_length + + hann_arange = np.arange(self.frame_length, dtype=np.float32) + window = 0.5 * (1 - np.cos(2 * np.pi * hann_arange / self.frame_length)) + self.window = window.astype(np.float32) + + self.mel_filters = create_fb_matrix( + n_freqs=self.fft_length // 2 + 1, + f_min=min_frequency, + f_max=max_frequency, + n_mels=feature_size, + sample_rate=self.sampling_rate, + norm=None, + fft_length=fft_length, + ) + + if per_bin_mean is not None: + self.per_bin_mean = np.array(per_bin_mean).reshape(1, 1, feature_size) + else: + self.per_bin_mean = None + + if per_bin_stddev is not None: + self.per_bin_stddev = np.array(per_bin_stddev).reshape(1, 1, feature_size) + else: + self.per_bin_stddev = None + + def _extract_spectrogram(self, waveform: np.ndarray, attention_mask: np.ndarray) -> tuple[np.ndarray, np.ndarray]: + """""" + if waveform.ndim == 1: # If single waveform, add batch dimension + waveform = np.expand_dims(waveform, axis=0) + + if self.dither > 0.0: + waveform = waveform + self.dither * np.random.randn(*waveform.shape).astype(waveform.dtype) + + if self.input_scale_factor != 1.0: + waveform = waveform * self.input_scale_factor + + frame_size_for_unfold = self.frame_length + 1 + + # NumPy equivalent of unfold for [B, NumFrames, frame_size_for_unfold] + frames_to_process = _unfold(waveform, dimension=-1, size=frame_size_for_unfold, step=self.hop_length) + + if self.preemphasis > 0.0: + if self.preemphasis_htk_flavor: + first_in_frame = frames_to_process[..., :1] * (1.0 - self.preemphasis) + rest_in_frame = frames_to_process[..., 1:-1] - 
self.preemphasis * frames_to_process[..., :-2] + frames = np.concatenate([first_in_frame, rest_in_frame], axis=-1) + else: + frames = frames_to_process[..., 1:] - self.preemphasis * frames_to_process[..., :-1] + else: + frames = frames_to_process[..., :-1] + + frames = frames * self.window # Broadcasting window + stft = np.fft.rfft(frames, n=self.fft_length, axis=-1) + + magnitude_spec = np.abs(stft) + + mel_spec = np.matmul(magnitude_spec, self.mel_filters) + log_mel_spec = np.log(np.maximum(mel_spec, self.mel_floor)) + + if self.per_bin_mean is not None: + log_mel_spec = log_mel_spec - self.per_bin_mean # Broadcasting + + if self.per_bin_stddev is not None: + log_mel_spec = log_mel_spec / self.per_bin_stddev # Broadcasting + + mel_spectrogram = log_mel_spec.squeeze(0) + mask = attention_mask[:: self.hop_length].astype(bool) + # TODO: The filtered mask is always exactly 3 elements longer than the mel_spectrogram. Why??? + return mel_spectrogram, mask[: mel_spectrogram.shape[0]] + + def __call__( + self, + raw_speech: Union[np.ndarray, list[float], list[np.ndarray], list[list[float]]], + padding: Union[bool, str, PaddingStrategy] = "longest", + max_length: Optional[int] = 480_000, + truncation: bool = True, + pad_to_multiple_of: Optional[int] = 128, + return_tensors: Optional[Union[str, TensorType]] = None, + return_attention_mask: Optional[bool] = True, + **kwargs, + ) -> BatchFeature: + """Creates a batch of MEL spectrograms from the provided raw speech. + + This implementation uses a different algorithm for windowing and preemphasis compared to the built-in + `transformers.audio_utils.spectrogram()` function that _will_ result in different outputs. Consider this + carefully when selecting an audio feature extractor, especially with pre-trained models. + + Args: + raw_speech: + The audio for which MEL spectrograms are created. + padding (`Union[bool, str, PaddingStrategy]`, *optional*, defaults to `"longest"`): + The padding strategy to use for batches of audio with different lengths. + max_length (`int`, *optional*, defaults to 480000): + If provided, defines the maximum length of the audio to allow. Audio longer than this will be + truncated if `truncation=True`. + truncation (`bool`, *optional*, defaults to `True`): + Whether or not to truncate audio above `max_length`. + pad_to_multiple_of (`int`, *optional*, defaults to 128): + When padding, pad to a multiple of this value. The default value is defined for optimal TPU support. + return_tensors (`Union[str, TensorType]`, *optional*, defaults to `None`): + The type of tensors to return (e.g., NumPy, Torch, JAX, TensorFlow). + return_attention_mask (`bool`, *optional*, defaults to `True`): + Whether to return the attention mask for the generated MEL spectrograms. 
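+        Returns:
+            [`BatchFeature`]: A [`BatchFeature`] containing `input_features` (one log-mel spectrogram of shape
+            `(num_frames, feature_size)` per input audio) and `input_features_mask` (the corresponding boolean
+            masks over the frame dimension).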
+ """ + + is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1 + is_batched_sequence = isinstance(raw_speech, Sequence) and isinstance(raw_speech[0], (np.ndarray, Sequence)) + is_batched = is_batched_numpy or is_batched_sequence + + if is_batched: + raw_speech = [np.asarray([rs]).T for rs in raw_speech] + elif not is_batched and not isinstance(raw_speech, np.ndarray): + raw_speech = np.asarray(raw_speech) + + if not is_batched: # always return a batch + raw_speech = [np.asarray([raw_speech])] + + batched_speech = self.pad( + BatchFeature({"input_features": raw_speech}), + padding=padding, + max_length=max_length, + truncation=truncation, + pad_to_multiple_of=pad_to_multiple_of, + return_attention_mask=return_attention_mask, + ) + + prepared_speech = [] + prepared_speech_mask = [] + for speech, mask in zip(batched_speech.input_features, batched_speech.attention_mask): + speech, mask = self._extract_spectrogram(speech.T, mask) + prepared_speech.append(speech.astype(np.float32)) + prepared_speech_mask.append(mask) + + return BatchFeature( + {"input_features": prepared_speech, "input_features_mask": prepared_speech_mask}, + tensor_type=return_tensors, + ) + + +__all__ = ["Gemma3nAudioFeatureExtractor"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/gemma3n/modeling_gemma3n.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/gemma3n/modeling_gemma3n.py new file mode 100644 index 0000000000000000000000000000000000000000..68595ead4371bba4087813566d5c83d5c0155c03 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/gemma3n/modeling_gemma3n.py @@ -0,0 +1,2394 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/gemma3n/modular_gemma3n.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_gemma3n.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2025 Google Inc. HuggingFace Inc. team. All rights reserved. +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import copy +import math +from collections.abc import Callable, Sequence +from dataclasses import dataclass +from typing import Optional, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache +from ...generation import GenerationMixin +from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask +from ...modeling_flash_attention_utils import FlashAttentionKwargs +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast +from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...processing_utils import Unpack +from ...utils import ModelOutput, TransformersKwargs, auto_docstring, can_return_tuple, logging +from ...utils.deprecation import deprecate_kwarg +from ..auto import AutoModel +from .configuration_gemma3n import Gemma3nAudioConfig, Gemma3nConfig, Gemma3nTextConfig, Gemma3nVisionConfig + + +logger = logging.get_logger(__name__) + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for Gemma3n outputs, with hidden states and attentions. + """ +) +class Gemma3nModelOutputWithPast(BaseModelOutputWithPast): + r""" + past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache). + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + image_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`. + image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state. + audio_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`. + audio_hidden_states of the model produced by the audio encoder and after projecting the last hidden state. + """ + + image_hidden_states: Optional[torch.FloatTensor] = None + + audio_hidden_states: Optional[torch.FloatTensor] = None + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for Gemma3n causal language model (or autoregressive) outputs. + """ +) +class Gemma3nCausalLMOutputWithPast(ModelOutput): + r""" + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache). + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + image_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`. 
+ image_hidden_states of the model produced by the vision encoder after projecting last hidden state. + audio_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`. + audio_hidden_states of the model produced by the audio encoder and after projecting the last hidden state. + """ + + loss: Optional[torch.FloatTensor] = None + logits: Optional[torch.FloatTensor] = None + past_key_values: Optional[Cache] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + image_hidden_states: Optional[torch.FloatTensor] = None + + audio_hidden_states: Optional[torch.FloatTensor] = None + + +class Gemma3nRMSNorm(nn.Module): + def __init__(self, dim: int, eps: float = 1e-6, with_scale: bool = True): + super().__init__() + self.eps = eps + self.with_scale = with_scale + + if self.with_scale: + self.weight = nn.Parameter(torch.ones(dim)) + else: + self.register_buffer("weight", torch.tensor(1.0), persistent=False) + + def _norm(self, x): + return x / torch.sqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + # Llama does x.to(float16) * w whilst Gemma2 is (x * w).to(float16) + # See https://github.com/huggingface/transformers/pull/29402 + output = self._norm(x.float()) * self.weight.float() + return output.type_as(x) + + def extra_repr(self): + return f"{tuple(self.weight.shape)}, eps={self.eps}" + + +# ==== Audio Encoder ==== + + +class Gemma3nAudioRelativePositionEmbedding(nn.Module): + def __init__(self, config: Gemma3nAudioConfig): + super().__init__() + self.config = config + + self.num_heads = self.config.conf_num_attention_heads + self.channels = self.config.hidden_size + self.head_dim = self.channels // self.num_heads + self.max_backward = max(0, self.config.conf_attention_context_left - 1) + self.max_forward = self.config.conf_attention_context_right + + self.pos_proj = nn.Linear(self.channels, self.num_heads * self.head_dim, bias=False) + + min_timescale = 1.0 + max_timescale = 1.0e4 + num_timescales = self.channels // 2 + log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / max(num_timescales - 1, 1) + inv_timescales = min_timescale * torch.exp(torch.arange(num_timescales) * -log_timescale_increment) + self.register_buffer( + "inv_timescales", + inv_timescales.float().unsqueeze(0).unsqueeze(0), + persistent=False, + ) + + def _get_timing_signal_1d_pos(self, position: torch.Tensor, dtype: torch.dtype) -> torch.Tensor: + position = position.float().unsqueeze(-1) + scaled_time = position * self.inv_timescales.to(device=position.device, dtype=torch.float32) + timing_signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=-1) + return timing_signal.type(dtype) + + def _relative_shift( + self, + term_bd_before_shift: torch.Tensor, + batch_size: int, + num_heads: int, + num_query_blocks: int, + query_block_size: int, + key_context_size: int, + max_span_plus_1: int, + ) -> torch.Tensor: + """Performs the relative shift. + + Args: + term_bd_before_shift: Tensor of shape [B, N, U, W, F_span]. batch_size + (B), num_heads (N), num_query_blocks (U), query_block_size (W), + key_context_size (C = W+L+R), max_span_plus_1 (F_span = L+R+1). + + Returns: + Tensor of shape [B, N, U, W, C]. 
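+        After the shift, the last dimension indexes absolute key positions within the block context rather than
+        relative offsets, so the result can be added directly to the content term of shape [B, N, U, W, C].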
+ """ + # term_bd_before_shift shape: [B, N, U, W, F_span] + # Target shape after shift: [B, N, U, W, C] + + # Padding amount for the last dimension (F_span) to become (C + 1) + # C = key_context_size + # F_span = max_span_plus_1 + pad_amount_last_dim = (key_context_size + 1) - max_span_plus_1 + + # PyTorch F.pad expects (pad_left, pad_right, pad_top, pad_bottom ...) + # We only pad the last dimension on the right. + padding_tuple = (0, pad_amount_last_dim) + + term_bd_padded = nn.functional.pad(term_bd_before_shift, padding_tuple) + # Shape after pad: [B, N, U, W, C+1] + + # Reshape for slicing (emulating JAX's behavior) + # [B, N, U, W * (C+1)] + term_bd_reshaped = term_bd_padded.reshape( + ( + batch_size, + num_heads, + num_query_blocks, + query_block_size * (key_context_size + 1), + ) + ) + + # Slice to effective [B, N, U, W * C] + term_bd_sliced = term_bd_reshaped[:, :, :, : query_block_size * key_context_size] + + # Reshape back to [B, N, U, W, C] + term_bd_shifted = term_bd_sliced.reshape( + ( + batch_size, + num_heads, + num_query_blocks, + query_block_size, + key_context_size, + ) + ) + return term_bd_shifted + + def forward(self, queries: torch.Tensor, keys: torch.Tensor) -> torch.Tensor: + # queries: [B, U, W, N, H] (batch, num_query_blocks, query_block_size, num_heads, head_dim) + # keys: [B, U, C, N, H] (batch, num_query_blocks, key_context_size, num_heads, head_dim) + # C = W + L + R (key_context_size) + # F_span = L + R + 1 (max_span + 1) + + batch_size, num_query_blocks, query_block_size, num_heads, head_dim = queries.shape + _, _, key_context_size, _, _ = keys.shape + + # Relative positions for sinusoidal embeddings: [L, L-1, ..., -R] + # Length is L+R+1 = self.max_span + 1 + pos_indices = torch.arange(self.max_backward, -self.max_forward - 1, -1, device=queries.device).unsqueeze( + 0 + ) # Shape [1, F_span] + + max_span_plus_1 = pos_indices.shape[1] # F_span + + sin_emb_timing_signal = self._get_timing_signal_1d_pos( + pos_indices, dtype=queries.dtype + ) # Shape [1, F_span, self.channels] + + # Project sinusoidal embeddings: [1, F_span, self.channels] -> [1, F_span, N*H] + projected_sin_emb = self.pos_proj(sin_emb_timing_signal) + # Reshape to [1, F_span, N, H] then squeeze to [F_span, N, H] + sin_emb = projected_sin_emb.reshape(1, max_span_plus_1, self.num_heads, self.head_dim).squeeze( + 0 + ) # Shape [F, N, H] + + # term_ac: Query-Key content interaction + # queries: [B, U, W, N, H] -> permute to [B, N, U, W, H] for matmul + # keys: [B, U, C, N, H] -> permute to [B, N, U, H, C] for matmul + queries_p = queries.permute(0, 3, 1, 2, 4) # [B, N, U, W, H] + keys_p_t = keys.permute(0, 3, 1, 4, 2) # [B, N, U, H, C] + term_ac = torch.matmul(queries_p, keys_p_t) # [B, N, U, W, C] + + # term_bd: Query-Position interaction + # Original einsum: term_bd_unshifed = torch.einsum('buwnh,fnh->bnuwf', queries, sin_emb) + # queries shape: [B, U, W, N, H] + # sin_emb shape: [F, N, H] + # Target output shape: [B, N, U, W, F] + + # Permute queries to [B, N, U, W, H] for easier broadcasting with sin_emb + q_permuted = queries.permute(0, 3, 1, 2, 4) + + # Permute sin_emb to [N, H, F] to prepare for matmul + # sin_emb original is [F, N, H] + s_permuted = sin_emb.permute(1, 2, 0) # Shape: [N, H, F] + + # Reshape queries for matmul: [B, N, U*W, H] + q_reshaped = q_permuted.reshape(batch_size, num_heads, num_query_blocks * query_block_size, head_dim) + + # Perform matmul: [B, N, U*W, H] @ [N, H, F] + # s_permuted ([N, H, F]) will be broadcast to [B, N, H, F] + # Result: [B, N, U*W, F] + 
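+        # (This reshaped matmul reproduces the einsum 'buwnh,fnh->bnuwf' noted above; the permutes and
+        #  reshapes only rearrange axes so a broadcasted batch matmul can be used.)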
term_bd_unshifed_matmul = torch.matmul(q_reshaped, s_permuted) + + # Reshape to target [B, N, U, W, F] + term_bd_unshifed = term_bd_unshifed_matmul.reshape( + batch_size, + num_heads, + num_query_blocks, + query_block_size, + max_span_plus_1, + ) + + # Apply relative shift to term_bd_unshifed + term_bd_shifted = self._relative_shift( + term_bd_unshifed, + batch_size, + num_heads, + num_query_blocks, + query_block_size, + key_context_size, + max_span_plus_1, + ) # Shape [B, N, U, W, C] + + return term_ac + term_bd_shifted + + +class Gemma3nAudioAttention(nn.Module): + def __init__(self, config: Gemma3nAudioConfig): + super().__init__() + self.config = config + + self.num_heads = self.config.conf_num_attention_heads + self.hidden_size = self.config.hidden_size + self.head_dim = self.hidden_size // self.num_heads + + self.chunk_size = self.config.conf_attention_chunk_size + self.max_future_horizon = self.config.conf_attention_context_right + self.max_past_horizon = max(0, self.config.conf_attention_context_left - 1) + self.attention_logits_soft_cap = self.config.conf_attention_logit_cap + self.context_size = self.chunk_size + self.max_past_horizon + self.max_future_horizon + + self.relative_position_embedding = Gemma3nAudioRelativePositionEmbedding(config) + self.per_dim_scale = nn.Parameter(torch.zeros((self.head_dim,))) + + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) + self.k_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) + self.v_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) + + q_scale = self.head_dim**-0.5 + r_softplus_0 = 1.0 / torch.nn.functional.softplus(torch.tensor(0.0)) + self.register_buffer("q_scale", (q_scale * r_softplus_0).clone().detach(), persistent=False) + + lower_causal_mask = torch.tril( + torch.ones((self.context_size, self.chunk_size), dtype=torch.bool), + diagonal=0, + ).T + upper_causal_mask = torch.tril( + torch.ones((self.chunk_size, self.context_size), dtype=torch.bool), + diagonal=self.max_past_horizon + self.max_future_horizon, + ) + local_causal_valid_mask = torch.ones((self.chunk_size, self.context_size), dtype=torch.bool) + local_causal_valid_mask = local_causal_valid_mask * lower_causal_mask * upper_causal_mask + self.register_buffer("local_causal_valid_mask", local_causal_valid_mask, persistent=False) + + self.register_buffer( + "softcap", + torch.tensor(self.attention_logits_soft_cap).float(), + persistent=False, + ) + + def _pad_dim1(self, x: torch.Tensor, pad_left: int, pad_right: int) -> torch.Tensor: + batch, _, *tail_shape = x.shape + left = x.new_zeros((batch, pad_left, *tail_shape)) + right = x.new_zeros((batch, pad_right, *tail_shape)) + x = torch.cat([left, x, right], dim=1) + return x + + def _convert_to_block(self, hidden_states: torch.Tensor) -> torch.Tensor: + """Turns a sequence to non overlapping blocks. + + Args: + hidden_states: a tensor of [batch, time, ...]. + + Returns: + A tensor of [batch, num_blocks, block_size, ...], with necessary + paddings, + where output[:, i, ...] are x[:, i*block_size:(i+1)*block_size, ...]. 
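+        For example, with the default `conf_attention_chunk_size` of 12, an input with 30 time steps is padded
+        to 36 steps and reshaped to 3 blocks of 12 steps each.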
+ """ + shape = hidden_states.shape + b, t = shape[:2] + num_blocks = (t + self.chunk_size - 1) // self.chunk_size + + if (padding_len := num_blocks * self.chunk_size - t) > 0: + hidden_states = self._pad_dim1(hidden_states, 0, padding_len) + + permute_dims = (b, num_blocks, self.chunk_size) + shape[2:] + hidden_states = hidden_states.reshape(permute_dims).contiguous() + return hidden_states + + def _extract_block_context(self, hidden_states: torch.Tensor) -> torch.Tensor: + """Extracts temporal context for every block. + + Args: + hidden_states: a tensor of [batch, time, ...]. + + Returns: + A tensor of [batch, num_blocks, context_size, ...], with necessary + paddings, + where context_size = block_size + left_context + right_context, + and output[:, i, ...] are x[:, start-left_context:end+right_context, + ...], + start = i * block_size, end = (i + 1) * block_size. + """ + pad_left = self.max_past_horizon + # The JAX equivalent padding for signal.frame with pad_mode='valid' is + # (left_context, right_context + block_size - 1) on the time dimension. + # PyTorch's _pad_dim1 applies padding symmetrically if only one value is given, + # or (pad_dim_start, pad_dim_end) if two are given. + # Our _pad_dim1(x, pad_left, pad_right) pads dim -2 (time for [B,T,N,H]) + # or dim 1 (time for [B,T]). + # The current pad_right calculation matches the JAX effective padding. + pad_right = self.max_future_horizon + self.chunk_size - 1 + hidden_states = self._pad_dim1(hidden_states, pad_left, pad_right) + + frame_len = self.context_size + frame_step = self.chunk_size + + # Directly use unfold without the subframe_factor logic + # x.unfold(dimension, size, step) + # dimension=1 (time dimension, assuming x is [B, T_padded, ...]) + # size=frame_len (context_size) + # step=frame_step (chunk_size) + x_unfolded = hidden_states.unfold(dimension=1, size=frame_len, step=frame_step) + + # If x was [B, T_padded], x_unfolded is [B, num_blocks, frame_len] + # If x was [B, T_padded, N, H], x_unfolded is [B, num_blocks, N, H, frame_len] + # We want to match JAX's typical output for such operations which might be + # [B, num_blocks, frame_len, N, H] if N, H are present. + # The relative_position_embedding expects keys as [B, U, C, N, H]. + # If x_unfolded is [B, U, N, H, C(frame_len)], we need to move C. 
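+        # With the default audio config (chunk_size=12, context_left=13, context_right=0), frame_len is
+        # max(0, 13 - 1) + 12 + 0 = 24 and frame_step is 12, so each 12-frame query block is paired with a
+        # 24-frame key/value context: 12 past frames plus the block itself.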
+ if hidden_states.ndim > 2 and x_unfolded.ndim > 3: # Check if inner dimensions (like N, H) exist + # Current shape after unfold for [B, T_pad, N, H] is [B, U, N, H, C] + # Target shape for keys in RPE: [B, U, C, N, H] + x_unfolded = torch.movedim(x_unfolded, source=-1, destination=2) + + return x_unfolded.contiguous() + + def forward(self, hidden_states: torch.Tensor, mask: torch.BoolTensor) -> torch.Tensor: + # sl.Dense uses jax.numpy.einsum("...a,abcd->...bcd") and jax.numpy.select() + qkv_shape = (*hidden_states.shape[:-1], self.num_heads, self.head_dim) + query_states = self.q_proj(hidden_states).reshape(qkv_shape).contiguous() + key_states = self.k_proj(hidden_states).reshape(qkv_shape).contiguous() + value_states = self.v_proj(hidden_states).reshape(qkv_shape).contiguous() + + per_dim_scale_sp = torch.nn.functional.softplus(self.per_dim_scale) + + broadcast_shape = (1, 1, 1, self.head_dim) + per_dim_scale_sp_broadcast = per_dim_scale_sp.view(broadcast_shape) + query_states = query_states * self.q_scale * per_dim_scale_sp_broadcast + + batch_size, q_time = query_states.shape[:2] + + query_blocks = self._convert_to_block(query_states) + key_blocks = self._extract_block_context(key_states) + value_blocks = self._extract_block_context(value_states) + num_query_blocks = query_blocks.shape[1] + + # 1. Create a mask indicating originally valid positions. + original_valid_mask = ~mask # True for valid, False for padded + + # 2. Extract blocks from this validity mask. + extracted_valid_mask_blocks = self._extract_block_context(original_valid_mask) + + # If subframe_factor was used in _extract_block_context for a [B, T] input mask, + # the shape might be [B, U, C/SF, SF]. Reshape to [B, U, C]. + # batch_size and num_query_blocks are known from query_blocks. + # self.context_size is C. + if ( + extracted_valid_mask_blocks.ndim == 4 + and extracted_valid_mask_blocks.shape[2] * extracted_valid_mask_blocks.shape[3] == self.context_size + ): + extracted_valid_mask_blocks = extracted_valid_mask_blocks.reshape( + batch_size, num_query_blocks, self.context_size + ) + # After potential reshape, ensure it's [B, U, C] if it was from a [B,T] mask. + # This assertion might be too strict if _extract_block_context handles higher-rank inputs differently, + # but for the mask case, this should hold. + if extracted_valid_mask_blocks.shape != ( + batch_size, + num_query_blocks, + self.context_size, + ): + raise ValueError( + "Shape of extracted_valid_mask_blocks" + f" {extracted_valid_mask_blocks.shape} is not ({batch_size}," + f" {num_query_blocks}, {self.context_size}) after potential reshape." + ) + + # 3. Expand dimensions for broadcasting with logits and causal mask. + # Target shape for broadcasting with logits [B,N,U,W,C] + # extracted_valid_mask_blocks to [B, 1, U, 1, C] + condition_from_input_validity = extracted_valid_mask_blocks.unsqueeze(1).unsqueeze(-2) + + # self.local_causal_valid_mask is [W, C], True where allowed by local window. + # Expand to [1, 1, 1, W, C] + condition_from_causality = self.local_causal_valid_mask.unsqueeze(0).unsqueeze(0).unsqueeze(0) + + # 4. Combine the two conditions. + # final_condition will be True where a key is *both* originally valid *and* causally accessible. 
+ # Broadcasts to [B, 1, U, W, C] + final_condition_for_where = torch.logical_and( + condition_from_input_validity, + condition_from_causality.to(condition_from_input_validity.device), # Ensure same device + ) + + # Embed queries and keys + logits = self.relative_position_embedding(query_blocks, key_blocks) + + # Apply attention logit softcap + # Ensure softcap is on the same device as logits + softcap_val = self.softcap.to(logits.device) + logits = logits / softcap_val + logits = torch.tanh(logits) + logits = logits * softcap_val + + # Apply the combined mask. + # final_condition_for_where will broadcast with logits [B,N,U,W,C] + logits = torch.where(final_condition_for_where, logits, torch.finfo(logits.dtype).min) + probabilities = torch.nn.functional.softmax(logits, dim=-1, dtype=torch.float32).to(dtype=value_blocks.dtype) + + # context_vectors is adapted from jax.numpy.einsum("BNuwc,BucNH->BuwNH", ...) + b_dim, n_dim, u_dim, w_dim, c_dim = probabilities.shape + h_dim = value_blocks.shape[-1] + prob_bun = probabilities.permute(0, 2, 1, 3, 4).reshape(-1, w_dim, c_dim) + v_bun = value_blocks.permute(0, 1, 3, 2, 4).reshape(-1, c_dim, h_dim) + result_bmm = torch.bmm(prob_bun, v_bun) + context_vectors = result_bmm.reshape(b_dim, u_dim, n_dim, w_dim, h_dim).permute(0, 1, 3, 2, 4) + context_vectors = context_vectors.reshape( + ( + batch_size, + num_query_blocks * self.chunk_size, + self.num_heads, + self.head_dim, + ) + ) + context_vectors = context_vectors[:, :q_time] + + return context_vectors + + +class Gemma3nAudioCumulativeGroupNorm(nn.Module): + """Applies Group Normalization cumulatively over the time dimension. + + This layer normalizes the input by calculating the mean and variance + cumulatively over the time dimension (dim 1). The statistics are computed + over all feature dimensions (specified by `feature_dims` and `num_channels`) + for elements marked as valid by the optional `mask`. + + If a `mask` is provided (True for valid, False for invalid/padded), + invalid time steps do not contribute to the statistics calculation, and + their corresponding output values are zeroed out. + + Scale and bias, if enabled, are applied per-channel (last dimension). + This behavior is similar to JAX's `GroupNormalization` with `num_groups=1` + and `cumulative=True`. + """ + + def __init__( + self, + num_channels: int, # Number of channels (size of the last dimension) + feature_dims: Sequence[int], # Sizes of non-channel feature dimensions, e.g., (H, W) for input [B,T,H,W,C] + eps: float = 1e-3, + ): + super().__init__() + self.num_channels = num_channels + self.feature_dims = tuple(feature_dims) + self.eps = eps + + # Scale parameter depends only on the channel dimension + self.weight = nn.Parameter(torch.ones(num_channels)) + + # Axes for normalization: all dimensions except Batch (0) and Time (1). + # For input [B, T, *feature_dims, C], these are dims from 2 onwards. + self.reduction_axes = tuple(range(2, 2 + len(self.feature_dims) + 1)) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + """Applies cumulative group norm, optionally using a mask. + + Args: + hidden_states: Input tensor, shape [B, T, *feature_dims, C]. + + Returns: + Normalized tensor with the same shape as x. 
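+
+ A minimal usage sketch (hypothetical sizes):
+
+ >>> norm = Gemma3nAudioCumulativeGroupNorm(num_channels=128, feature_dims=(32,))
+ >>> norm(torch.randn(2, 10, 32, 128)).shape
+ torch.Size([2, 10, 32, 128])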
+ """ + expected_input_suffix = self.feature_dims + (self.num_channels,) + if hidden_states.shape[2:] != expected_input_suffix: + raise ValueError( + f"Input tensor shape suffix {hidden_states.shape[2:]} does not match expected" + f" suffix (feature_dims + num_channels) {expected_input_suffix}" + ) + + input_dtype = hidden_states.dtype + # Calculations are performed in float32 for numerical stability. + calc_dtype = torch.float32 + x_calc = hidden_states.to(calc_dtype) + + # Prepare a broadcastable mask (`mask_calc`). + # If no mask is provided, treat all elements as valid + # (mask_calc is all ones). + # Otherwise, expand the [B, T] mask to [B, T, 1, ..., 1] for broadcasting. + mask_calc = torch.ones_like(x_calc, dtype=calc_dtype) + + # Cumulative Statistics Calculation + # 1. Sum of values over reduction axes at each time step. + sum_values_at_t = torch.sum(x_calc, dim=self.reduction_axes, keepdim=True) + # 2. Cumulative sum of values over time. + cum_sum_values = torch.cumsum(sum_values_at_t, dim=1) + + # 3. Count of valid elements in the normalization group at each time step. + # (A "group" here consists of all features at a given Batch, Time). + elements_in_group_at_t = torch.sum(mask_calc, dim=self.reduction_axes, keepdim=True) + # 4. Cumulative count of valid elements over time. + cum_count_elements = torch.cumsum(elements_in_group_at_t, dim=1) + # Avoid division by zero if all preceding elements were masked. + safe_cum_count_elements = torch.clamp(cum_count_elements, min=1.0) + + # 5. Cumulative mean. + cum_mean = cum_sum_values / safe_cum_count_elements + + # 6. Sum of squared differences from the cumulative mean. + # Only sum for valid elements: (x_calc - cum_mean)^2 * mask_calc. + # Using x_calc here for the difference, as cum_mean already accounts for masking. + squared_diff_from_mean = (x_calc - cum_mean).pow(2) + sum_sq_diff_at_t = torch.sum(squared_diff_from_mean, dim=self.reduction_axes, keepdim=True) + + # 7. Cumulative sum of squared differences over time. + cum_sum_sq_diff = torch.cumsum(sum_sq_diff_at_t, dim=1) + + # 8. Cumulative variance. + cum_variance = cum_sum_sq_diff / safe_cum_count_elements + + # Normalize the input using the calculated cumulative statistics: + # (x - E[x]) / sqrt(Var[x] + eps) + normalized_x = (x_calc - cum_mean) * torch.rsqrt(cum_variance + self.eps) + + # Apply affine transformation (scale and bias) if enabled. + # Scale and bias are applied per-channel (last dimension). + scale = self.weight.to(calc_dtype) + # Reshape for broadcasting: [C] -> [1, ..., 1, C] + scale_view_shape = [1] * (hidden_states.dim() - 1) + [self.num_channels] + normalized_x = normalized_x * scale.view(scale_view_shape) + + # Zero out outputs for time steps that were originally masked (where mask_calc is 0). + # This ensures padded/invalid positions in the input result in zero output. + final_output = normalized_x * mask_calc + + return final_output.to(input_dtype) + + +class Gemma3nAudioSSCPConvBlock(nn.Module): + """A single convolution block for the SubSampleConvProjection. + + This block consists of a 2D convolution, followed by CumulativeGroupNorm, + and a ReLU activation. It handles manual padding for the convolution. 
+ """ + + def __init__( + self, + config: Gemma3nAudioConfig, + idx: int, + input_freq_dim: int, # Changed from input_spatial_dim + manual_padding: tuple[int, int, int, int] = (0, 0, 0, 0), + ): + super().__init__() + self.config = config + self.manual_padding = manual_padding + + # in_channels is 1 for the first block, or C_out from previous block's conv + in_channels = 1 if idx == 0 else self.config.sscp_conv_channel_size[idx - 1] + out_channels = self.config.sscp_conv_channel_size[idx] + kernel_h, kernel_w = self.config.sscp_conv_kernel_size[idx] + stride_h, stride_w = self.config.sscp_conv_stride_size[idx] + + self.conv = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=( + kernel_h, + kernel_w, + ), # Kernel (kH, kW) operates on (Time, Freq_dim) + stride=(stride_h, stride_w), + padding=(0, 0), # Manual padding is used + bias=False, + ) + + # Calculate output frequency dimension (f_out_conv) after this convolution. + # input_freq_dim is the unpadded width (feature dimension). + # self.manual_padding is (pad_F_left, pad_F_right, pad_T_top, pad_T_bottom) + f_in_padded = input_freq_dim + self.manual_padding[0] + self.manual_padding[1] + f_out_conv = (f_in_padded - kernel_w) // stride_w + 1 + + self.norm = Gemma3nAudioCumulativeGroupNorm( + num_channels=out_channels, # Channels of the conv output + feature_dims=(f_out_conv,), # The frequency dimension size after conv + eps=self.config.sscp_conv_group_norm_eps, + ) + + self.activation = nn.ReLU() + + def forward(self, audio_encodings: torch.Tensor) -> torch.Tensor: + # Input audio_encodings is [B, C_in, T_in, F_in] (e.g., C_in=1) + # manual_padding is (pad_F_left, pad_F_right, pad_T_top, pad_T_bottom) + # F.pad applies to last two dims: F_in then T_in + audio_encodings_padded = F.pad(audio_encodings, self.manual_padding, mode="constant", value=0.0).to( + self.conv.weight.dtype + ) + # Expected padded shape for F_in, k_w=3, pad_F=(1,1) -> F_padded = F_in+2 + # Expected padded shape for T_in, k_h=3, pad_T=(0,2) -> T_padded = T_in+2 + audio_encodings_conv = self.conv(audio_encodings_padded) + # Expected conv output shape: [B, C_out, T_out, F_out] + # Input to norm is [B, T_out, F_out, C_out] + x_for_norm = audio_encodings_conv.permute(0, 2, 3, 1).contiguous() + x_normed = self.norm(x_for_norm) + # Output of norm is [B, T_out, F_out, C_out], permute back to [B, C_out, T_out, F_out] + audio_encodings_normed = x_normed.permute(0, 3, 1, 2).contiguous() + return self.activation(audio_encodings_normed) + + +class Gemma3nAudioSubSampleConvProjection(nn.Module): + def __init__(self, config: Gemma3nAudioConfig): + super().__init__() + self.config = config + + current_f_for_block_input = config.input_feat_size # Start with original feature dim + calculated_block_padding = [] + calculated_f_out_dims = [] # Tracking frequency dimension output sizes + + for i in range(2): # Assuming 2 conv layers as per sscp_conv_... arrays + kernel_h, kernel_w = config.sscp_conv_kernel_size[i] + stride_h, stride_w = config.sscp_conv_stride_size[i] + + # Padding for Time (Height for Conv2d) - REVERSE_CAUSAL like + # JAX 'reverse_causal' padding is (0, kernel_size - 1) + pad_t_top = 0 + pad_t_bottom = kernel_h - 1 + + # Frequency Padding (Width for Conv2d) + # Based on JAX effective padding (1,1) for F_in=10, K_w=3, S_w=2 + # and the successful test configuration. + # If kernel/stride/input_freq for frequency changes, this might need re-evaluation + # to match generic JAX 'SAME' behavior if it differs. 
+ pad_f_left = 1 + pad_f_right = 1 + + manual_padding_tuple = ( + pad_f_left, + pad_f_right, + pad_t_top, + pad_t_bottom, + ) + calculated_block_padding.append(manual_padding_tuple) + + # Calculate output frequency dimension after this convolution + # This uses the actual padding applied and kernel/stride. + f_in_padded = current_f_for_block_input + pad_f_left + pad_f_right + f_out_after_conv = (f_in_padded - kernel_w) // stride_w + 1 # Assuming dilation_w = 1 + calculated_f_out_dims.append(f_out_after_conv) + current_f_for_block_input = f_out_after_conv + + self.conv_0 = Gemma3nAudioSSCPConvBlock( + idx=0, + input_freq_dim=config.input_feat_size, # Pass original feature dim + config=config, + manual_padding=calculated_block_padding[0], + ) + self.conv_1 = Gemma3nAudioSSCPConvBlock( + idx=1, + input_freq_dim=calculated_f_out_dims[0], # Output freq dim from conv_0 + config=config, + manual_padding=calculated_block_padding[1], + ) + final_c_out = config.sscp_conv_channel_size[-1] + final_f_out = calculated_f_out_dims[-1] # Final frequency dimension + self.input_proj_in_features = final_c_out * final_f_out + self.input_proj_linear = nn.Linear(self.input_proj_in_features, self.config.hidden_size, bias=False) + + def forward(self, audio_encodings: torch.Tensor) -> torch.Tensor: + # audio_encodings is [B, T, F_in] + # Reshape to [B, 1, T, F_in] (Batch, Channels=1, Height=Time, Width=F_in) + audio_encodings_reshaped = audio_encodings.unsqueeze(1) + x = self.conv_0(audio_encodings_reshaped) + x = self.conv_1(x) + # x from conv_1 is [B, C_out_1, T_out_1, F_out_1] + b, c_out, t_out, f_out = x.shape + # Permute to [B, T_out_1, F_out_1, C_out_1] then flatten F_out_1 and C_out_1 + x_permuted = x.permute(0, 2, 3, 1).contiguous() + output_flattened = x_permuted.view(b, t_out, f_out * c_out) + output = self.input_proj_linear(output_flattened) + return output + + +class Gemma3nAudioConformerAttention(nn.Module): + def __init__(self, config: Gemma3nAudioConfig): + super().__init__() + self.config = config + self.post_in_features = self.config.hidden_size + self.register_buffer("gradient_clipping", torch.tensor(self.config.gradient_clipping), persistent=False) + self.pre_attn_norm = Gemma3nRMSNorm(self.config.hidden_size) + self.attn = Gemma3nAudioAttention(config) + self.post = nn.Linear(self.post_in_features, self.config.hidden_size, bias=False) + self.post_norm = Gemma3nRMSNorm(self.config.hidden_size) + + def forward(self, audio_encodings: torch.Tensor, audio_mel_mask: torch.BoolTensor) -> torch.Tensor: + audio_encodings_input_to_attn = audio_encodings + audio_encodings = torch.clamp(audio_encodings, -self.gradient_clipping, self.gradient_clipping) + audio_encodings_norm = self.pre_attn_norm(audio_encodings) + # Output of self.attn is [B, T, NumHeads, HeadDim] + audio_encodings_attn_out = self.attn(audio_encodings_norm, audio_mel_mask) + + # Reshape from [B, T, NumHeads, HeadDim] to [B, T, NumHeads * HeadDim] + # NumHeads * HeadDim = hidden_size + b, t, num_heads, head_dim = audio_encodings_attn_out.shape + audio_encodings_reshaped = audio_encodings_attn_out.reshape(b, t, num_heads * head_dim) + + audio_encodings = self.post(audio_encodings_reshaped) + audio_encodings = torch.clamp(audio_encodings, -self.gradient_clipping, self.gradient_clipping) + return audio_encodings_input_to_attn + self.post_norm(audio_encodings) + + +class Gemma3nAudioConformerFeedForward(nn.Module): + def __init__(self, config: Gemma3nAudioConfig): + super().__init__() + self.config = config + + 
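+ # Structure: pre-norm -> 4x expansion -> SiLU -> projection back -> post-norm,
+ # with the block output scaled by conf_residual_weight before the residual add.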
self.register_buffer("gradient_clipping", torch.tensor(self.config.gradient_clipping), persistent=False) + + self.pre_layer_norm = Gemma3nRMSNorm(self.config.hidden_size) + self.ffw_layer_1 = nn.Linear(self.config.hidden_size, self.config.hidden_size * 4, bias=False) + self.ffw_layer_2 = nn.Linear(self.config.hidden_size * 4, self.config.hidden_size, bias=False) + self.post_layer_norm = Gemma3nRMSNorm(self.config.hidden_size) + self.post_layer_scale = torch.tensor(self.config.conf_residual_weight) + + def forward(self, audio_encodings: torch.Tensor) -> torch.Tensor: + residual = audio_encodings + audio_encodings = torch.clamp(audio_encodings, -self.gradient_clipping, self.gradient_clipping) + audio_encodings = self.pre_layer_norm(audio_encodings) + audio_encodings: torch.Tensor = self.ffw_layer_1(audio_encodings) + audio_encodings = nn.functional.silu(audio_encodings) + audio_encodings: torch.Tensor = self.ffw_layer_2(audio_encodings) + audio_encodings = torch.clamp(audio_encodings, -self.gradient_clipping, self.gradient_clipping) + audio_encodings = self.post_layer_norm(audio_encodings) + return residual + (audio_encodings * self.post_layer_scale) + + +class Gemma3nAudioConformerLightConv1d(nn.Module): + def __init__(self, config: Gemma3nAudioConfig): + super().__init__() + self.config = config + + self.pre_layer_norm = Gemma3nRMSNorm(self.config.hidden_size, eps=self.config.rms_norm_eps) + self.linear_start = nn.Linear(self.config.hidden_size, self.config.hidden_size * 2, bias=False) + self.depthwise_conv1d = nn.Conv1d( + in_channels=self.config.hidden_size, + out_channels=self.config.hidden_size, + kernel_size=self.config.conf_conv_kernel_size, + stride=1, + padding=0, # Manual causal padding + groups=self.config.hidden_size, # Depthwise + bias=False, + ) + self.register_buffer("gradient_clipping", torch.tensor(self.config.gradient_clipping), persistent=False) + self.conv_norm = Gemma3nRMSNorm(self.config.hidden_size, eps=self.config.rms_norm_eps) + self.linear_end = nn.Linear(self.config.hidden_size, self.config.hidden_size, bias=False) + + self.causal_padding = self.config.conf_conv_kernel_size - 1 + + def forward(self, audio_encodings: torch.Tensor) -> torch.Tensor: + audio_encodings_residual = audio_encodings # Save for residual connection + + audio_encodings = self.pre_layer_norm(audio_encodings) + audio_encodings = self.linear_start(audio_encodings) + audio_encodings = torch.nn.functional.glu(audio_encodings, dim=-1) + # Permute for Conv1d: [B, T, D] -> [B, D, T] + audio_encodings_permuted = audio_encodings.permute(0, 2, 1) + # Apply manual causal padding + audio_encodings_permuted_padded = F.pad(audio_encodings_permuted, (self.causal_padding, 0)) + audio_encodings = self.depthwise_conv1d(audio_encodings_permuted_padded) + # Permute back: [B, D, T_out] -> [B, T_out, D] + audio_encodings = audio_encodings.permute(0, 2, 1) + audio_encodings = torch.clamp(audio_encodings, -self.gradient_clipping, self.gradient_clipping) + audio_encodings = self.conv_norm(audio_encodings) + audio_encodings = nn.functional.silu(audio_encodings) + audio_encodings = self.linear_end(audio_encodings) + output = audio_encodings + audio_encodings_residual + return output + + +class Gemma3nAudioConformerBlock(nn.Module): + def __init__(self, config: Gemma3nAudioConfig): + super().__init__() + self.config = config + + self.ffw_layer_start = Gemma3nAudioConformerFeedForward(self.config) + self.attention = Gemma3nAudioConformerAttention(self.config) + self.lconv1d = Gemma3nAudioConformerLightConv1d(self.config) 
+ self.ffw_layer_end = Gemma3nAudioConformerFeedForward(self.config) + self.register_buffer("gradient_clipping", torch.tensor(self.config.gradient_clipping), persistent=False) + self.norm = Gemma3nRMSNorm(self.config.hidden_size) + + def forward(self, audio_encodings: torch.Tensor, audio_mel_mask: torch.BoolTensor) -> torch.Tensor: + audio_encodings = self.ffw_layer_start(audio_encodings) + audio_encodings = self.attention(audio_encodings, audio_mel_mask) + validity_mask_for_lconv = ~audio_mel_mask # True for valid + audio_encodings_for_lconv_input = audio_encodings * validity_mask_for_lconv.unsqueeze(-1).to( + audio_encodings.dtype + ) + audio_encodings = self.lconv1d(audio_encodings_for_lconv_input) + + audio_encodings = self.ffw_layer_end(audio_encodings) + audio_encodings = torch.clamp(audio_encodings, -self.gradient_clipping, self.gradient_clipping) + output = self.norm(audio_encodings) + return output + + +class Gemma3nAudioEncoder(PreTrainedModel): + """ + An audio encoder based on the [Universal Speech Model](https://huggingface.co/papers/2303.01037) architecture. + """ + + config: Gemma3nAudioConfig + + main_input_name = "audio_mel" + + def __init__(self, config: Gemma3nAudioConfig): + super().__init__(config) + self.config = config + + self.subsample_conv_projection = Gemma3nAudioSubSampleConvProjection(config) + self.conformer = nn.ModuleList( + [Gemma3nAudioConformerBlock(config) for _ in range(config.conf_num_hidden_layers)] + ) + + def forward( + self, audio_mel: torch.Tensor, audio_mel_mask: torch.BoolTensor + ) -> tuple[torch.Tensor, torch.BoolTensor]: + """Encodes a batch of MELs. + + Args: + audio_mel: a torch.Tensor of shape [batch, num_frames, num_channels, + mel_bins]. + + Returns: + audio_encodings: a torch.Tensor of shape + `[batch_size, self.config.audio_soft_tokens_per_image, + self.config.audio_config.hidden_size]` + audio_mel_mask: a torch.BoolTensor of shape [batch, num_frames]. + """ + audio_encodings = self.subsample_conv_projection(audio_mel) # audio_encodings: [B, T_sub, D] + + # Subsample the input audio_mel_mask to match the time dimension of audio_encodings (T_sub) + t_sub = audio_encodings.shape[1] + + time_stride_product = 1 + for stride_pair_idx in range(len(self.config.sscp_conv_stride_size)): + time_stride_product *= self.config.sscp_conv_stride_size[stride_pair_idx][0] + + # Create indices for gathering from the original mask. + # These indices map to original time steps corresponding to the start of each + # receptive field in the subsampled output. + indices = torch.arange(t_sub, device=audio_mel_mask.device) * time_stride_product + indices = torch.clamp(indices, max=audio_mel_mask.shape[1] - 1) # Ensure indices are valid + + # Expand indices for batch compatibility if B > 1 and indices is 1D. 
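+ # e.g. (hypothetical strides) two time strides of 2 give time_stride_product=4,
+ # so indices 0, 4, 8, ... pick the mel-mask entries aligned with each subsampled frame.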
+ if audio_mel_mask.ndim > 1 and indices.ndim == 1: + indices = indices.unsqueeze(0).expand(audio_mel_mask.shape[0], -1) # [B, T_sub] + elif ( + audio_mel_mask.ndim == indices.ndim + and audio_mel_mask.shape[0] == 1 + and indices.shape[0] != 1 + and t_sub == indices.shape[0] + ): + # Handle case where B=1 but indices became [T_sub] instead of [1, T_sub] + indices = indices.unsqueeze(0) + + current_mask = torch.gather(audio_mel_mask, 1, indices) # [B, T_sub] + + for block in self.conformer: + audio_encodings = block(audio_encodings, current_mask) # Pass the processed mask + + if self.config.conf_reduction_factor > 1: + audio_encodings = audio_encodings[:, :: self.config.conf_reduction_factor] + # Reduce the mask as well + current_mask = current_mask[:, :: self.config.conf_reduction_factor] + + audio_encodings = audio_encodings.masked_fill(current_mask.unsqueeze(-1), 0.0) + return audio_encodings, current_mask + + +class Gemma3nTextScaledWordEmbedding(nn.Embedding): + """ + This module overrides nn.Embeddings' forward by multiplying with embeddings scale. + """ + + def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, embed_scale: float = 1.0): + super().__init__(num_embeddings, embedding_dim, padding_idx) + self.register_buffer("embed_scale", torch.tensor(embed_scale), persistent=False) + + def forward(self, input_ids: torch.Tensor): + return super().forward(input_ids) * self.embed_scale.to(self.weight.dtype) + + +class Gemma3nTextLaurelBlock(nn.Module): + """Learned Augmented Residual Layer""" + + def __init__(self, config: Gemma3nTextConfig): + super().__init__() + self.config = config + + self.linear_left = nn.Linear(self.config.hidden_size, self.config.laurel_rank, bias=False) + self.linear_right = nn.Linear(self.config.laurel_rank, self.config.hidden_size, bias=False) + self.post_laurel_norm = Gemma3nRMSNorm(self.config.hidden_size, eps=self.config.rms_norm_eps) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + laurel_hidden_states: torch.Tensor = self.linear_left(hidden_states) + laurel_hidden_states: torch.Tensor = self.linear_right(laurel_hidden_states) + normed_laurel_hidden_states = self.post_laurel_norm(laurel_hidden_states) + return hidden_states + normed_laurel_hidden_states + + +class Gemma3nTextMLP(nn.Module): + def __init__(self, config: Gemma3nTextConfig, layer_idx: int = 0): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size[layer_idx] + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.act_fn = ACT2FN[config.hidden_activation] + self.activation_sparsity = config.activation_sparsity_pattern[layer_idx] + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + gate_proj = self.gate_proj(hidden_states) + if self.activation_sparsity > 0.0: + gate_proj = self._gaussian_topk(gate_proj) + activations = self.act_fn(gate_proj) + up_proj = self.up_proj(hidden_states) + down_proj = self.down_proj(activations * up_proj) + return down_proj + + def _gaussian_topk(self, inputs: torch.Tensor) -> torch.Tensor: + target_sparsity_tensor = torch.tensor(self.activation_sparsity, dtype=torch.float32, device=inputs.device) + # normal_dist and std_multiplier are adapted from jax.scipy.stats.norm.ppf(). 
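+ # For intuition (hypothetical value): activation_sparsity=0.95 gives
+ # std_multiplier ~= 1.645, so roughly only the top ~5% of each row's
+ # activations stay positive after the ReLU below.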
+ # + # References: + # * https://docs.jax.dev/en/latest/_autosummary/jax.scipy.stats.norm.ppf.html + # * https://pytorch.org/docs/stable/distributions.html#torch.distributions.normal.Normal + # * https://pytorch.org/docs/stable/distributions.html#torch.distributions.transformed_distribution.TransformedDistribution.icdf + normal_dist = torch.distributions.normal.Normal(0, 1) + std_multiplier: torch.Tensor = normal_dist.icdf(target_sparsity_tensor) + std_multiplier = std_multiplier.type(inputs.dtype) + inputs_mean = torch.mean(inputs, dim=-1, keepdim=True) + inputs_std = torch.std(inputs, dim=-1, keepdim=True, unbiased=False) + cutoff_x = inputs_mean + inputs_std * std_multiplier + return nn.functional.relu(inputs - cutoff_x) + + +class Gemma3nTextAltUp(nn.Module): + """Alternating Updates (AltUp) + + The AltUp module wraps transformer layers. The `predict` step modifies the + input to the transformer layer, and the `correct` step propagates the output + of the transformer layer to the sparsely updated dimensions. + + See more in the research paper: + + https://proceedings.neurips.cc/paper_files/paper/2023/file/f2059277ac6ce66e7e5543001afa8bb5-Paper-Conference.pdf + """ + + def __init__(self, config: Gemma3nTextConfig): + super().__init__() + self.config = config + self.correct_output_scale = nn.Parameter(torch.zeros(self.config.hidden_size)) + self.correction_coefs = nn.Linear(self.config.altup_num_inputs, self.config.altup_num_inputs, bias=False) + self.prediction_coefs = nn.Linear(self.config.altup_num_inputs, self.config.altup_num_inputs**2, bias=False) + self.modality_router = nn.Linear(self.config.hidden_size, self.config.altup_num_inputs, bias=False) + self.router_norm = Gemma3nRMSNorm(self.config.hidden_size, eps=self.config.rms_norm_eps) + self.register_buffer("router_input_scale", torch.tensor(self.config.hidden_size**-1.0), persistent=False) + + def compute_router_modalities(self, x: torch.Tensor) -> torch.Tensor: + router_inputs = self.router_norm(x) * self.router_input_scale + routed = self.modality_router(router_inputs) + return torch.tanh(routed.float()).type_as(x) + + def predict(self, hidden_states: torch.Tensor) -> torch.Tensor: + """Predicts the output of a layer using a trainable map. + + Args: + hidden_states: A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` derived by + stacking the input embeddings and preprocessing the last `num_altup_inputs - 1` matrices. + + Returns: + A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` containing the predictions. 
+ """ + modalities = self.compute_router_modalities(hidden_states[self.config.altup_active_idx]) + + if self.training and self.config.altup_coef_clip is not None: + self.prediction_coefs.weight.data.clamp_(-self.config.altup_coef_clip, self.config.altup_coef_clip) + + # Project and then transpose all 2D matrices contained so that mulmat gives the correct result + all_coefs: torch.Tensor = ( + self.prediction_coefs(modalities) + .reshape(*modalities.shape[:-1], self.config.altup_num_inputs, self.config.altup_num_inputs) + .permute(0, 1, 3, 2) + ) + + # permute hidden_states to [batch_size, num_tokens, hidden_size, altup_num_inputs] + predictions = torch.matmul(hidden_states.permute(1, 2, 3, 0), all_coefs) + predictions = predictions.permute(3, 0, 1, 2) # undo the permute + predictions += hidden_states # add the original input + return predictions.contiguous().type_as(hidden_states) + + def correct(self, predictions: torch.Tensor, activated: torch.Tensor) -> torch.Tensor: + """Corrects the predictions relative to the + + Args: + predictions: A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` derived by + stacking the input embeddings and preprocessing the last `num_altup_inputs - 1` matrices. + activated: A 3D tensor of shape `[batch_size, num_tokens, hidden_size]` containing the activated inputs. + + Returns: + A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` correcting the original + predictions relative to the activated input embeddings. + """ + modalities = self.compute_router_modalities(activated) + innovation = activated - predictions[self.config.altup_active_idx] # (batch, num_tokens, hidden_size) + innovation = innovation.repeat(self.config.altup_num_inputs, 1, 1, 1) # Repeat on dim0 to match predictions + + if self.config.altup_coef_clip is not None: + self.correction_coefs.weight.data.clamp_(-self.config.altup_coef_clip, self.config.altup_coef_clip) + + # all_coefs adapted from jax.numpy.einsum("...p,pi->...i", ...) + # Permute to (altup_num_inputs, batch_size, num_tokens) as the last dim is a scalar applied to each altup input + # and expand on dim1 for broadcastability + all_coefs: torch.Tensor = self.correction_coefs(modalities) + 1.0 + all_coefs = all_coefs.permute(2, 0, 1).unsqueeze(-1) + + corrected = torch.mul(innovation, all_coefs) + corrected += predictions # add the original input + return corrected.contiguous().type_as(activated) + + def forward(self, corrected: torch.Tensor) -> torch.Tensor: + """ + This is only defined as the `forward` so that accelerate hooks can move correctly `correct_output_scale` + (which is a nn.Parameter, not a Module) between devices when offloading. 
It is otherwise only used in + `scale_corrected_output` + """ + return (corrected.type_as(self.correct_output_scale) * self.correct_output_scale).type_as(corrected) + + def scale_corrected_output(self, corrected: torch.Tensor) -> torch.Tensor: + """Scales the provided 3D tensor of shape [batch_size, num_tokens, hidden_size].""" + return self.forward(corrected) + + +class Gemma3nTextRotaryEmbedding(nn.Module): + inv_freq: torch.Tensor # fix linting for `register_buffer` + + def __init__(self, config: Gemma3nTextConfig, device=None): + super().__init__() + # BC: "rope_type" was originally "type" + if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): + self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) + else: + self.rope_type = "default" + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + + inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = self.inv_freq + + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) + def forward(self, x, position_ids): + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) + position_ids_expanded = position_ids[:, None, :].float() + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() * self.attention_scaling + sin = emb.sin() * self.attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). 
The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + dropout: float = 0.0, + scaling: Optional[float] = None, + softcap: Optional[float] = None, + **kwargs, +) -> tuple[torch.Tensor, torch.Tensor]: + if scaling is None: + scaling = module.head_dim**-0.5 + + key_states = repeat_kv(key, module.num_key_value_groups) + value_states = repeat_kv(value, module.num_key_value_groups) + + attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling + + if softcap is not None: + attn_weights = attn_weights / softcap + attn_weights = torch.tanh(attn_weights) + attn_weights = attn_weights * softcap + if attention_mask is not None: # no matter the length, we just slice it + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + attn_output = torch.matmul(attn_weights, value_states) + attn_output = attn_output.transpose(1, 2).contiguous() + return attn_output, attn_weights + + +def apply_rotary_pos_emb( + x: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + position_ids: Optional[torch.Tensor] = None, + unsqueeze_dim: int = 1, +): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + x (`torch.Tensor`): The tensor to embed. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
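+
+ A shape-only sketch (illustrative sizes):
+
+ >>> q = torch.randn(1, 8, 16, 64)  # [batch, heads, seq, head_dim]
+ >>> cos = sin = torch.randn(1, 16, 64)  # [batch, seq, head_dim]
+ >>> apply_rotary_pos_emb(q, cos, sin).shape
+ torch.Size([1, 8, 16, 64])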
+ """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + return (x * cos) + (rotate_half(x) * sin) + + +class Gemma3nTextAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: Gemma3nTextConfig, layer_idx: int): + super().__init__() + self.is_sliding = config.layer_types[layer_idx] == "sliding_attention" + self.config = config + self.layer_idx = layer_idx + self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) + self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads + self.attention_dropout = self.config.attention_dropout + self.is_causal = True + + self.q_proj = nn.Linear( + config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias + ) + self.k_proj = nn.Linear( + config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias + ) + self.v_proj = nn.Linear( + config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias + ) + self.o_proj = nn.Linear( + config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias + ) + self.sliding_window = config.sliding_window if self.is_sliding else None + + self.q_norm = Gemma3nRMSNorm(dim=config.head_dim, eps=config.rms_norm_eps) + self.k_norm = Gemma3nRMSNorm(dim=config.head_dim, eps=config.rms_norm_eps) + self.v_norm = Gemma3nRMSNorm(dim=config.head_dim, eps=config.rms_norm_eps, with_scale=False) + + first_kv_shared_layer_idx = self.config.num_hidden_layers - self.config.num_kv_shared_layers + self.is_kv_shared_layer = layer_idx >= first_kv_shared_layer_idx > 0 + prev_layers = config.layer_types[:first_kv_shared_layer_idx] + if self.is_kv_shared_layer: + # For shared layers, find the last non-shared layer of the same type before sharing starts + self.kv_shared_layer_index = len(prev_layers) - 1 - prev_layers[::-1].index(config.layer_types[layer_idx]) + self.store_full_length_kv = False + else: + self.kv_shared_layer_index = None + # For non-shared layers, store full-length kv if this is the last non-shared layer of its type + self.store_full_length_kv = layer_idx == len(prev_layers) - 1 - prev_layers[::-1].index( + config.layer_types[layer_idx] + ) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: torch.Tensor, + attention_mask: Optional[torch.Tensor], + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[FlashAttentionKwargs], + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, self.config.head_dim) + + cos, sin = position_embeddings + + query_states = self.q_proj(hidden_states).view(hidden_shape) + query_states = self.q_norm(query_states) + query_states = apply_rotary_pos_emb(query_states, cos, sin, unsqueeze_dim=2) + query_states = query_states.transpose(1, 2) + + # For layers with shared KV (from kv sharing point onwards), we reuse the same keys/values states as the last non-sharing layer + if self.is_kv_shared_layer and past_key_values is not None: + key_states, value_states = past_key_values.shared_layers[self.kv_shared_layer_index] + # Device of past layer may be different from current one + key_states = key_states.to(query_states.device) + value_states = value_states.to(query_states.device) 
+ else: + key_states = self.k_proj(hidden_states).view(hidden_shape) + key_states = self.k_norm(key_states) + key_states = apply_rotary_pos_emb(key_states, cos, sin, unsqueeze_dim=2) + key_states = key_states.transpose(1, 2) + + value_states = self.v_proj(hidden_states).view(hidden_shape) + value_states = self.v_norm(value_states) + value_states = value_states.transpose(1, 2) + + if past_key_values is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = { + "sin": sin, + "cos": cos, + "cache_position": cache_position, + "sliding_window": self.sliding_window, + } + if not self.is_kv_shared_layer: + key_states, value_states = past_key_values.update( + key_states, value_states, self.layer_idx, cache_kwargs + ) + if self.store_full_length_kv: + if not hasattr(past_key_values, "shared_layers"): + past_key_values.shared_layers = {} + past_key_values.shared_layers[self.layer_idx] = key_states, value_states + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=self.attention_dropout if self.training else 0.0, + scaling=1.0, + sliding_window=self.sliding_window, + **kwargs, + ) + + attn_output = attn_output.reshape(*input_shape, -1).contiguous() + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights + + +class Gemma3nTextDecoderLayer(GradientCheckpointingLayer): + def __init__(self, config: Gemma3nTextConfig, layer_idx: int): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.layer_idx = layer_idx + self.attention_type = config.layer_types[layer_idx] + self.self_attn = Gemma3nTextAttention(config, layer_idx) + self.mlp = Gemma3nTextMLP(config, layer_idx=layer_idx) + self.input_layernorm = Gemma3nRMSNorm(self.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = Gemma3nRMSNorm(self.hidden_size, eps=config.rms_norm_eps) + self.pre_feedforward_layernorm = Gemma3nRMSNorm(self.hidden_size, eps=config.rms_norm_eps) + self.post_feedforward_layernorm = Gemma3nRMSNorm(self.hidden_size, eps=config.rms_norm_eps) + + self.hidden_size_per_layer_input = config.hidden_size_per_layer_input + self.act_fn = ACT2FN[config.hidden_activation] + + self.altup = Gemma3nTextAltUp(config) + self.laurel = Gemma3nTextLaurelBlock(config) + self.per_layer_input_gate = nn.Linear(self.hidden_size, self.hidden_size_per_layer_input, bias=False) + self.per_layer_projection = nn.Linear(self.hidden_size_per_layer_input, self.hidden_size, bias=False) + self.post_per_layer_input_norm = Gemma3nRMSNorm(self.hidden_size, eps=config.rms_norm_eps) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings_global: torch.Tensor, + position_embeddings_local: torch.Tensor, + per_layer_input: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, + ) -> tuple[torch.Tensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: + predictions = self.altup.predict(hidden_states) + active_prediction = 
predictions[self.config.altup_active_idx] + + active_prediction_normed = self.input_layernorm(active_prediction) + laurel_output = self.laurel(active_prediction_normed) + + # apply global RoPE to non-sliding layer only + if self.self_attn.is_sliding: + position_embeddings = position_embeddings_local + else: + position_embeddings = position_embeddings_global + + attn, self_attn_weights = self.self_attn( + hidden_states=active_prediction_normed, + position_embeddings=position_embeddings, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + attn = self.post_attention_layernorm(attn) + + attn_gated = active_prediction + attn + attn_laurel = (attn_gated + laurel_output) / math.sqrt(2) + + attn_norm = self.pre_feedforward_layernorm(attn_laurel) + attn_ffw = self.mlp(attn_norm) + attn_ffw_norm = self.post_feedforward_layernorm(attn_ffw) + attn_ffw_laurel_gated = attn_laurel + attn_ffw_norm + corrected_predictions = self.altup.correct(predictions, attn_ffw_laurel_gated) + + first_prediction = corrected_predictions[self.config.altup_active_idx].clone() + if self.config.altup_correct_scale: + first_prediction = self.altup.scale_corrected_output(first_prediction) + + # per_layer_input_gate adapted from jax.numpy.einsum("btd,dp->btp", ...) + first_prediction = self.per_layer_input_gate(first_prediction) + first_prediction = self.act_fn(first_prediction) + first_prediction = torch.multiply(first_prediction, per_layer_input) + + # per_layer_projection adapted from jax.numpy.einsum("btp,pd->btd", ...) + first_prediction = self.per_layer_projection(first_prediction) + first_prediction = self.post_per_layer_input_norm(first_prediction) + corrected_predictions[1:] += first_prediction + + outputs = (corrected_predictions,) + + if output_attentions: + outputs += (self_attn_weights,) + + return outputs + + +@auto_docstring +class Gemma3nPreTrainedModel(PreTrainedModel): + config: Gemma3nConfig + base_model_prefix = "" + supports_gradient_checkpointing = True + _no_split_modules = ["Gemma3nTextDecoderLayer"] + _skip_keys_device_placement = ["past_key_values"] + _supports_flash_attn = True + _supports_sdpa = True + _supports_flex_attn = True + + _can_compile_fullgraph = True + _supports_attention_backend = True + _can_record_outputs = { + "hidden_states": Gemma3nTextDecoderLayer, + "attentions": Gemma3nTextAttention, + } + + def _init_weights(self, module): + super()._init_weights(module) + if isinstance(module, Gemma3nAudioCumulativeGroupNorm): + module.weight.data.fill_(1.0) + elif isinstance(module, Gemma3nAudioAttention): + module.per_dim_scale.data.zero_() + elif isinstance(module, Gemma3nTextAltUp): + module.correct_output_scale.data.zero_() + + +@auto_docstring(custom_intro="The base Gemma 3n language model without a language modeling head.") +class Gemma3nTextModel(Gemma3nPreTrainedModel): + config: Gemma3nTextConfig + + def __init__(self, config: Gemma3nTextConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + # Gemma3n downcasts the below to bfloat16, causing sqrt(3072)=55.4256 to become 55.5. 
See https://github.com/huggingface/transformers/pull/29402 + self.embed_tokens = Gemma3nTextScaledWordEmbedding( + config.vocab_size, config.hidden_size, self.padding_idx, embed_scale=self.config.hidden_size**0.5 + ) + self.layers = nn.ModuleList( + [Gemma3nTextDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + + self.norm = Gemma3nRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.rotary_emb = Gemma3nTextRotaryEmbedding(config=config) + self.gradient_checkpointing = False + + # TODO (raushan): Fix this after RoPE refactor. For now we hack it by + # reassigning thetas when we want to create a local RoPE layer. Config + # defaults should hold values for global RoPE. + config = copy.deepcopy(config) + config.rope_theta = config.rope_local_base_freq + config.rope_scaling = {"rope_type": "default"} + self.rotary_emb_local = Gemma3nTextRotaryEmbedding(config=config) + + self.hidden_size = config.hidden_size + self.hidden_size_per_layer_input = config.hidden_size_per_layer_input + + self.embed_tokens_per_layer = Gemma3nTextScaledWordEmbedding( + config.vocab_size_per_layer_input, + config.num_hidden_layers * config.hidden_size_per_layer_input, + self.padding_idx, + embed_scale=config.hidden_size_per_layer_input**0.5, + ) + + self.per_layer_model_projection = nn.Linear( + self.hidden_size, + config.num_hidden_layers * config.hidden_size_per_layer_input, + bias=False, + ) + + self.per_layer_projection_norm = Gemma3nRMSNorm(config.hidden_size_per_layer_input, eps=config.rms_norm_eps) + + self.altup_projections = nn.ModuleList( + [nn.Linear(self.hidden_size, self.hidden_size, bias=False) for _ in range(1, self.config.altup_num_inputs)] + ) + + self.altup_unembed_projections = nn.ModuleList( + [nn.Linear(self.hidden_size, self.hidden_size, bias=False) for _ in range(1, self.config.altup_num_inputs)] + ) + + self.register_buffer("per_layer_projection_scale", torch.tensor(self.hidden_size**-0.5), persistent=False) + self.register_buffer("per_layer_input_scale", torch.rsqrt(torch.tensor(2.0)), persistent=False) + + # Initialize weights and apply final processing + self.post_init() + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + per_layer_inputs: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> BaseModelOutputWithPast: + r""" + per_layer_inputs (torch.Tensor, *optional*, defaults to None): + Pre-computed per-layer embeddings. If None, they are derived from input_ids if provided. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if self.gradient_checkpointing and self.training and use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. 
Setting `use_cache=False`." + ) + use_cache = False + + if input_ids is not None: + inputs_embeds = self.embed_tokens(input_ids) + per_layer_inputs = self.get_per_layer_inputs(input_ids) + + per_layer_inputs = self.project_per_layer_inputs(inputs_embeds, per_layer_inputs) + + if use_cache and past_key_values is None and not self.training: + past_key_values = DynamicCache(config=self.config) + + if cache_position is None: + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + cache_position = torch.arange( + past_seen_tokens, + past_seen_tokens + inputs_embeds.shape[1], + device=inputs_embeds.device, + ) + + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + # It may already have been prepared by e.g. `generate` + if not isinstance(causal_mask_mapping := attention_mask, dict): + # Prepare mask arguments + mask_kwargs = { + "config": self.config, + "input_embeds": inputs_embeds, + "attention_mask": attention_mask, + "cache_position": cache_position, + "past_key_values": past_key_values, + "position_ids": position_ids, + } + # Create the masks + causal_mask_mapping = { + "full_attention": create_causal_mask(**mask_kwargs), + "sliding_attention": create_sliding_window_causal_mask(**mask_kwargs), + } + + # embed positions + hidden_states_0 = inputs_embeds + + # Initialize RoPE embeddings + position_embeddings_global = self.rotary_emb(hidden_states_0, position_ids) + position_embeddings_local = self.rotary_emb_local(hidden_states_0, position_ids) + + # Expand hidden_states to support per-layer inputs + target_magnitude = torch.mean(hidden_states_0**2, dim=-1, keepdim=True) ** 0.5 + epsilon_tensor = torch.tensor(1e-5) + + temp_hidden_states = [hidden_states_0] + for i in range(1, self.config.altup_num_inputs): + # altup_proj adapted from jax.numpy.einsum("btp,pd->btd", ...) 
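+ # Each extra altup stream is a learned projection of hidden_states_0, rescaled
+ # below so its RMS matches target_magnitude (the RMS of hidden_states_0).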
+ altup_proj = self.altup_projections[i - 1](hidden_states_0) + current_hidden_state = altup_proj.to(dtype=hidden_states_0.dtype, device=target_magnitude.device) + new_magnitude = torch.mean(current_hidden_state**2, dim=-1, keepdim=True) + new_magnitude = torch.sqrt(torch.maximum(new_magnitude, epsilon_tensor.to(target_magnitude.device))) + current_hidden_state = current_hidden_state * target_magnitude / new_magnitude + temp_hidden_states.append(current_hidden_state) + + hidden_states = torch.stack(temp_hidden_states, dim=0) # [num_altup_inputs, batch, seq_len, hidden_size] + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + + for decoder_layer in self.layers[: self.config.num_hidden_layers]: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + causal_mask = causal_mask_mapping[decoder_layer.attention_type] + per_layer_input = per_layer_inputs[:, :, decoder_layer.layer_idx, :] + + layer_outputs = decoder_layer( + hidden_states, + position_embeddings_global, + position_embeddings_local, + per_layer_input, + attention_mask=causal_mask, + position_ids=position_ids, + past_key_values=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + # add hidden states from the last decoder layer (but before reprojecting to stay consistent with layer output) + if output_hidden_states: + all_hidden_states += (hidden_states,) + + # Per-layer inputs to single output + target_magnitude = torch.mean(hidden_states[0] ** 2, dim=-1, keepdim=True) ** 0.5 + temp_hidden_states = [hidden_states[0]] + for i in range(1, self.config.altup_num_inputs): + # altup_unembed_projections adapted from jax.numpy.einsum("btp,pd->btd", ...) 
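+ # Mirror of the prediction-side projections: each extra stream is mapped back
+ # and RMS-matched to stream 0 before the streams are averaged below.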
+ altup_unemb_proj: torch.Tensor = self.altup_unembed_projections[i - 1](hidden_states[i]) + current_hidden_state = altup_unemb_proj.to(dtype=hidden_states_0.dtype, device=target_magnitude.device) + new_magnitude = torch.mean(current_hidden_state**2, dim=-1, keepdim=True) + new_magnitude = torch.sqrt(torch.maximum(new_magnitude, epsilon_tensor.to(target_magnitude.device))) + current_hidden_state = current_hidden_state * target_magnitude / new_magnitude + temp_hidden_states.append(current_hidden_state) + + hidden_states = torch.stack(temp_hidden_states) + hidden_states = torch.mean(hidden_states, dim=0) + hidden_states = self.norm(hidden_states) + + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=past_key_values, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + def get_per_layer_inputs(self, input_ids: torch.LongTensor) -> torch.Tensor: + return self.embed_tokens_per_layer(input_ids).reshape( + *input_ids.shape, + self.config.num_hidden_layers, + self.hidden_size_per_layer_input, + ) + + def project_per_layer_inputs( + self, + inputs_embeds: torch.Tensor, + per_layer_inputs: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + per_layer_projection: torch.Tensor = self.per_layer_model_projection(inputs_embeds) + per_layer_projection *= self.per_layer_projection_scale.to( + dtype=inputs_embeds.dtype, device=per_layer_projection.device + ) + per_layer_projection = per_layer_projection.reshape( + *inputs_embeds.shape[:-1], + self.config.num_hidden_layers, + self.hidden_size_per_layer_input, + ) + per_layer_projection = self.per_layer_projection_norm(per_layer_projection) + + if per_layer_inputs is None: + return per_layer_projection + + if per_layer_projection.shape != per_layer_inputs.shape: + # per-layer inputs are sometimes padded with zeros, slice the relevant embeddings. 
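+ # e.g. (hypothetical) a checkpoint exporting more per-layer slots than
+ # config.num_hidden_layers keeps only the first num_hidden_layers slices here.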
+ per_layer_inputs = per_layer_inputs[..., : self.config.num_hidden_layers, :] + + return (per_layer_projection + per_layer_inputs) * self.per_layer_input_scale.to( + dtype=inputs_embeds.dtype, device=per_layer_projection.device + ) + + +@auto_docstring(custom_intro="The base Gemma 3n language model with a language modeling head.") +class Gemma3nForCausalLM(Gemma3nPreTrainedModel, GenerationMixin): + _tied_weights_keys = ["lm_head.weight"] + _tp_plan = {"lm_head": "colwise_rep"} + _pp_plan = {"lm_head": (["hidden_states"], ["logits"])} + config: Gemma3nTextConfig + base_model_prefix = "model" + _checkpoint_conversion_mapping = {"model.language_model": "model"} + + def __init__(self, config: Gemma3nTextConfig): + super().__init__(config) + self.model = Gemma3nTextModel(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **kwargs, + ) -> CausalLMOutputWithPast: + r""" + Example: + + ```python + >>> from transformers import AutoTokenizer, Gemma3nForCausalLM + + >>> model = Gemma3nForCausalLM.from_pretrained("google/gemma-2-9b") + >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b") + + >>> prompt = "What is your favorite condiment?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "What is your favorite condiment?" 
+ ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs: BaseModelOutputWithPast = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs.last_hidden_state + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) + if self.config.final_logit_softcapping is not None: + logits = logits / self.config.final_logit_softcapping + logits = torch.tanh(logits) + logits = logits * self.config.final_logit_softcapping + + loss = None + if labels is not None: + loss = self.loss_function(logits, labels, self.vocab_size, **kwargs) + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class Gemma3nMultimodalEmbedder(nn.Module): + """Embeds token ids or soft tokens for multimodal content into language model space.""" + + def __init__( + self, + multimodal_config: Union[Gemma3nAudioConfig, Gemma3nVisionConfig], + text_config: Gemma3nTextConfig, + ): + super().__init__() + + self.multimodal_hidden_size = multimodal_config.hidden_size + self.eps = multimodal_config.rms_norm_eps + self.vocab_offset = multimodal_config.vocab_offset + self.vocab_size = multimodal_config.vocab_size + self.text_hidden_size = text_config.hidden_size + + self.embedding = nn.Embedding(self.vocab_size, self.multimodal_hidden_size) + self.hard_embedding_norm = Gemma3nRMSNorm(self.multimodal_hidden_size, eps=self.eps) + self.soft_embedding_norm = Gemma3nRMSNorm(self.multimodal_hidden_size, eps=self.eps) + self.embedding_projection = nn.Linear(self.multimodal_hidden_size, self.text_hidden_size, bias=False) + self.embedding_post_projection_norm = Gemma3nRMSNorm(self.text_hidden_size, eps=self.eps, with_scale=False) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + """Embeds token ids or soft tokens for multimodal content into language model space. + + Args: + input_ids: A torch.LongTensor containing the token ids to embed. Values should be in the range + `[vocab_offset, vocab_offset + vocab_size)`. + inputs_embeds: A torch.Tensor containing the soft tokens to embed. + + Returns: + A torch.Tensor of embeddings with shape `[batch_size, seq_len, self.config.text_config.hidden_size]`. 
+ """ + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if inputs_embeds is not None: + emb_norm = self.soft_embedding_norm(inputs_embeds) + else: + hard_emb = self.embedding(input_ids - self.vocab_offset) + emb_norm = self.hard_embedding_norm(hard_emb) + + emb_norm_proj = self.embedding_projection(emb_norm) + return self.embedding_post_projection_norm(emb_norm_proj) + + +@auto_docstring( + custom_intro=""" + The base Gemma 3n model comprising a vision backbone, an audio backbone, and a language model without a + language modeling head. + """ +) +class Gemma3nModel(Gemma3nPreTrainedModel): + _checkpoint_conversion_mapping = {} + # we are filtering the logits/labels so we shouldn't divide the loss based on num_items_in_batch + accepts_loss_kwargs = False + + def __init__(self, config: Gemma3nConfig): + super().__init__(config) + self.vision_tower = AutoModel.from_config(config=config.vision_config) + self.vocab_size = config.text_config.vocab_size + + language_model = AutoModel.from_config(config=config.text_config) + self.language_model = language_model + + self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1 + self.vocab_size_per_layer_input = config.text_config.vocab_size_per_layer_input + self.audio_tower = AutoModel.from_config(config.audio_config) + self.embed_vision = Gemma3nMultimodalEmbedder(config.vision_config, config.text_config) + self.embed_audio = Gemma3nMultimodalEmbedder(config.audio_config, config.text_config) + self.post_init() + + def get_input_embeddings(self): + return self.language_model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.language_model.set_input_embeddings(value) + + def set_decoder(self, decoder): + self.language_model = decoder + + def get_decoder(self): + return self.language_model + + def get_image_features(self, pixel_values: torch.Tensor) -> torch.Tensor: + """ + Projects the last hidden state from the vision model into language model space. + + Args: + pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`) + The tensors corresponding to the input images. + + Returns: + image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`). + """ + vision_outputs = self.vision_tower( + pixel_values=pixel_values, do_pooling=False, return_dict=True + ).last_hidden_state + # Convert from (batch, channels, height, width) to (batch, height * width, channels) where: + # height == width and height * width == Gemma3nConfig.vision_soft_tokens_per_image. + vision_outputs = vision_outputs.reshape( + vision_outputs.shape[0], + self.config.vision_config.hidden_size, + self.config.vision_soft_tokens_per_image, + ).permute(0, 2, 1) + # Normalize and embed the soft tokens into language model space. + vision_outputs *= self.config.vision_config.hidden_size**0.5 + return self.embed_vision(inputs_embeds=vision_outputs) + + def get_placeholder_mask( + self, + input_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + image_features: Optional[torch.FloatTensor] = None, + audio_features: Optional[torch.FloatTensor] = None, + ): + """ + Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is + equal to the length of multimodal features. If the lengths are different, an error is raised. 
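+
+         Both returned masks are boolean tensors that have already been expanded to the shape of `inputs_embeds`, so
+         callers can splice multimodal features in directly, e.g. with
+         `inputs_embeds.masked_scatter(special_image_mask, image_features)`.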
+ """ + if input_ids is None: + special_image_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + special_image_mask = special_image_mask.all(-1) + special_audio_mask = ( + inputs_embeds + == self.get_input_embeddings()( + torch.tensor(self.config.audio_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + ).all(-1) + else: + special_image_mask = input_ids == self.config.image_token_id + special_audio_mask = input_ids == self.config.audio_token_id + + n_image_tokens = special_image_mask.sum() + special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + if image_features is not None and inputs_embeds[special_image_mask].numel() != image_features.numel(): + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {image_features.shape[0] * image_features.shape[1]}" + ) + + n_audio_tokens = special_audio_mask.sum() + special_audio_mask = special_audio_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + if audio_features is not None and inputs_embeds[special_audio_mask].numel() != audio_features.numel(): + raise ValueError( + f"Audio features and image tokens do not match: tokens: {n_audio_tokens}, features {audio_features.shape[0] * audio_features.shape[1]}" + ) + + return special_image_mask, special_audio_mask + + @can_return_tuple + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, # text inputs + pixel_values: Optional[torch.FloatTensor] = None, # vision inputs + input_features: Optional[torch.FloatTensor] = None, # audio inputs + attention_mask: Optional[torch.Tensor] = None, + input_features_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + token_type_ids: Optional[torch.LongTensor] = None, + cache_position: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + **lm_kwargs, + ) -> Gemma3nCausalLMOutputWithPast: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`. + + Example: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, Gemma3nForConditionalGeneration + + >>> model = Gemma3nForConditionalGeneration.from_pretrained("google/gemma3n2-3b-mix-224") + >>> processor = AutoProcessor.from_pretrained("google/gemma3n2-3b-mix-224") + + >>> prompt = "Where is the cat standing?" 
+ >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, text=prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(**inputs,) + >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Where is the cat standing?\nsnow" + ``` + """ + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + if input_ids is not None: + inputs_embeds = self.get_input_embeddings()(input_ids) + + # Prepare per-layer inputs from inputs_ids + per_layer_inputs_mask = torch.logical_and(input_ids >= 0, input_ids < self.vocab_size_per_layer_input) + per_layer_inputs_tokens = torch.where(per_layer_inputs_mask, input_ids, torch.zeros_like(input_ids)) + per_layer_inputs = self.language_model.get_per_layer_inputs(per_layer_inputs_tokens) + + # Handle vision tokens (>= embed_vision.vocab_offset and < embed_audio.vocab_offset) + vision_mask = torch.logical_and( + input_ids >= self.embed_vision.vocab_offset, input_ids < self.embed_audio.vocab_offset + ) + dummy_vision_token_id = self.embed_vision.vocab_offset + self.embed_vision.vocab_size - 1 + vision_input_ids = torch.where(vision_mask, input_ids, dummy_vision_token_id).to(inputs_embeds.device) + vision_embeds = self.embed_vision(input_ids=vision_input_ids) + expanded_vision_mask = vision_mask.unsqueeze(-1).expand_as(inputs_embeds) + inputs_embeds = torch.where(expanded_vision_mask, vision_embeds, inputs_embeds) + + # Handle audio tokens (>= embed_audio.vocab_offset) + audio_mask = input_ids >= self.embed_audio.vocab_offset + dummy_audio_token_id = self.embed_audio.vocab_offset + self.embed_audio.vocab_size - 1 + audio_input_ids = torch.where(audio_mask, input_ids, dummy_audio_token_id).to(inputs_embeds.device) + audio_embeds = self.embed_audio(input_ids=audio_input_ids) + expanded_audio_mask = audio_mask.unsqueeze(-1).expand_as(inputs_embeds) + inputs_embeds = torch.where(expanded_audio_mask, audio_embeds, inputs_embeds) + else: + per_layer_inputs = None + + # Merge text and images + if pixel_values is not None: + image_features = self.get_image_features(pixel_values) + image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) + special_image_mask, _ = self.get_placeholder_mask( + input_ids, inputs_embeds=inputs_embeds, image_features=image_features + ) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) + + # Merge text and audio + if input_features is not None and input_features_mask is not None: + audio_features, audio_mask = self.get_audio_features(input_features, ~input_features_mask) + + # The Gemma3nProcessor expects all audio will be 30s in length and inserts 188 audio soft tokens into the + # text to account for this. However, the audio preprocessing and encoder do not gurarantee they will + # produce 188 soft tokens; they will produce at most that many tokens, but they may produce fewer tokens + # depending on the length of the longest audio input in the batch. 
When we encounter this situation, we pad + # the audio feature out to 188 soft tokens with the emebedding of the last token in the embed_audio vocab. + audio_padding_toks = torch.tensor([[self.vocab_size - 1]], dtype=torch.long, device=audio_features.device) + audio_padding_embs = self.embed_audio(input_ids=audio_padding_toks) + audio_features = torch.where(audio_mask.unsqueeze(-1), audio_padding_embs, audio_features) + + audio_batch_size, audio_seq_len, audio_embed_dim = audio_features.shape + extra_padding_tokens = self.config.audio_soft_tokens_per_image - audio_seq_len + extra_padding_features = audio_padding_embs.expand(audio_batch_size, extra_padding_tokens, audio_embed_dim) + + audio_features = torch.cat((audio_features, extra_padding_features), dim=1) + audio_features = audio_features.to(inputs_embeds.device, inputs_embeds.dtype) + _, special_audio_mask = self.get_placeholder_mask( + input_ids, inputs_embeds=inputs_embeds, audio_features=audio_features + ) + inputs_embeds = inputs_embeds.masked_scatter(special_audio_mask, audio_features) + + outputs = self.language_model( + input_ids=None, + per_layer_inputs=per_layer_inputs, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + cache_position=cache_position, + **lm_kwargs, + ) + + return Gemma3nModelOutputWithPast( + last_hidden_state=outputs.last_hidden_state, + past_key_values=outputs.past_key_values if use_cache else None, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + image_hidden_states=image_features if pixel_values is not None else None, + audio_hidden_states=audio_features if input_features is not None else None, + ) + + def get_audio_features( + self, input_features: torch.Tensor, input_features_mask: torch.Tensor + ) -> tuple[torch.Tensor, torch.Tensor]: + """ + Projects the last hidden state from the audio encoder into language model space. + + Args: + input_features (`torch.FloatTensor]` of shape `(num_images, seq_length, num_features)`): + The tensors corresponding to the input audio. + input_features_mask (`torch.FloatTensor]` of shape `(num_images, seq_length)`): + The attention mask for the input audio. + + Returns: + audio_features (`torch.Tensor`): Audio feature tensor of shape `(num_images, audio_length, embed_dim)`). + """ + audio_outputs, audio_mask = self.audio_tower(input_features, input_features_mask) + return self.embed_audio(inputs_embeds=audio_outputs), audio_mask + + +@auto_docstring( + custom_intro=""" + The base Gemma 3n model comprising a vision backbone, an audio backbone, a language model, and a language modeling + head. 
+ """ +) +class Gemma3nForConditionalGeneration(Gemma3nPreTrainedModel, GenerationMixin): + _checkpoint_conversion_mapping = {} + _tied_weights_keys = ["lm_head.weight"] + base_model_prefix = "model" + + def __init__(self, config: Gemma3nConfig): + super().__init__(config) + self.model = Gemma3nModel(config) + self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False) + self.post_init() + + def get_input_embeddings(self): + return self.model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.model.set_input_embeddings(value) + + def set_decoder(self, decoder): + self.model.set_decoder(decoder) + + def get_decoder(self): + return self.model.get_decoder() + + def get_image_features(self, pixel_values): + return self.model.get_image_features(pixel_values) + + # Make modules available through conditional class for BC + @property + def language_model(self): + return self.model.language_model + + @property + def vision_tower(self): + return self.model.vision_tower + + @property + def multi_modal_projector(self): + raise AttributeError("Use embed_vision instead of multi_modal_projector.") + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, # text inputs + pixel_values: Optional[torch.FloatTensor] = None, # vision inputs + input_features: Optional[torch.FloatTensor] = None, # audio inputs + attention_mask: Optional[torch.Tensor] = None, + input_features_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + token_type_ids: Optional[torch.LongTensor] = None, + cache_position: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **lm_kwargs, + ) -> Gemma3nCausalLMOutputWithPast: + r""" + input_features_mask (torch.Tensor, *optional*, defaults to None): + The attention mask for the input audio. + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are + ignored (masked), the loss is only computed for the tokens with labels in + `[0, ..., config.text_config.vocab_size]`. + + Example: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration + + >>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma-3-4b-it") + >>> processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it") + + >>> messages = [ + ... { + ... "role": "system", + ... "content": [ + ... {"type": "text", "text": "You are a helpful assistant."} + ... ] + ... }, + ... { + ... "role": "user", "content": [ + ... {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"}, + ... {"type": "text", "text": "Where is the cat standing?"}, + ... ] + ... }, + ... ] + + >>> inputs = processor.apply_chat_template( + ... messages, + ... tokenizer=True, + ... return_dict=True, + ... return_tensors="pt", + ... add_generation_prompt=True + ... 
) + >>> # Generate + >>> generate_ids = model.generate(**inputs) + >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "user\nYou are a helpful assistant.\n\n\n\n\n\nWhere is the cat standing?\nmodel\nBased on the image, the cat is standing in a snowy area, likely outdoors. It appears to" + ``` + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + outputs = self.model( + input_ids=input_ids, + pixel_values=pixel_values, + input_features=input_features, + attention_mask=attention_mask, + input_features_mask=input_features_mask, + position_ids=position_ids, + past_key_values=past_key_values, + token_type_ids=token_type_ids, + cache_position=cache_position, + inputs_embeds=inputs_embeds, + labels=labels, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + **lm_kwargs, + ) + + hidden_states = outputs.last_hidden_state + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) + if (final_logit_softcapping := self.config.get_text_config().final_logit_softcapping) is not None: + logits = logits / final_logit_softcapping + logits = torch.tanh(logits) + logits = logits * final_logit_softcapping + + loss = None + if labels is not None: + # Upcast to float if we need to compute the loss to avoid potential precision issues + logits = logits.float() + shift_logits = logits[..., :-1, :] + shift_labels = labels[..., 1:] + if attention_mask is not None: + # we use the input attention mask to shift the logits and labels, because it is 2D. 
+ # we also crop attn mask in case it is longer, which happens in PrefixTuning with peft + shift_attention_mask = attention_mask[:, -shift_logits.shape[1] :].to(logits.device) + shift_logits = shift_logits[shift_attention_mask.to(logits.device) != 0].contiguous() + shift_labels = shift_labels[shift_attention_mask.to(shift_labels.device) != 0].contiguous() + else: + shift_logits = shift_logits.contiguous() + shift_labels = shift_labels.contiguous() + # Flatten the tokens + loss_fct = nn.CrossEntropyLoss() + + flat_logits = shift_logits.view(-1, self.config.text_config.vocab_size) + flat_labels = shift_labels.view(-1).to(shift_logits.device) + loss = loss_fct(flat_logits, flat_labels) + + return Gemma3nCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + image_hidden_states=outputs.image_hidden_states, + audio_hidden_states=outputs.audio_hidden_states, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + inputs_embeds=None, + cache_position=None, + position_ids=None, + pixel_values=None, + input_features=None, + attention_mask=None, + input_features_mask=None, + token_type_ids=None, + use_cache=True, + logits_to_keep=None, + labels=None, + **kwargs, + ): + # Overwritten -- custom `position_ids` and `pixel_values` handling + model_inputs = super().prepare_inputs_for_generation( + input_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + position_ids=position_ids, + cache_position=cache_position, + use_cache=use_cache, + logits_to_keep=logits_to_keep, + token_type_ids=token_type_ids, + **kwargs, + ) + + # If we're in cached decoding stage, multimodal inputs should be None because input ids do not contain special + # tokens anymore. Otherwise multimodal inputs should be passed to model. + # NOTE: use_cache=False always needs pixel_values, input_features, and input_features_mask + if cache_position[0] == 0: + model_inputs["pixel_values"] = pixel_values + model_inputs["input_features"] = input_features + model_inputs["input_features_mask"] = input_features_mask + + return model_inputs + + @property + def audio_tower(self): + return self.model.audio_tower + + +__all__ = [ + "Gemma3nAudioEncoder", + "Gemma3nForCausalLM", + "Gemma3nForConditionalGeneration", + "Gemma3nModel", + "Gemma3nPreTrainedModel", + "Gemma3nTextModel", +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/gemma3n/modular_gemma3n.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/gemma3n/modular_gemma3n.py new file mode 100644 index 0000000000000000000000000000000000000000..7ea50b7572cf21c9cae1c740c799e936f9d87d34 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/gemma3n/modular_gemma3n.py @@ -0,0 +1,2684 @@ +# coding=utf-8 +# Copyright 2025 Google Inc. HuggingFace Inc. team. All rights reserved. +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import copy +import math +from collections.abc import Callable, Sequence +from typing import Any, Optional, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache +from ...configuration_utils import PretrainedConfig, layer_type_validation +from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask +from ...modeling_flash_attention_utils import FlashAttentionKwargs +from ...modeling_outputs import BaseModelOutputWithPast +from ...modeling_rope_utils import rope_config_validation +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...processing_utils import Unpack +from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging +from ...utils.deprecation import deprecate_kwarg +from ..auto import AutoModel +from ..gemma2.configuration_gemma2 import Gemma2Config +from ..gemma2.modeling_gemma2 import ( + Gemma2MLP, + Gemma2PreTrainedModel, + Gemma2RotaryEmbedding, + eager_attention_forward, + rotate_half, +) +from ..gemma3.modeling_gemma3 import ( + Gemma3Attention, + Gemma3DecoderLayer, + Gemma3ForCausalLM, + Gemma3RMSNorm, + Gemma3TextModel, + Gemma3TextScaledWordEmbedding, +) +from ..paligemma.modeling_paligemma import ( + PaliGemmaCausalLMOutputWithPast, + PaliGemmaForConditionalGeneration, + PaliGemmaModel, + PaligemmaModelOutputWithPast, +) +from ..timm_wrapper.configuration_timm_wrapper import TimmWrapperConfig + + +logger = logging.get_logger(__name__) + + +class Gemma3nTextConfig(Gemma2Config, PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Gemma3nTextModel`]. It is used to instantiate an + Gemma3nTextModel model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the Gemma 3n E4B, e.g. + [google/gemma-3n-E4B](https://huggingface.co/google/gemma-3n-E4B). + + Configuration objects that inherit from [`Gemma3nTextConfig`] and can be used to control the model outputs. Read + the documentation from [`Gemma3nTextConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 262400): + Vocabulary size of the Gemma3nText model. Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling [`Gemma3nTextModel`] + vocab_size_per_layer_input (`int`, *optional*, defaults to 262144): + Vocabulary size of the per-layer text embeddings that augment the standard embeddings. + hidden_size (`int`, *optional*, defaults to 2048): + Dimension of the hidden representations. + hidden_size_per_layer_input (`int`, *optional*, defaults to 256): + Dimension of the hidden representations for per-layer emebeddings. + intermediate_size (`int` or `Sequence[int]`, *optional*, defaults to 16384): + Dimension of the MLP representations. MatFormer configurations may wish to provide a sequence of integers + to account for variable intermediate_size values across layers. In such cases, + `len(intermediate_size) == num_hidden_layers`. + num_hidden_layers (`int`, *optional*, defaults to 35): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer decoder. 
+ num_key_value_heads (`int`, *optional*, defaults to 2): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout this + [paper](https://huggingface.co/papers/2305.13245). If not specified, will default to `num_attention_heads`. + head_dim (`int`, *optional*, defaults to 256): + The attention head dimension. + hidden_activation (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`): + The non-linear activation function (function or string) in the decoder. Will default to + `"gelu_pytorch_tanh"` if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` + activation function. + max_position_embeddings (`int`, *optional*, defaults to 32768): + The maximum sequence length that this model might ever be used with. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + pad_token_id (`int`, *optional*, defaults to 0): + Padding token id. + eos_token_id (`int`, *optional*, defaults to 1): + End of stream token id. + bos_token_id (`int`, *optional*, defaults to 2): + Beginning of stream token id. + rope_theta (`float`, *optional*, defaults to 1000000.0): + The base period of the RoPE embeddings. + rope_scaling (`Dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE embeddings used in global attention. + NOTE: if you apply new rope type and you expect the model to work on longer `max_position_embeddings`, we + recommend you to update this value accordingly. + Expected contents: + `rope_type` (`str`): + The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', + 'llama3'], with 'default' being the original RoPE implementation. + `factor` (`float`, *optional*): + Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In + most scaling types, a `factor` of x will enable the model to handle sequences of length x * + original maximum pre-trained length. + `original_max_position_embeddings` (`int`, *optional*): + Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during + pretraining. + `attention_factor` (`float`, *optional*): + Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention + computation. If unspecified, it defaults to value recommended by the implementation, using the + `factor` field to infer the suggested value. + `beta_fast` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear + ramp function. If unspecified, it defaults to 32. + `beta_slow` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear + ramp function. 
If unspecified, it defaults to 1.
+                 `short_factor` (`List[float]`, *optional*):
+                     Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                     `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                     size divided by the number of attention heads divided by 2
+                 `long_factor` (`List[float]`, *optional*):
+                     Only used with 'longrope'. The scaling factor to be applied to long contexts (<
+                     `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                     size divided by the number of attention heads divided by 2
+                 `low_freq_factor` (`float`, *optional*):
+                     Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+                 `high_freq_factor` (`float`, *optional*):
+                     Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
+         rope_local_base_freq (`float`, *optional*, defaults to 10000.0):
+             The base period of the RoPE embeddings for local attention.
+         attention_bias (`bool`, *optional*, defaults to `False`):
+             Whether to use a bias in the query, key, value and output projection layers during self-attention.
+         attention_dropout (`float`, *optional*, defaults to 0.0):
+             The dropout ratio for the attention probabilities.
+         sliding_window (`int`, *optional*, defaults to 512):
+             This is the size of the sliding window used by local attention layers.
+         layer_types (`Optional`, *optional*):
+             A sequence of strings defining the attention type for each layer as either "sliding_attention" or
+             "full_attention". If not provided, `layer_types` will be inferred from `num_hidden_layers` using a
+             pattern of four "sliding_attention" layers followed by one "full_attention". The last layer in the model
+             should always be a "full_attention" layer.
+         final_logit_softcapping (`float`, *optional*, defaults to 30.0):
+             Scaling factor when applying tanh softcapping on the logits.
+         altup_active_idx (`int`, *optional*, defaults to 0):
+             The index of the prediction from which AltUp will compute additional predictions or corrections.
+         altup_coef_clip (`float`, *optional*, defaults to 120.0):
+             The maximum amplitude of an AltUp prediction or correction coefficient weight.
+         altup_correct_scale (`bool`, *optional*, defaults to `True`):
+             If True, apply the `AltUp.correct_output_scale` to the corrected prediction at `altup_active_idx`.
+         altup_num_inputs (`int`, *optional*, defaults to 4):
+             The number of predictions that AltUp should make given the input sequence.
+         num_kv_shared_layers (`int`, *optional*, defaults to 15):
+             The number of layers that share KV cache values. During the forward pass, the last `num_kv_shared_layers`
+             layers in the model "share" the KV values in that each local and global layer in this range uses the KV
+             cache values computed for the last local or global layer, respectively, before entering this range. The
+             value should be a multiple of the attention pattern size (see `layer_types` parameter).
+         laurel_rank (`int`, *optional*, defaults to 64):
+             The intermediate size for the linear projections in the Learned Augmented Residual Layer.
+         activation_sparsity_pattern (`Sequence[float]`, *optional*):
+             The sparsity factor used to extract the top-k activations for a given layer. The provided Sequence must
+             explicitly provide a sparsity value for each layer in the model. By default, the first 10 layers are
+             sparse with a sparsity factor of 0.95 and the rest are dense.
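+
+     As a small, illustrative sketch of the defaults described above (it assumes nothing beyond the constructor
+     defined below): when `layer_types` is not provided, every fifth layer uses full attention, and when
+     `activation_sparsity_pattern` is not provided, only the first ten layers are sparse.
+
+     ```python
+     >>> cfg = Gemma3nTextConfig(num_hidden_layers=12)
+     >>> cfg.layer_types[4], cfg.layer_types[9]
+     ('full_attention', 'full_attention')
+     >>> cfg.activation_sparsity_pattern[0], cfg.activation_sparsity_pattern[-1]
+     (0.95, 0.0)
+     ```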
+
+     ```python
+     >>> from transformers import Gemma3nTextModel, Gemma3nTextConfig
+
+     >>> # Initializing a Gemma3nText gemma3n_text-E4B style configuration
+     >>> configuration = Gemma3nTextConfig()
+
+     >>> # Initializing a model from the gemma3n_text-E4B style configuration
+     >>> model = Gemma3nTextModel(configuration)
+
+     >>> # Accessing the model configuration
+     >>> configuration = model.config
+     ```
+     """
+
+     model_type = "gemma3n_text"
+
+     def __init__(
+         self,
+         vocab_size: int = 262_400,
+         vocab_size_per_layer_input: int = 262_144,
+         hidden_size: int = 2048,
+         hidden_size_per_layer_input: int = 256,
+         intermediate_size: Union[int, Sequence[int]] = 16_384,
+         num_hidden_layers: int = 35,
+         num_attention_heads: int = 8,
+         num_key_value_heads: int = 2,
+         head_dim: int = 256,
+         hidden_activation: str = "gelu_pytorch_tanh",
+         max_position_embeddings: int = 32_768,
+         initializer_range: float = 0.02,
+         rms_norm_eps: float = 1e-6,
+         use_cache: bool = True,
+         pad_token_id: int = 0,
+         eos_token_id: int = 1,
+         bos_token_id: int = 2,
+         rope_theta: float = 1_000_000.0,
+         rope_scaling: Optional[dict[str, Any]] = None,
+         rope_local_base_freq: float = 10_000.0,
+         attention_bias: bool = False,
+         attention_dropout: float = 0.0,
+         sliding_window: int = 512,
+         layer_types: Optional[Sequence[str]] = None,
+         final_logit_softcapping: float = 30.0,
+         altup_active_idx: int = 0,
+         altup_coef_clip: float = 120.0,
+         altup_correct_scale: bool = True,
+         altup_num_inputs: int = 4,
+         num_kv_shared_layers: int = 15,
+         laurel_rank: int = 64,
+         activation_sparsity_pattern: Optional[Union[float, Sequence[float]]] = None,
+         **kwargs,
+     ):
+         PretrainedConfig.__init__(
+             self,
+             pad_token_id=pad_token_id,
+             bos_token_id=bos_token_id,
+             eos_token_id=eos_token_id,
+             **kwargs,
+         )
+
+         if isinstance(intermediate_size, Sequence) and (intsize_len := len(intermediate_size)) != num_hidden_layers:
+             raise ValueError(
+                 "intermediate_size must have an explicit intermediate size for every layer or one for all layers. "
+                 f"Expected {num_hidden_layers} values but got {intsize_len}."
+ ) + elif not isinstance(intermediate_size, Sequence): + intermediate_size = [intermediate_size] * num_hidden_layers + + self.vocab_size = vocab_size + self.vocab_size_per_layer_input = vocab_size_per_layer_input + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.head_dim = head_dim + self.num_key_value_heads = num_key_value_heads + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + self.hidden_activation = hidden_activation + self.sliding_window = sliding_window + self.final_logit_softcapping = final_logit_softcapping + self.layer_types = layer_types + + self.rope_local_base_freq = rope_local_base_freq + self.rope_scaling = rope_scaling + rope_config_validation(self) + + if layer_types is None: + self.layer_types = [ + "full_attention" if (i + 1) % 5 == 0 else "sliding_attention" for i in range(self.num_hidden_layers) + ] + else: + self.layer_types = layer_types + + layer_type_validation(self.layer_types, self.num_hidden_layers) + + self.hidden_size_per_layer_input = hidden_size_per_layer_input + self.num_kv_shared_layers = num_kv_shared_layers + + self.altup_active_idx = altup_active_idx + self.altup_coef_clip = altup_coef_clip + self.altup_correct_scale = altup_correct_scale + self.altup_num_inputs = altup_num_inputs + + self.laurel_rank = laurel_rank + + if activation_sparsity_pattern is None: + num_sparse_layers = 10 if num_hidden_layers > 10 else 0 + activation_sparsity_pattern = [0.95] * num_sparse_layers + [0.0] * (num_hidden_layers - num_sparse_layers) + + if (len_asp := len(activation_sparsity_pattern)) != num_hidden_layers: + raise ValueError( + "activation_sparsity_pattern must have an explicit activation sparsity value for every layer." + f"Expected {num_hidden_layers} values but got {len_asp}." + ) + self.activation_sparsity_pattern = activation_sparsity_pattern + + +class Gemma3nAudioConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Gemma3nAudioEncoder`]. It is used to instantiate + an `Gemma3nAudioEncoder` model according to the specified arguments, defining the model architecture. Instantiating + a configuration with the defaults will yield a similar configuration to that of the Gemma 3n E4B, e.g., + [google/gemma-3n-E4B](https://huggingface.co/google/gemma-3n-E4B). + + Configuration objects that inherit from [`Gemma3nAudioConfig`] and can be used to control the model outputs. Read + the documentation from [`Gemma3nAudioConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 128): + Vocabulary size of the additional hard-token embeddings for audio model. These augment the embeddings + included in the `Gemma3nTextModel` to provide, e.g., the end of audio and audio soft token placeholder + tokens when converting `input_ids` to embeddings in the `Gemma3nForConditionalGeneration` model. + vocab_offset (`int`, *optional*, defaults to 262272): + Offset between the tokenizer vocab index for the token ids embedded by `Gemma3nMultimodalEmbedder` and the + 0-indexed `Gemma3nMultimodalEmbedder.embedding` table. + input_feat_size (`int`, *optional*, defaults to 128): + The number of channels in each mel-spectrogram frame. 
+ hidden_size (`int`, *optional*, defaults to 1536): + Dimension of the hidden representations. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. + gradient_clipping (`float`, *optional*, defaults to 10000000000.0): + Clipping value used to stabilize extremely large gradient values. + conf_attention_chunk_size (`int`, *optional*, defaults to 12): + The sub-sequence size for local attention processing inside the Conformer ("conf") section of the + Universal Speech Model. + conf_attention_context_left (`int`, *optional*, defaults to 13): + The left context size of the local attention inside the Conformer ("conf") section of the + Universal Speech Model. + conf_attention_context_right (`int`, *optional*, defaults to 0): + The right context size of the local attention inside the Conformer ("conf") section of the + Universal Speech Model. + conf_attention_logit_cap (`float`, *optional*, defaults to 50.0): + Logit cap applied during local attention inside the Conformer ("conf") section of the + Universal Speech Model. + conf_num_attention_heads (`int`, *optional*, defaults to 8): + The number of attention heads in local attention inside the Conformer ("conf") section of the + Universal Speech Model. + conf_num_hidden_layers (`int`, *optional*, defaults to 12): + The number of layers that use local attention inside the Conformer ("conf") section of the + Universal Speech Model. + conf_conv_kernel_size (`int`, *optional*, defaults to 5): + Convolution kernel size for the conformer block inside the Conformer ("conf") section of the + Universal Speech Model. + conf_reduction_factor (`int`, *optional*, defaults to 4): + Reduction factor used in the conformer block inside the Conformer ("conf") section of the + Universal Speech Model. + conf_residual_weight (`float`, *optional*, defaults to 0.5): + Residual connection weight inside the Conformer ("conf") section of the + Universal Speech Model. + sscp_conv_channel_size (`tuple(int, int)`, *optional*, defaults to `(128, 32)`): + The channel sizes for the first and second convolutional layers in the Sub-sample Convolution Projection + ("sscp") section of the Universal Speech Model. + sscp_conv_group_norm_eps (`float`, *optional*, defaults to 0.001): + Epsilon used in group normalization in the subsample convolution projection in the Sub-sample Convolution + Projection ("sscp") section of the Universal Speech Model. + sscp_conv_kernel_size (`tuple(tuple(int, int), tuple(int, int))`, *optional*, defaults to `((3, 3), (3, 3))`): + Kernel sizes of the two convolutional layers in the subsample convolution projection in the Sub-sample + Convolution Projection ("sscp") section of the Universal Speech Model. The kernel sizes are specified as a + tuple of height and width for each layer, where the height corresponds to the time dimension and the width + corresponds to the frequency dimension. + sscp_conv_stride_size (`tuple(tuple(int, int), tuple(int, int))`, *optional*, defaults to `((2, 2), (2, 2))`): + Stride sizes of the two convolutional layers in the subsample convolution projection in the Sub-sample + Convolution Projection ("sscp") section of the Universal Speech Model. The stride sizes are specified as a + tuple of height and width for each layer, where the height corresponds to the time dimension and the width + corresponds to the frequency dimension. 
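+
+     As a rough, illustrative consequence of the defaults above (simple arithmetic, not an additional configuration
+     knob): each of the two SSCP convolutions uses a stride of 2 along the time dimension, so a mel-spectrogram with
+     `T` frames reaches the conformer stack with roughly `T / 4` frames.
+
+     ```python
+     >>> t_frames = 400  # hypothetical number of input mel frames
+     >>> t_frames // 2 // 2  # two SSCP convolutions, each with time stride 2
+     100
+     ```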
+ + Example: + + ```python + >>> from transformers import Gemma3nAudioConfig, Gemma3nAudioEncoder + + >>> # Initializing a Gemma3nAudioEncoder gemma3n_audio-E4B-style configuration + >>> configuration = Gemma3nAudioConfig() + + >>> # Initializing a model from the gemma3n_audio-E4B style configuration + >>> model = Gemma3nAudioEncoder(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ``` + """ + + model_type = "gemma3n_audio" + + def __init__( + self, + vocab_size: int = 128, + vocab_offset: int = 262_144 + 128, # text vocab size + vision vocab size + input_feat_size: int = 128, + hidden_size: int = 1536, + rms_norm_eps: float = 1e-6, + gradient_clipping: float = 10_000_000_000.0, + conf_attention_chunk_size: int = 12, + conf_attention_context_left: int = 13, + conf_attention_context_right: int = 0, + conf_attention_logit_cap: float = 50.0, + conf_num_attention_heads: int = 8, + conf_num_hidden_layers: int = 12, + conf_conv_kernel_size: int = 5, + conf_reduction_factor: int = 4, + conf_residual_weight: float = 0.5, + sscp_conv_channel_size: tuple[int, int] = (128, 32), + sscp_conv_group_norm_eps: float = 1e-3, + sscp_conv_kernel_size: tuple[tuple[int, int], tuple[int, int]] = ( + (3, 3), + (3, 3), + ), + sscp_conv_stride_size: tuple[tuple[int, int], tuple[int, int]] = ( + (2, 2), + (2, 2), + ), + **kwargs, + ): + super().__init__(**kwargs) + self.input_feat_size = input_feat_size + self.hidden_size = hidden_size + self.rms_norm_eps = rms_norm_eps + self.vocab_size = vocab_size + self.vocab_offset = vocab_offset + self.gradient_clipping = gradient_clipping + self.conf_attention_chunk_size = conf_attention_chunk_size + self.conf_attention_context_left = conf_attention_context_left + self.conf_attention_context_right = conf_attention_context_right + self.conf_attention_logit_cap = conf_attention_logit_cap + self.conf_num_attention_heads = conf_num_attention_heads + self.conf_num_hidden_layers = conf_num_hidden_layers + self.conf_conv_kernel_size = conf_conv_kernel_size + self.conf_reduction_factor = conf_reduction_factor + self.conf_residual_weight = conf_residual_weight + self.sscp_conv_channel_size = sscp_conv_channel_size + self.sscp_conv_group_norm_eps = sscp_conv_group_norm_eps + self.sscp_conv_kernel_size = sscp_conv_kernel_size + self.sscp_conv_stride_size = sscp_conv_stride_size + + +class Gemma3nVisionConfig(TimmWrapperConfig): + r""" + This is the configuration class to store the configuration for a timm backbone [`TimmWrapper`]. It is used to + instantiate an timm model model according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar configuration to that of the Gemma 3n E4B + vision tower, e.g. [google/gemma-3n-E4B](https://huggingface.co/google/gemma-3n-E4B). + + Configuration objects inherit from [`Gemma3nVisionConfig`] and can be used to control the model outputs. Read the + documentation from [`Gemma3nVisionConfig`] for more information. + + Config loads imagenet label descriptions and stores them in `id2label` attribute, `label2id` attribute for default + imagenet models is set to `None` due to occlusions in the label descriptions. + + Args: + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + do_pooling (`bool`, *optional*, defaults to `False`): + Whether to do pooling for the last_hidden_state in `TimmWrapper` or not. 
+ architecture (`str`, *optional*, defaults to `"mobilenetv5_300m_enc"`): + Determines vision architecture for TimmWrapper. + hidden_size (`int`, *optional*, defaults to 2048): + Dimension of the hidden representations. + vocab_size (`int`, *optional*, defaults to 128): + Vocabulary size of the additional hard-token embeddings for vision model. + vocab_offset (`int`, *optional*, defaults to 262144): + Offset between the tokenizer vocab index for the token ids embedded by `Gemma3nMultimodalEmbedder` and the + 0-indexed `Gemma3nMultimodalEmbedder.embedding` table. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. + + Example: + ```python + >>> from transformers import Gemma3nVisionConfig, TimmWrapper + + >>> # Initializing a TimmWrapper gemma3n_vision-E4B-style configuration + >>> configuration = Gemma3nVisionConfig() + + >>> # Initializing a gemma3n_vision-E4B-style TimmWrapper from the configuration + >>> model = TimmWrapper(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ``` + """ + + model_type = "gemma3n_vision" + + def __init__( + self, + initializer_range: float = 0.02, + do_pooling: bool = False, + architecture: str = "mobilenetv5_300m_enc", + hidden_size: int = 2048, + vocab_size: int = 128, + vocab_offset: int = 262_144, + rms_norm_eps: float = 1e-06, + model_args: Optional[dict] = None, + **kwargs, + ): + super().__init__(**kwargs) + self.architecture = architecture + self.initializer_range = initializer_range + self.do_pooling = do_pooling + self.hidden_size = hidden_size + self.vocab_size = vocab_size + self.vocab_offset = vocab_offset + self.rms_norm_eps = rms_norm_eps + + +class Gemma3nConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Gemma3nForConditionalGeneration`]. It is used to + instantiate a Gemma3nForConditionalGeneration according to the specified arguments, defining the model + architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of + Gemma3n-E4B. + + e.g. [google/gemma-3n-E4B](https://huggingface.co/google/gemma-3n-E4B) + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + text_config (`Union[Gemma3nTextConfig, dict]`, *optional*): + The config object of the text backbone. + vision_config (`Union[AutoConfig, dict]`, *optional*): + Custom vision config or dict. + audio_config (`Union[AutoConfig, dict]`, *optional*): + Custom audio config or dict. + audio_soft_tokens_per_image (`int`, *optional*, defaults to 188): + The number of soft tokens per audio clip. + vision_soft_tokens_per_image (`int`, *optional*, defaults to 256): + The number of soft tokens per image. + boi_token_id (`int`, *optional*, defaults to 255999): + The begin-of-image token index to wrap the image prompt. + eoi_token_id (`int`, *optional*, defaults to 262144): + The end-of-image token index to wrap the image prompt. + image_token_id (`int`, *optional*, defaults to 262145): + The image token index to encode the image prompt. + boa_token_id (`int`, *optional*, defaults to 256000): + The begin-of-audio token index to wrap the audio prompt. + eoa_token_id (`int`, *optional*, defaults to 262272): + The end-of-audio token index to wrap the audio prompt. + audio_token_id (`int`, *optional*, defaults to 262273): + The audio token index to encode the audio prompt. 
+ initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + + + Example: + + ```python + >>> from transformers import Gemma3nForConditionalGeneration, Gemma3nConfig, Gemma3nTextConfig + + >>> # Initializing a MobileNet vision config, which is loaded from TIMM + >>> vision_config = Gemma3nVisionConfig() + + >>> # Initializing a Gemma3n Audio config + >>> audio_config = Gemma3nAudioConfig() + + >>> # Initializing a Gemma3n Text config + >>> text_config = Gemma3nTextConfig() + + >>> # Initializing a Gemma3n gemma-3-4b style configuration + >>> configuration = Gemma3nConfig(text_config, vision_config, audio_config) + + >>> # Initializing a model from the gemma-3-4b style configuration + >>> model = Gemma3nTextConfig(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "gemma3n" + sub_configs = { + "text_config": Gemma3nTextConfig, + "vision_config": Gemma3nVisionConfig, + "audio_config": Gemma3nAudioConfig, + } + + def __init__( + self, + text_config: Optional[Union[Gemma3nTextConfig, dict[str, Any]]] = None, + vision_config: Optional[Union[Gemma3nVisionConfig, dict[str, Any]]] = None, + audio_config: Optional[Union[Gemma3nAudioConfig, dict[str, Any]]] = None, + audio_soft_tokens_per_image: int = 188, + vision_soft_tokens_per_image: int = 256, + boi_token_id: int = 255_999, + eoi_token_id: int = 262_144, + image_token_id: int = 262_145, + boa_token_id: int = 256_000, + eoa_token_id: int = 262_272, + audio_token_id: int = 262_273, + initializer_range: float = 0.02, + **kwargs, + ): + super().__init__(**kwargs) + + if isinstance(text_config, dict): + text_config = Gemma3nTextConfig(**text_config) + elif text_config is None: + text_config = Gemma3nTextConfig() + logger.info("text_config is None. Using default Gemma3nTextConfig.") + + if isinstance(vision_config, dict): + vision_config = Gemma3nVisionConfig(**vision_config) + elif vision_config is None: + vision_config = Gemma3nVisionConfig() + logger.info("vision_config is None. Using default Gemma3nVisionConfig.") + + if isinstance(audio_config, dict): + audio_config = Gemma3nAudioConfig(**audio_config) + elif audio_config is None: + audio_config = Gemma3nAudioConfig() + logger.info("audio_config is None. Using default Gemma3nAudioConfig.") + + self.text_config = text_config + self.vision_config = vision_config + self.audio_config = audio_config + + self.audio_soft_tokens_per_image = audio_soft_tokens_per_image + self.vision_soft_tokens_per_image = vision_soft_tokens_per_image + self.boi_token_id = boi_token_id + self.eoi_token_id = eoi_token_id + self.image_token_id = image_token_id + self.boa_token_id = boa_token_id + self.eoa_token_id = eoa_token_id + self.audio_token_id = audio_token_id + self.initializer_range = initializer_range + + +class Gemma3nModelOutputWithPast(PaligemmaModelOutputWithPast): + r""" + past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache). + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + image_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`. 
+ image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state. + audio_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`. + audio_hidden_states of the model produced by the audio encoder and after projecting the last hidden state. + """ + + audio_hidden_states: Optional[torch.FloatTensor] = None + + +class Gemma3nCausalLMOutputWithPast(PaliGemmaCausalLMOutputWithPast): + r""" + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache). + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + image_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`. + image_hidden_states of the model produced by the vision encoder after projecting last hidden state. + audio_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`. + audio_hidden_states of the model produced by the audio encoder and after projecting the last hidden state. 
+ """ + + audio_hidden_states: Optional[torch.FloatTensor] = None + + +class Gemma3nRMSNorm(Gemma3RMSNorm): + def __init__(self, dim: int, eps: float = 1e-6, with_scale: bool = True): + super().__init__(dim, eps=eps) + del self.weight + self.with_scale = with_scale + + if self.with_scale: + self.weight = nn.Parameter(torch.ones(dim)) + else: + self.register_buffer("weight", torch.tensor(1.0), persistent=False) + + def _norm(self, x): + return x / torch.sqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + # Llama does x.to(float16) * w whilst Gemma2 is (x * w).to(float16) + # See https://github.com/huggingface/transformers/pull/29402 + output = self._norm(x.float()) * self.weight.float() + return output.type_as(x) + + +# ==== Audio Encoder ==== + + +class Gemma3nAudioRelativePositionEmbedding(nn.Module): + def __init__(self, config: Gemma3nAudioConfig): + super().__init__() + self.config = config + + self.num_heads = self.config.conf_num_attention_heads + self.channels = self.config.hidden_size + self.head_dim = self.channels // self.num_heads + self.max_backward = max(0, self.config.conf_attention_context_left - 1) + self.max_forward = self.config.conf_attention_context_right + + self.pos_proj = nn.Linear(self.channels, self.num_heads * self.head_dim, bias=False) + + min_timescale = 1.0 + max_timescale = 1.0e4 + num_timescales = self.channels // 2 + log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / max(num_timescales - 1, 1) + inv_timescales = min_timescale * torch.exp(torch.arange(num_timescales) * -log_timescale_increment) + self.register_buffer( + "inv_timescales", + inv_timescales.float().unsqueeze(0).unsqueeze(0), + persistent=False, + ) + + def _get_timing_signal_1d_pos(self, position: torch.Tensor, dtype: torch.dtype) -> torch.Tensor: + position = position.float().unsqueeze(-1) + scaled_time = position * self.inv_timescales.to(device=position.device, dtype=torch.float32) + timing_signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=-1) + return timing_signal.type(dtype) + + def _relative_shift( + self, + term_bd_before_shift: torch.Tensor, + batch_size: int, + num_heads: int, + num_query_blocks: int, + query_block_size: int, + key_context_size: int, + max_span_plus_1: int, + ) -> torch.Tensor: + """Performs the relative shift. + + Args: + term_bd_before_shift: Tensor of shape [B, N, U, W, F_span]. batch_size + (B), num_heads (N), num_query_blocks (U), query_block_size (W), + key_context_size (C = W+L+R), max_span_plus_1 (F_span = L+R+1). + + Returns: + Tensor of shape [B, N, U, W, C]. + """ + # term_bd_before_shift shape: [B, N, U, W, F_span] + # Target shape after shift: [B, N, U, W, C] + + # Padding amount for the last dimension (F_span) to become (C + 1) + # C = key_context_size + # F_span = max_span_plus_1 + pad_amount_last_dim = (key_context_size + 1) - max_span_plus_1 + + # PyTorch F.pad expects (pad_left, pad_right, pad_top, pad_bottom ...) + # We only pad the last dimension on the right. 
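+         #
+         # Worked sketch with illustrative sizes (W=2, F_span=3, C=4, so pad_amount_last_dim=2): each row of the
+         # [W, F_span] slice is padded to length C+1=5, the [W, C+1] matrix is flattened, the first W*C=8 entries
+         # are kept, and the result is reshaped to [W, C]. Structurally, row w of the output is row w of the input
+         # shifted right by w positions: column c of output row w holds what was column (c - w) before the shift,
+         # and out-of-range columns read the zero padding. This is the usual Transformer-XL style relative-shift
+         # trick.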
+ padding_tuple = (0, pad_amount_last_dim) + + term_bd_padded = nn.functional.pad(term_bd_before_shift, padding_tuple) + # Shape after pad: [B, N, U, W, C+1] + + # Reshape for slicing (emulating JAX's behavior) + # [B, N, U, W * (C+1)] + term_bd_reshaped = term_bd_padded.reshape( + ( + batch_size, + num_heads, + num_query_blocks, + query_block_size * (key_context_size + 1), + ) + ) + + # Slice to effective [B, N, U, W * C] + term_bd_sliced = term_bd_reshaped[:, :, :, : query_block_size * key_context_size] + + # Reshape back to [B, N, U, W, C] + term_bd_shifted = term_bd_sliced.reshape( + ( + batch_size, + num_heads, + num_query_blocks, + query_block_size, + key_context_size, + ) + ) + return term_bd_shifted + + def forward(self, queries: torch.Tensor, keys: torch.Tensor) -> torch.Tensor: + # queries: [B, U, W, N, H] (batch, num_query_blocks, query_block_size, num_heads, head_dim) + # keys: [B, U, C, N, H] (batch, num_query_blocks, key_context_size, num_heads, head_dim) + # C = W + L + R (key_context_size) + # F_span = L + R + 1 (max_span + 1) + + batch_size, num_query_blocks, query_block_size, num_heads, head_dim = queries.shape + _, _, key_context_size, _, _ = keys.shape + + # Relative positions for sinusoidal embeddings: [L, L-1, ..., -R] + # Length is L+R+1 = self.max_span + 1 + pos_indices = torch.arange(self.max_backward, -self.max_forward - 1, -1, device=queries.device).unsqueeze( + 0 + ) # Shape [1, F_span] + + max_span_plus_1 = pos_indices.shape[1] # F_span + + sin_emb_timing_signal = self._get_timing_signal_1d_pos( + pos_indices, dtype=queries.dtype + ) # Shape [1, F_span, self.channels] + + # Project sinusoidal embeddings: [1, F_span, self.channels] -> [1, F_span, N*H] + projected_sin_emb = self.pos_proj(sin_emb_timing_signal) + # Reshape to [1, F_span, N, H] then squeeze to [F_span, N, H] + sin_emb = projected_sin_emb.reshape(1, max_span_plus_1, self.num_heads, self.head_dim).squeeze( + 0 + ) # Shape [F, N, H] + + # term_ac: Query-Key content interaction + # queries: [B, U, W, N, H] -> permute to [B, N, U, W, H] for matmul + # keys: [B, U, C, N, H] -> permute to [B, N, U, H, C] for matmul + queries_p = queries.permute(0, 3, 1, 2, 4) # [B, N, U, W, H] + keys_p_t = keys.permute(0, 3, 1, 4, 2) # [B, N, U, H, C] + term_ac = torch.matmul(queries_p, keys_p_t) # [B, N, U, W, C] + + # term_bd: Query-Position interaction + # Original einsum: term_bd_unshifed = torch.einsum('buwnh,fnh->bnuwf', queries, sin_emb) + # queries shape: [B, U, W, N, H] + # sin_emb shape: [F, N, H] + # Target output shape: [B, N, U, W, F] + + # Permute queries to [B, N, U, W, H] for easier broadcasting with sin_emb + q_permuted = queries.permute(0, 3, 1, 2, 4) + + # Permute sin_emb to [N, H, F] to prepare for matmul + # sin_emb original is [F, N, H] + s_permuted = sin_emb.permute(1, 2, 0) # Shape: [N, H, F] + + # Reshape queries for matmul: [B, N, U*W, H] + q_reshaped = q_permuted.reshape(batch_size, num_heads, num_query_blocks * query_block_size, head_dim) + + # Perform matmul: [B, N, U*W, H] @ [N, H, F] + # s_permuted ([N, H, F]) will be broadcast to [B, N, H, F] + # Result: [B, N, U*W, F] + term_bd_unshifed_matmul = torch.matmul(q_reshaped, s_permuted) + + # Reshape to target [B, N, U, W, F] + term_bd_unshifed = term_bd_unshifed_matmul.reshape( + batch_size, + num_heads, + num_query_blocks, + query_block_size, + max_span_plus_1, + ) + + # Apply relative shift to term_bd_unshifed + term_bd_shifted = self._relative_shift( + term_bd_unshifed, + batch_size, + num_heads, + num_query_blocks, + query_block_size, + 
key_context_size, + max_span_plus_1, + ) # Shape [B, N, U, W, C] + + return term_ac + term_bd_shifted + + +class Gemma3nAudioAttention(nn.Module): + def __init__(self, config: Gemma3nAudioConfig): + super().__init__() + self.config = config + + self.num_heads = self.config.conf_num_attention_heads + self.hidden_size = self.config.hidden_size + self.head_dim = self.hidden_size // self.num_heads + + self.chunk_size = self.config.conf_attention_chunk_size + self.max_future_horizon = self.config.conf_attention_context_right + self.max_past_horizon = max(0, self.config.conf_attention_context_left - 1) + self.attention_logits_soft_cap = self.config.conf_attention_logit_cap + self.context_size = self.chunk_size + self.max_past_horizon + self.max_future_horizon + + self.relative_position_embedding = Gemma3nAudioRelativePositionEmbedding(config) + self.per_dim_scale = nn.Parameter(torch.zeros((self.head_dim,))) + + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) + self.k_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) + self.v_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) + + q_scale = self.head_dim**-0.5 + r_softplus_0 = 1.0 / torch.nn.functional.softplus(torch.tensor(0.0)) + self.register_buffer("q_scale", (q_scale * r_softplus_0).clone().detach(), persistent=False) + + lower_causal_mask = torch.tril( + torch.ones((self.context_size, self.chunk_size), dtype=torch.bool), + diagonal=0, + ).T + upper_causal_mask = torch.tril( + torch.ones((self.chunk_size, self.context_size), dtype=torch.bool), + diagonal=self.max_past_horizon + self.max_future_horizon, + ) + local_causal_valid_mask = torch.ones((self.chunk_size, self.context_size), dtype=torch.bool) + local_causal_valid_mask = local_causal_valid_mask * lower_causal_mask * upper_causal_mask + self.register_buffer("local_causal_valid_mask", local_causal_valid_mask, persistent=False) + + self.register_buffer( + "softcap", + torch.tensor(self.attention_logits_soft_cap).float(), + persistent=False, + ) + + def _pad_dim1(self, x: torch.Tensor, pad_left: int, pad_right: int) -> torch.Tensor: + batch, _, *tail_shape = x.shape + left = x.new_zeros((batch, pad_left, *tail_shape)) + right = x.new_zeros((batch, pad_right, *tail_shape)) + x = torch.cat([left, x, right], dim=1) + return x + + def _convert_to_block(self, hidden_states: torch.Tensor) -> torch.Tensor: + """Turns a sequence to non overlapping blocks. + + Args: + hidden_states: a tensor of [batch, time, ...]. + + Returns: + A tensor of [batch, num_blocks, block_size, ...], with necessary + paddings, + where output[:, i, ...] are x[:, i*block_size:(i+1)*block_size, ...]. + """ + shape = hidden_states.shape + b, t = shape[:2] + num_blocks = (t + self.chunk_size - 1) // self.chunk_size + + if (padding_len := num_blocks * self.chunk_size - t) > 0: + hidden_states = self._pad_dim1(hidden_states, 0, padding_len) + + permute_dims = (b, num_blocks, self.chunk_size) + shape[2:] + hidden_states = hidden_states.reshape(permute_dims).contiguous() + return hidden_states + + def _extract_block_context(self, hidden_states: torch.Tensor) -> torch.Tensor: + """Extracts temporal context for every block. + + Args: + hidden_states: a tensor of [batch, time, ...]. + + Returns: + A tensor of [batch, num_blocks, context_size, ...], with necessary + paddings, + where context_size = block_size + left_context + right_context, + and output[:, i, ...] 
are x[:, start-left_context:end+right_context, + ...], + start = i * block_size, end = (i + 1) * block_size. + """ + pad_left = self.max_past_horizon + # The JAX equivalent padding for signal.frame with pad_mode='valid' is + # (left_context, right_context + block_size - 1) on the time dimension. + # PyTorch's _pad_dim1 applies padding symmetrically if only one value is given, + # or (pad_dim_start, pad_dim_end) if two are given. + # Our _pad_dim1(x, pad_left, pad_right) pads dim -2 (time for [B,T,N,H]) + # or dim 1 (time for [B,T]). + # The current pad_right calculation matches the JAX effective padding. + pad_right = self.max_future_horizon + self.chunk_size - 1 + hidden_states = self._pad_dim1(hidden_states, pad_left, pad_right) + + frame_len = self.context_size + frame_step = self.chunk_size + + # Directly use unfold without the subframe_factor logic + # x.unfold(dimension, size, step) + # dimension=1 (time dimension, assuming x is [B, T_padded, ...]) + # size=frame_len (context_size) + # step=frame_step (chunk_size) + x_unfolded = hidden_states.unfold(dimension=1, size=frame_len, step=frame_step) + + # If x was [B, T_padded], x_unfolded is [B, num_blocks, frame_len] + # If x was [B, T_padded, N, H], x_unfolded is [B, num_blocks, N, H, frame_len] + # We want to match JAX's typical output for such operations which might be + # [B, num_blocks, frame_len, N, H] if N, H are present. + # The relative_position_embedding expects keys as [B, U, C, N, H]. + # If x_unfolded is [B, U, N, H, C(frame_len)], we need to move C. + if hidden_states.ndim > 2 and x_unfolded.ndim > 3: # Check if inner dimensions (like N, H) exist + # Current shape after unfold for [B, T_pad, N, H] is [B, U, N, H, C] + # Target shape for keys in RPE: [B, U, C, N, H] + x_unfolded = torch.movedim(x_unfolded, source=-1, destination=2) + + return x_unfolded.contiguous() + + def forward(self, hidden_states: torch.Tensor, mask: torch.BoolTensor) -> torch.Tensor: + # sl.Dense uses jax.numpy.einsum("...a,abcd->...bcd") and jax.numpy.select() + qkv_shape = (*hidden_states.shape[:-1], self.num_heads, self.head_dim) + query_states = self.q_proj(hidden_states).reshape(qkv_shape).contiguous() + key_states = self.k_proj(hidden_states).reshape(qkv_shape).contiguous() + value_states = self.v_proj(hidden_states).reshape(qkv_shape).contiguous() + + per_dim_scale_sp = torch.nn.functional.softplus(self.per_dim_scale) + + broadcast_shape = (1, 1, 1, self.head_dim) + per_dim_scale_sp_broadcast = per_dim_scale_sp.view(broadcast_shape) + query_states = query_states * self.q_scale * per_dim_scale_sp_broadcast + + batch_size, q_time = query_states.shape[:2] + + query_blocks = self._convert_to_block(query_states) + key_blocks = self._extract_block_context(key_states) + value_blocks = self._extract_block_context(value_states) + num_query_blocks = query_blocks.shape[1] + + # 1. Create a mask indicating originally valid positions. + original_valid_mask = ~mask # True for valid, False for padded + + # 2. Extract blocks from this validity mask. + extracted_valid_mask_blocks = self._extract_block_context(original_valid_mask) + + # If subframe_factor was used in _extract_block_context for a [B, T] input mask, + # the shape might be [B, U, C/SF, SF]. Reshape to [B, U, C]. + # batch_size and num_query_blocks are known from query_blocks. + # self.context_size is C. 
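+ # Illustrative shapes (assumed, not from the shipped config): for a [B, T] mask with
+ # chunk_size W=12 and context_size C=25, unfold yields [B, U, 25] directly and the
+ # reshape below is a no-op; it only fires when the context dimension comes back
+ # split into sub-frames such as [B, U, 5, 5].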
+ if ( + extracted_valid_mask_blocks.ndim == 4 + and extracted_valid_mask_blocks.shape[2] * extracted_valid_mask_blocks.shape[3] == self.context_size + ): + extracted_valid_mask_blocks = extracted_valid_mask_blocks.reshape( + batch_size, num_query_blocks, self.context_size + ) + # After potential reshape, ensure it's [B, U, C] if it was from a [B,T] mask. + # This assertion might be too strict if _extract_block_context handles higher-rank inputs differently, + # but for the mask case, this should hold. + if extracted_valid_mask_blocks.shape != ( + batch_size, + num_query_blocks, + self.context_size, + ): + raise ValueError( + "Shape of extracted_valid_mask_blocks" + f" {extracted_valid_mask_blocks.shape} is not ({batch_size}," + f" {num_query_blocks}, {self.context_size}) after potential reshape." + ) + + # 3. Expand dimensions for broadcasting with logits and causal mask. + # Target shape for broadcasting with logits [B,N,U,W,C] + # extracted_valid_mask_blocks to [B, 1, U, 1, C] + condition_from_input_validity = extracted_valid_mask_blocks.unsqueeze(1).unsqueeze(-2) + + # self.local_causal_valid_mask is [W, C], True where allowed by local window. + # Expand to [1, 1, 1, W, C] + condition_from_causality = self.local_causal_valid_mask.unsqueeze(0).unsqueeze(0).unsqueeze(0) + + # 4. Combine the two conditions. + # final_condition will be True where a key is *both* originally valid *and* causally accessible. + # Broadcasts to [B, 1, U, W, C] + final_condition_for_where = torch.logical_and( + condition_from_input_validity, + condition_from_causality.to(condition_from_input_validity.device), # Ensure same device + ) + + # Embed queries and keys + logits = self.relative_position_embedding(query_blocks, key_blocks) + + # Apply attention logit softcap + # Ensure softcap is on the same device as logits + softcap_val = self.softcap.to(logits.device) + logits = logits / softcap_val + logits = torch.tanh(logits) + logits = logits * softcap_val + + # Apply the combined mask. + # final_condition_for_where will broadcast with logits [B,N,U,W,C] + logits = torch.where(final_condition_for_where, logits, torch.finfo(logits.dtype).min) + probabilities = torch.nn.functional.softmax(logits, dim=-1, dtype=torch.float32).to(dtype=value_blocks.dtype) + + # context_vectors is adapted from jax.numpy.einsum("BNuwc,BucNH->BuwNH", ...) + b_dim, n_dim, u_dim, w_dim, c_dim = probabilities.shape + h_dim = value_blocks.shape[-1] + prob_bun = probabilities.permute(0, 2, 1, 3, 4).reshape(-1, w_dim, c_dim) + v_bun = value_blocks.permute(0, 1, 3, 2, 4).reshape(-1, c_dim, h_dim) + result_bmm = torch.bmm(prob_bun, v_bun) + context_vectors = result_bmm.reshape(b_dim, u_dim, n_dim, w_dim, h_dim).permute(0, 1, 3, 2, 4) + context_vectors = context_vectors.reshape( + ( + batch_size, + num_query_blocks * self.chunk_size, + self.num_heads, + self.head_dim, + ) + ) + context_vectors = context_vectors[:, :q_time] + + return context_vectors + + +class Gemma3nAudioCumulativeGroupNorm(nn.Module): + """Applies Group Normalization cumulatively over the time dimension. + + This layer normalizes the input by calculating the mean and variance + cumulatively over the time dimension (dim 1). The statistics are computed + over all feature dimensions (specified by `feature_dims` and `num_channels`) + for elements marked as valid by the optional `mask`. + + If a `mask` is provided (True for valid, False for invalid/padded), + invalid time steps do not contribute to the statistics calculation, and + their corresponding output values are zeroed out. 
+ + Scale and bias, if enabled, are applied per-channel (last dimension). + This behavior is similar to JAX's `GroupNormalization` with `num_groups=1` + and `cumulative=True`. + """ + + def __init__( + self, + num_channels: int, # Number of channels (size of the last dimension) + feature_dims: Sequence[int], # Sizes of non-channel feature dimensions, e.g., (H, W) for input [B,T,H,W,C] + eps: float = 1e-3, + ): + super().__init__() + self.num_channels = num_channels + self.feature_dims = tuple(feature_dims) + self.eps = eps + + # Scale parameter depends only on the channel dimension + self.weight = nn.Parameter(torch.ones(num_channels)) + + # Axes for normalization: all dimensions except Batch (0) and Time (1). + # For input [B, T, *feature_dims, C], these are dims from 2 onwards. + self.reduction_axes = tuple(range(2, 2 + len(self.feature_dims) + 1)) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + """Applies cumulative group norm, optionally using a mask. + + Args: + hidden_states: Input tensor, shape [B, T, *feature_dims, C]. + + Returns: + Normalized tensor with the same shape as x. + """ + expected_input_suffix = self.feature_dims + (self.num_channels,) + if hidden_states.shape[2:] != expected_input_suffix: + raise ValueError( + f"Input tensor shape suffix {hidden_states.shape[2:]} does not match expected" + f" suffix (feature_dims + num_channels) {expected_input_suffix}" + ) + + input_dtype = hidden_states.dtype + # Calculations are performed in float32 for numerical stability. + calc_dtype = torch.float32 + x_calc = hidden_states.to(calc_dtype) + + # Prepare a broadcastable mask (`mask_calc`). + # If no mask is provided, treat all elements as valid + # (mask_calc is all ones). + # Otherwise, expand the [B, T] mask to [B, T, 1, ..., 1] for broadcasting. + mask_calc = torch.ones_like(x_calc, dtype=calc_dtype) + + # Cumulative Statistics Calculation + # 1. Sum of values over reduction axes at each time step. + sum_values_at_t = torch.sum(x_calc, dim=self.reduction_axes, keepdim=True) + # 2. Cumulative sum of values over time. + cum_sum_values = torch.cumsum(sum_values_at_t, dim=1) + + # 3. Count of valid elements in the normalization group at each time step. + # (A "group" here consists of all features at a given Batch, Time). + elements_in_group_at_t = torch.sum(mask_calc, dim=self.reduction_axes, keepdim=True) + # 4. Cumulative count of valid elements over time. + cum_count_elements = torch.cumsum(elements_in_group_at_t, dim=1) + # Avoid division by zero if all preceding elements were masked. + safe_cum_count_elements = torch.clamp(cum_count_elements, min=1.0) + + # 5. Cumulative mean. + cum_mean = cum_sum_values / safe_cum_count_elements + + # 6. Sum of squared differences from the cumulative mean. + # Only sum for valid elements: (x_calc - cum_mean)^2 * mask_calc. + # Using x_calc here for the difference, as cum_mean already accounts for masking. + squared_diff_from_mean = (x_calc - cum_mean).pow(2) + sum_sq_diff_at_t = torch.sum(squared_diff_from_mean, dim=self.reduction_axes, keepdim=True) + + # 7. Cumulative sum of squared differences over time. + cum_sum_sq_diff = torch.cumsum(sum_sq_diff_at_t, dim=1) + + # 8. Cumulative variance. + cum_variance = cum_sum_sq_diff / safe_cum_count_elements + + # Normalize the input using the calculated cumulative statistics: + # (x - E[x]) / sqrt(Var[x] + eps) + normalized_x = (x_calc - cum_mean) * torch.rsqrt(cum_variance + self.eps) + + # Apply affine transformation (scale and bias) if enabled. 
+ # Scale and bias are applied per-channel (last dimension). + scale = self.weight.to(calc_dtype) + # Reshape for broadcasting: [C] -> [1, ..., 1, C] + scale_view_shape = [1] * (hidden_states.dim() - 1) + [self.num_channels] + normalized_x = normalized_x * scale.view(scale_view_shape) + + # Zero out outputs for time steps that were originally masked (where mask_calc is 0). + # This ensures padded/invalid positions in the input result in zero output. + final_output = normalized_x * mask_calc + + return final_output.to(input_dtype) + + +class Gemma3nAudioSSCPConvBlock(nn.Module): + """A single convolution block for the SubSampleConvProjection. + + This block consists of a 2D convolution, followed by CumulativeGroupNorm, + and a ReLU activation. It handles manual padding for the convolution. + """ + + def __init__( + self, + config: Gemma3nAudioConfig, + idx: int, + input_freq_dim: int, # Changed from input_spatial_dim + manual_padding: tuple[int, int, int, int] = (0, 0, 0, 0), + ): + super().__init__() + self.config = config + self.manual_padding = manual_padding + + # in_channels is 1 for the first block, or C_out from previous block's conv + in_channels = 1 if idx == 0 else self.config.sscp_conv_channel_size[idx - 1] + out_channels = self.config.sscp_conv_channel_size[idx] + kernel_h, kernel_w = self.config.sscp_conv_kernel_size[idx] + stride_h, stride_w = self.config.sscp_conv_stride_size[idx] + + self.conv = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=( + kernel_h, + kernel_w, + ), # Kernel (kH, kW) operates on (Time, Freq_dim) + stride=(stride_h, stride_w), + padding=(0, 0), # Manual padding is used + bias=False, + ) + + # Calculate output frequency dimension (f_out_conv) after this convolution. + # input_freq_dim is the unpadded width (feature dimension). 
+ # self.manual_padding is (pad_F_left, pad_F_right, pad_T_top, pad_T_bottom) + f_in_padded = input_freq_dim + self.manual_padding[0] + self.manual_padding[1] + f_out_conv = (f_in_padded - kernel_w) // stride_w + 1 + + self.norm = Gemma3nAudioCumulativeGroupNorm( + num_channels=out_channels, # Channels of the conv output + feature_dims=(f_out_conv,), # The frequency dimension size after conv + eps=self.config.sscp_conv_group_norm_eps, + ) + + self.activation = nn.ReLU() + + def forward(self, audio_encodings: torch.Tensor) -> torch.Tensor: + # Input audio_encodings is [B, C_in, T_in, F_in] (e.g., C_in=1) + # manual_padding is (pad_F_left, pad_F_right, pad_T_top, pad_T_bottom) + # F.pad applies to last two dims: F_in then T_in + audio_encodings_padded = F.pad(audio_encodings, self.manual_padding, mode="constant", value=0.0).to( + self.conv.weight.dtype + ) + # Expected padded shape for F_in, k_w=3, pad_F=(1,1) -> F_padded = F_in+2 + # Expected padded shape for T_in, k_h=3, pad_T=(0,2) -> T_padded = T_in+2 + audio_encodings_conv = self.conv(audio_encodings_padded) + # Expected conv output shape: [B, C_out, T_out, F_out] + # Input to norm is [B, T_out, F_out, C_out] + x_for_norm = audio_encodings_conv.permute(0, 2, 3, 1).contiguous() + x_normed = self.norm(x_for_norm) + # Output of norm is [B, T_out, F_out, C_out], permute back to [B, C_out, T_out, F_out] + audio_encodings_normed = x_normed.permute(0, 3, 1, 2).contiguous() + return self.activation(audio_encodings_normed) + + +class Gemma3nAudioSubSampleConvProjection(nn.Module): + def __init__(self, config: Gemma3nAudioConfig): + super().__init__() + self.config = config + + current_f_for_block_input = config.input_feat_size # Start with original feature dim + calculated_block_padding = [] + calculated_f_out_dims = [] # Tracking frequency dimension output sizes + + for i in range(2): # Assuming 2 conv layers as per sscp_conv_... arrays + kernel_h, kernel_w = config.sscp_conv_kernel_size[i] + stride_h, stride_w = config.sscp_conv_stride_size[i] + + # Padding for Time (Height for Conv2d) - REVERSE_CAUSAL like + # JAX 'reverse_causal' padding is (0, kernel_size - 1) + pad_t_top = 0 + pad_t_bottom = kernel_h - 1 + + # Frequency Padding (Width for Conv2d) + # Based on JAX effective padding (1,1) for F_in=10, K_w=3, S_w=2 + # and the successful test configuration. + # If kernel/stride/input_freq for frequency changes, this might need re-evaluation + # to match generic JAX 'SAME' behavior if it differs. + pad_f_left = 1 + pad_f_right = 1 + + manual_padding_tuple = ( + pad_f_left, + pad_f_right, + pad_t_top, + pad_t_bottom, + ) + calculated_block_padding.append(manual_padding_tuple) + + # Calculate output frequency dimension after this convolution + # This uses the actual padding applied and kernel/stride. 
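+ # Worked example with assumed values (not necessarily the real config): for
+ # F_in=128, pad_f=(1, 1), kernel_w=3, stride_w=2 the first block yields
+ # f_out = (128 + 2 - 3) // 2 + 1 = 64 and a second identical block yields
+ # (64 + 2 - 3) // 2 + 1 = 32; the final frequency dim times the last conv channel
+ # count then sizes input_proj_linear below.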
+ f_in_padded = current_f_for_block_input + pad_f_left + pad_f_right + f_out_after_conv = (f_in_padded - kernel_w) // stride_w + 1 # Assuming dilation_w = 1 + calculated_f_out_dims.append(f_out_after_conv) + current_f_for_block_input = f_out_after_conv + + self.conv_0 = Gemma3nAudioSSCPConvBlock( + idx=0, + input_freq_dim=config.input_feat_size, # Pass original feature dim + config=config, + manual_padding=calculated_block_padding[0], + ) + self.conv_1 = Gemma3nAudioSSCPConvBlock( + idx=1, + input_freq_dim=calculated_f_out_dims[0], # Output freq dim from conv_0 + config=config, + manual_padding=calculated_block_padding[1], + ) + final_c_out = config.sscp_conv_channel_size[-1] + final_f_out = calculated_f_out_dims[-1] # Final frequency dimension + self.input_proj_in_features = final_c_out * final_f_out + self.input_proj_linear = nn.Linear(self.input_proj_in_features, self.config.hidden_size, bias=False) + + def forward(self, audio_encodings: torch.Tensor) -> torch.Tensor: + # audio_encodings is [B, T, F_in] + # Reshape to [B, 1, T, F_in] (Batch, Channels=1, Height=Time, Width=F_in) + audio_encodings_reshaped = audio_encodings.unsqueeze(1) + x = self.conv_0(audio_encodings_reshaped) + x = self.conv_1(x) + # x from conv_1 is [B, C_out_1, T_out_1, F_out_1] + b, c_out, t_out, f_out = x.shape + # Permute to [B, T_out_1, F_out_1, C_out_1] then flatten F_out_1 and C_out_1 + x_permuted = x.permute(0, 2, 3, 1).contiguous() + output_flattened = x_permuted.view(b, t_out, f_out * c_out) + output = self.input_proj_linear(output_flattened) + return output + + +class Gemma3nAudioConformerAttention(nn.Module): + def __init__(self, config: Gemma3nAudioConfig): + super().__init__() + self.config = config + self.post_in_features = self.config.hidden_size + self.register_buffer("gradient_clipping", torch.tensor(self.config.gradient_clipping), persistent=False) + self.pre_attn_norm = Gemma3nRMSNorm(self.config.hidden_size) + self.attn = Gemma3nAudioAttention(config) + self.post = nn.Linear(self.post_in_features, self.config.hidden_size, bias=False) + self.post_norm = Gemma3nRMSNorm(self.config.hidden_size) + + def forward(self, audio_encodings: torch.Tensor, audio_mel_mask: torch.BoolTensor) -> torch.Tensor: + audio_encodings_input_to_attn = audio_encodings + audio_encodings = torch.clamp(audio_encodings, -self.gradient_clipping, self.gradient_clipping) + audio_encodings_norm = self.pre_attn_norm(audio_encodings) + # Output of self.attn is [B, T, NumHeads, HeadDim] + audio_encodings_attn_out = self.attn(audio_encodings_norm, audio_mel_mask) + + # Reshape from [B, T, NumHeads, HeadDim] to [B, T, NumHeads * HeadDim] + # NumHeads * HeadDim = hidden_size + b, t, num_heads, head_dim = audio_encodings_attn_out.shape + audio_encodings_reshaped = audio_encodings_attn_out.reshape(b, t, num_heads * head_dim) + + audio_encodings = self.post(audio_encodings_reshaped) + audio_encodings = torch.clamp(audio_encodings, -self.gradient_clipping, self.gradient_clipping) + return audio_encodings_input_to_attn + self.post_norm(audio_encodings) + + +class Gemma3nAudioConformerFeedForward(nn.Module): + def __init__(self, config: Gemma3nAudioConfig): + super().__init__() + self.config = config + + self.register_buffer("gradient_clipping", torch.tensor(self.config.gradient_clipping), persistent=False) + + self.pre_layer_norm = Gemma3nRMSNorm(self.config.hidden_size) + self.ffw_layer_1 = nn.Linear(self.config.hidden_size, self.config.hidden_size * 4, bias=False) + self.ffw_layer_2 = nn.Linear(self.config.hidden_size * 4, 
self.config.hidden_size, bias=False) + self.post_layer_norm = Gemma3nRMSNorm(self.config.hidden_size) + self.post_layer_scale = torch.tensor(self.config.conf_residual_weight) + + def forward(self, audio_encodings: torch.Tensor) -> torch.Tensor: + residual = audio_encodings + audio_encodings = torch.clamp(audio_encodings, -self.gradient_clipping, self.gradient_clipping) + audio_encodings = self.pre_layer_norm(audio_encodings) + audio_encodings: torch.Tensor = self.ffw_layer_1(audio_encodings) + audio_encodings = nn.functional.silu(audio_encodings) + audio_encodings: torch.Tensor = self.ffw_layer_2(audio_encodings) + audio_encodings = torch.clamp(audio_encodings, -self.gradient_clipping, self.gradient_clipping) + audio_encodings = self.post_layer_norm(audio_encodings) + return residual + (audio_encodings * self.post_layer_scale) + + +class Gemma3nAudioConformerLightConv1d(nn.Module): + def __init__(self, config: Gemma3nAudioConfig): + super().__init__() + self.config = config + + self.pre_layer_norm = Gemma3nRMSNorm(self.config.hidden_size, eps=self.config.rms_norm_eps) + self.linear_start = nn.Linear(self.config.hidden_size, self.config.hidden_size * 2, bias=False) + self.depthwise_conv1d = nn.Conv1d( + in_channels=self.config.hidden_size, + out_channels=self.config.hidden_size, + kernel_size=self.config.conf_conv_kernel_size, + stride=1, + padding=0, # Manual causal padding + groups=self.config.hidden_size, # Depthwise + bias=False, + ) + self.register_buffer("gradient_clipping", torch.tensor(self.config.gradient_clipping), persistent=False) + self.conv_norm = Gemma3nRMSNorm(self.config.hidden_size, eps=self.config.rms_norm_eps) + self.linear_end = nn.Linear(self.config.hidden_size, self.config.hidden_size, bias=False) + + self.causal_padding = self.config.conf_conv_kernel_size - 1 + + def forward(self, audio_encodings: torch.Tensor) -> torch.Tensor: + audio_encodings_residual = audio_encodings # Save for residual connection + + audio_encodings = self.pre_layer_norm(audio_encodings) + audio_encodings = self.linear_start(audio_encodings) + audio_encodings = torch.nn.functional.glu(audio_encodings, dim=-1) + # Permute for Conv1d: [B, T, D] -> [B, D, T] + audio_encodings_permuted = audio_encodings.permute(0, 2, 1) + # Apply manual causal padding + audio_encodings_permuted_padded = F.pad(audio_encodings_permuted, (self.causal_padding, 0)) + audio_encodings = self.depthwise_conv1d(audio_encodings_permuted_padded) + # Permute back: [B, D, T_out] -> [B, T_out, D] + audio_encodings = audio_encodings.permute(0, 2, 1) + audio_encodings = torch.clamp(audio_encodings, -self.gradient_clipping, self.gradient_clipping) + audio_encodings = self.conv_norm(audio_encodings) + audio_encodings = nn.functional.silu(audio_encodings) + audio_encodings = self.linear_end(audio_encodings) + output = audio_encodings + audio_encodings_residual + return output + + +class Gemma3nAudioConformerBlock(nn.Module): + def __init__(self, config: Gemma3nAudioConfig): + super().__init__() + self.config = config + + self.ffw_layer_start = Gemma3nAudioConformerFeedForward(self.config) + self.attention = Gemma3nAudioConformerAttention(self.config) + self.lconv1d = Gemma3nAudioConformerLightConv1d(self.config) + self.ffw_layer_end = Gemma3nAudioConformerFeedForward(self.config) + self.register_buffer("gradient_clipping", torch.tensor(self.config.gradient_clipping), persistent=False) + self.norm = Gemma3nRMSNorm(self.config.hidden_size) + + def forward(self, audio_encodings: torch.Tensor, audio_mel_mask: torch.BoolTensor) -> 
torch.Tensor: + audio_encodings = self.ffw_layer_start(audio_encodings) + audio_encodings = self.attention(audio_encodings, audio_mel_mask) + validity_mask_for_lconv = ~audio_mel_mask # True for valid + audio_encodings_for_lconv_input = audio_encodings * validity_mask_for_lconv.unsqueeze(-1).to( + audio_encodings.dtype + ) + audio_encodings = self.lconv1d(audio_encodings_for_lconv_input) + + audio_encodings = self.ffw_layer_end(audio_encodings) + audio_encodings = torch.clamp(audio_encodings, -self.gradient_clipping, self.gradient_clipping) + output = self.norm(audio_encodings) + return output + + +class Gemma3nAudioEncoder(PreTrainedModel): + """ + An audio encoder based on the [Universal Speech Model](https://huggingface.co/papers/2303.01037) architecture. + """ + + config: Gemma3nAudioConfig + + main_input_name = "audio_mel" + + def __init__(self, config: Gemma3nAudioConfig): + super().__init__(config) + self.config = config + + self.subsample_conv_projection = Gemma3nAudioSubSampleConvProjection(config) + self.conformer = nn.ModuleList( + [Gemma3nAudioConformerBlock(config) for _ in range(config.conf_num_hidden_layers)] + ) + + def forward( + self, audio_mel: torch.Tensor, audio_mel_mask: torch.BoolTensor + ) -> tuple[torch.Tensor, torch.BoolTensor]: + """Encodes a batch of MELs. + + Args: + audio_mel: a torch.Tensor of shape [batch, num_frames, num_channels, + mel_bins]. + + Returns: + audio_encodings: a torch.Tensor of shape + `[batch_size, self.config.audio_soft_tokens_per_image, + self.config.audio_config.hidden_size]` + audio_mel_mask: a torch.BoolTensor of shape [batch, num_frames]. + """ + audio_encodings = self.subsample_conv_projection(audio_mel) # audio_encodings: [B, T_sub, D] + + # Subsample the input audio_mel_mask to match the time dimension of audio_encodings (T_sub) + t_sub = audio_encodings.shape[1] + + time_stride_product = 1 + for stride_pair_idx in range(len(self.config.sscp_conv_stride_size)): + time_stride_product *= self.config.sscp_conv_stride_size[stride_pair_idx][0] + + # Create indices for gathering from the original mask. + # These indices map to original time steps corresponding to the start of each + # receptive field in the subsampled output. + indices = torch.arange(t_sub, device=audio_mel_mask.device) * time_stride_product + indices = torch.clamp(indices, max=audio_mel_mask.shape[1] - 1) # Ensure indices are valid + + # Expand indices for batch compatibility if B > 1 and indices is 1D. 
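+ # Illustrative example (strides assumed): with two conv time strides of 2 each,
+ # time_stride_product = 4, so for t_sub = 3 the gathered indices are [0, 4, 8],
+ # clamped to the last valid frame and broadcast to [B, t_sub] for the
+ # torch.gather call below.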
+ if audio_mel_mask.ndim > 1 and indices.ndim == 1: + indices = indices.unsqueeze(0).expand(audio_mel_mask.shape[0], -1) # [B, T_sub] + elif ( + audio_mel_mask.ndim == indices.ndim + and audio_mel_mask.shape[0] == 1 + and indices.shape[0] != 1 + and t_sub == indices.shape[0] + ): + # Handle case where B=1 but indices became [T_sub] instead of [1, T_sub] + indices = indices.unsqueeze(0) + + current_mask = torch.gather(audio_mel_mask, 1, indices) # [B, T_sub] + + for block in self.conformer: + audio_encodings = block(audio_encodings, current_mask) # Pass the processed mask + + if self.config.conf_reduction_factor > 1: + audio_encodings = audio_encodings[:, :: self.config.conf_reduction_factor] + # Reduce the mask as well + current_mask = current_mask[:, :: self.config.conf_reduction_factor] + + audio_encodings = audio_encodings.masked_fill(current_mask.unsqueeze(-1), 0.0) + return audio_encodings, current_mask + + +# ==== Language Model ==== + + +class Gemma3nTextScaledWordEmbedding(Gemma3TextScaledWordEmbedding): + pass + + +class Gemma3nTextLaurelBlock(nn.Module): + """Learned Augmented Residual Layer""" + + def __init__(self, config: Gemma3nTextConfig): + super().__init__() + self.config = config + + self.linear_left = nn.Linear(self.config.hidden_size, self.config.laurel_rank, bias=False) + self.linear_right = nn.Linear(self.config.laurel_rank, self.config.hidden_size, bias=False) + self.post_laurel_norm = Gemma3nRMSNorm(self.config.hidden_size, eps=self.config.rms_norm_eps) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + laurel_hidden_states: torch.Tensor = self.linear_left(hidden_states) + laurel_hidden_states: torch.Tensor = self.linear_right(laurel_hidden_states) + normed_laurel_hidden_states = self.post_laurel_norm(laurel_hidden_states) + return hidden_states + normed_laurel_hidden_states + + +class Gemma3nTextMLP(Gemma2MLP): + def __init__(self, config: Gemma3nTextConfig, layer_idx: int = 0): + super().__init__(config) + self.intermediate_size = config.intermediate_size[layer_idx] + self.activation_sparsity = config.activation_sparsity_pattern[layer_idx] + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + gate_proj = self.gate_proj(hidden_states) + if self.activation_sparsity > 0.0: + gate_proj = self._gaussian_topk(gate_proj) + activations = self.act_fn(gate_proj) + up_proj = self.up_proj(hidden_states) + down_proj = self.down_proj(activations * up_proj) + return down_proj + + def _gaussian_topk(self, inputs: torch.Tensor) -> torch.Tensor: + target_sparsity_tensor = torch.tensor(self.activation_sparsity, dtype=torch.float32, device=inputs.device) + # normal_dist and std_multiplier are adapted from jax.scipy.stats.norm.ppf(). 
+ # + # References: + # * https://docs.jax.dev/en/latest/_autosummary/jax.scipy.stats.norm.ppf.html + # * https://pytorch.org/docs/stable/distributions.html#torch.distributions.normal.Normal + # * https://pytorch.org/docs/stable/distributions.html#torch.distributions.transformed_distribution.TransformedDistribution.icdf + normal_dist = torch.distributions.normal.Normal(0, 1) + std_multiplier: torch.Tensor = normal_dist.icdf(target_sparsity_tensor) + std_multiplier = std_multiplier.type(inputs.dtype) + inputs_mean = torch.mean(inputs, dim=-1, keepdim=True) + inputs_std = torch.std(inputs, dim=-1, keepdim=True, unbiased=False) + cutoff_x = inputs_mean + inputs_std * std_multiplier + return nn.functional.relu(inputs - cutoff_x) + + +class Gemma3nTextAltUp(nn.Module): + """Alternating Updates (AltUp) + + The AltUp module wraps transformer layers. The `predict` step modifies the + input to the transformer layer, and the `correct` step propagates the output + of the transformer layer to the sparsely updated dimensions. + + See more in the research paper: + + https://proceedings.neurips.cc/paper_files/paper/2023/file/f2059277ac6ce66e7e5543001afa8bb5-Paper-Conference.pdf + """ + + def __init__(self, config: Gemma3nTextConfig): + super().__init__() + self.config = config + self.correct_output_scale = nn.Parameter(torch.zeros(self.config.hidden_size)) + self.correction_coefs = nn.Linear(self.config.altup_num_inputs, self.config.altup_num_inputs, bias=False) + self.prediction_coefs = nn.Linear(self.config.altup_num_inputs, self.config.altup_num_inputs**2, bias=False) + self.modality_router = nn.Linear(self.config.hidden_size, self.config.altup_num_inputs, bias=False) + self.router_norm = Gemma3nRMSNorm(self.config.hidden_size, eps=self.config.rms_norm_eps) + self.register_buffer("router_input_scale", torch.tensor(self.config.hidden_size**-1.0), persistent=False) + + def compute_router_modalities(self, x: torch.Tensor) -> torch.Tensor: + router_inputs = self.router_norm(x) * self.router_input_scale + routed = self.modality_router(router_inputs) + return torch.tanh(routed.float()).type_as(x) + + def predict(self, hidden_states: torch.Tensor) -> torch.Tensor: + """Predicts the output of a layer using a trainable map. + + Args: + hidden_states: A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` derived by + stacking the input embeddings and preprocessing the last `num_altup_inputs - 1` matrices. + + Returns: + A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` containing the predictions. 
+ """ + modalities = self.compute_router_modalities(hidden_states[self.config.altup_active_idx]) + + if self.training and self.config.altup_coef_clip is not None: + self.prediction_coefs.weight.data.clamp_(-self.config.altup_coef_clip, self.config.altup_coef_clip) + + # Project and then transpose all 2D matrices contained so that mulmat gives the correct result + all_coefs: torch.Tensor = ( + self.prediction_coefs(modalities) + .reshape(*modalities.shape[:-1], self.config.altup_num_inputs, self.config.altup_num_inputs) + .permute(0, 1, 3, 2) + ) + + # permute hidden_states to [batch_size, num_tokens, hidden_size, altup_num_inputs] + predictions = torch.matmul(hidden_states.permute(1, 2, 3, 0), all_coefs) + predictions = predictions.permute(3, 0, 1, 2) # undo the permute + predictions += hidden_states # add the original input + return predictions.contiguous().type_as(hidden_states) + + def correct(self, predictions: torch.Tensor, activated: torch.Tensor) -> torch.Tensor: + """Corrects the predictions relative to the + + Args: + predictions: A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` derived by + stacking the input embeddings and preprocessing the last `num_altup_inputs - 1` matrices. + activated: A 3D tensor of shape `[batch_size, num_tokens, hidden_size]` containing the activated inputs. + + Returns: + A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` correcting the original + predictions relative to the activated input embeddings. + """ + modalities = self.compute_router_modalities(activated) + innovation = activated - predictions[self.config.altup_active_idx] # (batch, num_tokens, hidden_size) + innovation = innovation.repeat(self.config.altup_num_inputs, 1, 1, 1) # Repeat on dim0 to match predictions + + if self.config.altup_coef_clip is not None: + self.correction_coefs.weight.data.clamp_(-self.config.altup_coef_clip, self.config.altup_coef_clip) + + # all_coefs adapted from jax.numpy.einsum("...p,pi->...i", ...) + # Permute to (altup_num_inputs, batch_size, num_tokens) as the last dim is a scalar applied to each altup input + # and expand on dim1 for broadcastability + all_coefs: torch.Tensor = self.correction_coefs(modalities) + 1.0 + all_coefs = all_coefs.permute(2, 0, 1).unsqueeze(-1) + + corrected = torch.mul(innovation, all_coefs) + corrected += predictions # add the original input + return corrected.contiguous().type_as(activated) + + def forward(self, corrected: torch.Tensor) -> torch.Tensor: + """ + This is only defined as the `forward` so that accelerate hooks can move correctly `correct_output_scale` + (which is a nn.Parameter, not a Module) between devices when offloading. It is otherwise only used in + `scale_corrected_output` + """ + return (corrected.type_as(self.correct_output_scale) * self.correct_output_scale).type_as(corrected) + + def scale_corrected_output(self, corrected: torch.Tensor) -> torch.Tensor: + """Scales the provided 3D tensor of shape [batch_size, num_tokens, hidden_size].""" + return self.forward(corrected) + + +class Gemma3nTextRotaryEmbedding(Gemma2RotaryEmbedding): + pass + + +def apply_rotary_pos_emb( + x: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + position_ids: Optional[torch.Tensor] = None, + unsqueeze_dim: int = 1, +): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + x (`torch.Tensor`): The tensor to embed. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. 
+ position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + return (x * cos) + (rotate_half(x) * sin) + + +class Gemma3nTextAttention(Gemma3Attention): + def __init__(self, config: Gemma3nTextConfig, layer_idx: int): + super().__init__(config, layer_idx) + self.is_causal = True + del self.attn_logit_softcapping + del self.scaling + self.v_norm = Gemma3nRMSNorm(dim=config.head_dim, eps=config.rms_norm_eps, with_scale=False) + + first_kv_shared_layer_idx = self.config.num_hidden_layers - self.config.num_kv_shared_layers + self.is_kv_shared_layer = layer_idx >= first_kv_shared_layer_idx > 0 + prev_layers = config.layer_types[:first_kv_shared_layer_idx] + if self.is_kv_shared_layer: + # For shared layers, find the last non-shared layer of the same type before sharing starts + self.kv_shared_layer_index = len(prev_layers) - 1 - prev_layers[::-1].index(config.layer_types[layer_idx]) + self.store_full_length_kv = False + else: + self.kv_shared_layer_index = None + # For non-shared layers, store full-length kv if this is the last non-shared layer of its type + self.store_full_length_kv = layer_idx == len(prev_layers) - 1 - prev_layers[::-1].index( + config.layer_types[layer_idx] + ) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: torch.Tensor, + attention_mask: Optional[torch.Tensor], + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[FlashAttentionKwargs], + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, self.config.head_dim) + + cos, sin = position_embeddings + + query_states = self.q_proj(hidden_states).view(hidden_shape) + query_states = self.q_norm(query_states) + query_states = apply_rotary_pos_emb(query_states, cos, sin, unsqueeze_dim=2) + query_states = query_states.transpose(1, 2) + + # For layers with shared KV (from kv sharing point onwards), we reuse the same keys/values states as the last non-sharing layer + if self.is_kv_shared_layer and past_key_values is not None: + key_states, value_states = past_key_values.shared_layers[self.kv_shared_layer_index] + # Device of past layer may be different from current one + key_states = key_states.to(query_states.device) + value_states = value_states.to(query_states.device) + else: + key_states = self.k_proj(hidden_states).view(hidden_shape) + key_states = self.k_norm(key_states) + key_states = apply_rotary_pos_emb(key_states, cos, sin, unsqueeze_dim=2) + key_states = key_states.transpose(1, 2) + + value_states = 
self.v_proj(hidden_states).view(hidden_shape) + value_states = self.v_norm(value_states) + value_states = value_states.transpose(1, 2) + + if past_key_values is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = { + "sin": sin, + "cos": cos, + "cache_position": cache_position, + "sliding_window": self.sliding_window, + } + if not self.is_kv_shared_layer: + key_states, value_states = past_key_values.update( + key_states, value_states, self.layer_idx, cache_kwargs + ) + if self.store_full_length_kv: + if not hasattr(past_key_values, "shared_layers"): + past_key_values.shared_layers = {} + past_key_values.shared_layers[self.layer_idx] = key_states, value_states + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=self.attention_dropout if self.training else 0.0, + scaling=1.0, + sliding_window=self.sliding_window, + **kwargs, + ) + + attn_output = attn_output.reshape(*input_shape, -1).contiguous() + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights + + +class Gemma3nTextDecoderLayer(Gemma3DecoderLayer): + def __init__(self, config: Gemma3nTextConfig, layer_idx: int): + super().__init__(config, layer_idx) + self.mlp = Gemma3nTextMLP(config, layer_idx=layer_idx) + + self.hidden_size_per_layer_input = config.hidden_size_per_layer_input + self.act_fn = ACT2FN[config.hidden_activation] + + self.altup = Gemma3nTextAltUp(config) + self.laurel = Gemma3nTextLaurelBlock(config) + self.self_attn = Gemma3nTextAttention(config, layer_idx) + self.per_layer_input_gate = nn.Linear(self.hidden_size, self.hidden_size_per_layer_input, bias=False) + self.per_layer_projection = nn.Linear(self.hidden_size_per_layer_input, self.hidden_size, bias=False) + self.post_per_layer_input_norm = Gemma3nRMSNorm(self.hidden_size, eps=config.rms_norm_eps) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings_global: torch.Tensor, + position_embeddings_local: torch.Tensor, + per_layer_input: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, + ) -> tuple[torch.Tensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: + predictions = self.altup.predict(hidden_states) + active_prediction = predictions[self.config.altup_active_idx] + + active_prediction_normed = self.input_layernorm(active_prediction) + laurel_output = self.laurel(active_prediction_normed) + + # apply global RoPE to non-sliding layer only + if self.self_attn.is_sliding: + position_embeddings = position_embeddings_local + else: + position_embeddings = position_embeddings_global + + attn, self_attn_weights = self.self_attn( + hidden_states=active_prediction_normed, + position_embeddings=position_embeddings, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + attn = 
self.post_attention_layernorm(attn) + + attn_gated = active_prediction + attn + attn_laurel = (attn_gated + laurel_output) / math.sqrt(2) + + attn_norm = self.pre_feedforward_layernorm(attn_laurel) + attn_ffw = self.mlp(attn_norm) + attn_ffw_norm = self.post_feedforward_layernorm(attn_ffw) + attn_ffw_laurel_gated = attn_laurel + attn_ffw_norm + corrected_predictions = self.altup.correct(predictions, attn_ffw_laurel_gated) + + first_prediction = corrected_predictions[self.config.altup_active_idx].clone() + if self.config.altup_correct_scale: + first_prediction = self.altup.scale_corrected_output(first_prediction) + + # per_layer_input_gate adapted from jax.numpy.einsum("btd,dp->btp", ...) + first_prediction = self.per_layer_input_gate(first_prediction) + first_prediction = self.act_fn(first_prediction) + first_prediction = torch.multiply(first_prediction, per_layer_input) + + # per_layer_projection adapted from jax.numpy.einsum("btp,pd->btd", ...) + first_prediction = self.per_layer_projection(first_prediction) + first_prediction = self.post_per_layer_input_norm(first_prediction) + corrected_predictions[1:] += first_prediction + + outputs = (corrected_predictions,) + + if output_attentions: + outputs += (self_attn_weights,) + + return outputs + + +class Gemma3nPreTrainedModel(Gemma2PreTrainedModel): + config: Gemma3nConfig + base_model_prefix = "" + _no_split_modules = ["Gemma3nTextDecoderLayer"] + + def _init_weights(self, module): + PreTrainedModel._init_weights(self, module) + if isinstance(module, Gemma3nAudioCumulativeGroupNorm): + module.weight.data.fill_(1.0) + elif isinstance(module, Gemma3nAudioAttention): + module.per_dim_scale.data.zero_() + elif isinstance(module, Gemma3nTextAltUp): + module.correct_output_scale.data.zero_() + + +@auto_docstring(custom_intro="The base Gemma 3n language model without a language modeling head.") +class Gemma3nTextModel(Gemma3TextModel): + config: Gemma3nTextConfig + + def __init__(self, config: Gemma3nTextConfig): + super().__init__(config) + + self.hidden_size = config.hidden_size + self.hidden_size_per_layer_input = config.hidden_size_per_layer_input + + self.embed_tokens_per_layer = Gemma3nTextScaledWordEmbedding( + config.vocab_size_per_layer_input, + config.num_hidden_layers * config.hidden_size_per_layer_input, + self.padding_idx, + embed_scale=config.hidden_size_per_layer_input**0.5, + ) + + self.per_layer_model_projection = nn.Linear( + self.hidden_size, + config.num_hidden_layers * config.hidden_size_per_layer_input, + bias=False, + ) + + self.per_layer_projection_norm = Gemma3nRMSNorm(config.hidden_size_per_layer_input, eps=config.rms_norm_eps) + self.layers = nn.ModuleList( + [Gemma3nTextDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + + self.norm = Gemma3nRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.altup_projections = nn.ModuleList( + [nn.Linear(self.hidden_size, self.hidden_size, bias=False) for _ in range(1, self.config.altup_num_inputs)] + ) + + self.altup_unembed_projections = nn.ModuleList( + [nn.Linear(self.hidden_size, self.hidden_size, bias=False) for _ in range(1, self.config.altup_num_inputs)] + ) + + self.register_buffer("per_layer_projection_scale", torch.tensor(self.hidden_size**-0.5), persistent=False) + self.register_buffer("per_layer_input_scale", torch.rsqrt(torch.tensor(2.0)), persistent=False) + self.rotary_emb = Gemma3nTextRotaryEmbedding(config=config) + + # TODO (raushan): Fix this after RoPE refactor. 
For now we hack it by + # reassigning thetas when we want to create a local RoPE layer. Config + # defaults should hold values for global RoPE. + config = copy.deepcopy(config) + config.rope_theta = config.rope_local_base_freq + config.rope_scaling = {"rope_type": "default"} + self.rotary_emb_local = Gemma3nTextRotaryEmbedding(config=config) + + def get_per_layer_inputs(self, input_ids: torch.LongTensor) -> torch.Tensor: + return self.embed_tokens_per_layer(input_ids).reshape( + *input_ids.shape, + self.config.num_hidden_layers, + self.hidden_size_per_layer_input, + ) + + def project_per_layer_inputs( + self, + inputs_embeds: torch.Tensor, + per_layer_inputs: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + per_layer_projection: torch.Tensor = self.per_layer_model_projection(inputs_embeds) + per_layer_projection *= self.per_layer_projection_scale.to( + dtype=inputs_embeds.dtype, device=per_layer_projection.device + ) + per_layer_projection = per_layer_projection.reshape( + *inputs_embeds.shape[:-1], + self.config.num_hidden_layers, + self.hidden_size_per_layer_input, + ) + per_layer_projection = self.per_layer_projection_norm(per_layer_projection) + + if per_layer_inputs is None: + return per_layer_projection + + if per_layer_projection.shape != per_layer_inputs.shape: + # per-layer inputs are sometimes padded with zeros, slice the relevant embeddings. + per_layer_inputs = per_layer_inputs[..., : self.config.num_hidden_layers, :] + + return (per_layer_projection + per_layer_inputs) * self.per_layer_input_scale.to( + dtype=inputs_embeds.dtype, device=per_layer_projection.device + ) + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + per_layer_inputs: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> BaseModelOutputWithPast: + r""" + per_layer_inputs (torch.Tensor, *optional*, defaults to None): + Pre-computed per-layer embeddings. If None, they are derived from input_ids if provided. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if self.gradient_checkpointing and self.training and use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`." 
+ ) + use_cache = False + + if input_ids is not None: + inputs_embeds = self.embed_tokens(input_ids) + per_layer_inputs = self.get_per_layer_inputs(input_ids) + + per_layer_inputs = self.project_per_layer_inputs(inputs_embeds, per_layer_inputs) + + if use_cache and past_key_values is None and not self.training: + past_key_values = DynamicCache(config=self.config) + + if cache_position is None: + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + cache_position = torch.arange( + past_seen_tokens, + past_seen_tokens + inputs_embeds.shape[1], + device=inputs_embeds.device, + ) + + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + # It may already have been prepared by e.g. `generate` + if not isinstance(causal_mask_mapping := attention_mask, dict): + # Prepare mask arguments + mask_kwargs = { + "config": self.config, + "input_embeds": inputs_embeds, + "attention_mask": attention_mask, + "cache_position": cache_position, + "past_key_values": past_key_values, + "position_ids": position_ids, + } + # Create the masks + causal_mask_mapping = { + "full_attention": create_causal_mask(**mask_kwargs), + "sliding_attention": create_sliding_window_causal_mask(**mask_kwargs), + } + + # embed positions + hidden_states_0 = inputs_embeds + + # Initialize RoPE embeddings + position_embeddings_global = self.rotary_emb(hidden_states_0, position_ids) + position_embeddings_local = self.rotary_emb_local(hidden_states_0, position_ids) + + # Expand hidden_states to support per-layer inputs + target_magnitude = torch.mean(hidden_states_0**2, dim=-1, keepdim=True) ** 0.5 + epsilon_tensor = torch.tensor(1e-5) + + temp_hidden_states = [hidden_states_0] + for i in range(1, self.config.altup_num_inputs): + # altup_proj adapted from jax.numpy.einsum("btp,pd->btd", ...) 
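+ # Each extra AltUp stream is a linear projection of the base embedding stream whose
+ # RMS is then rescaled to match target_magnitude (the RMS of stream 0);
+ # epsilon_tensor guards the division against all-zero projections.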
+ altup_proj = self.altup_projections[i - 1](hidden_states_0) + current_hidden_state = altup_proj.to(dtype=hidden_states_0.dtype, device=target_magnitude.device) + new_magnitude = torch.mean(current_hidden_state**2, dim=-1, keepdim=True) + new_magnitude = torch.sqrt(torch.maximum(new_magnitude, epsilon_tensor.to(target_magnitude.device))) + current_hidden_state = current_hidden_state * target_magnitude / new_magnitude + temp_hidden_states.append(current_hidden_state) + + hidden_states = torch.stack(temp_hidden_states, dim=0) # [num_altup_inputs, batch, seq_len, hidden_size] + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + + for decoder_layer in self.layers[: self.config.num_hidden_layers]: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + causal_mask = causal_mask_mapping[decoder_layer.attention_type] + per_layer_input = per_layer_inputs[:, :, decoder_layer.layer_idx, :] + + layer_outputs = decoder_layer( + hidden_states, + position_embeddings_global, + position_embeddings_local, + per_layer_input, + attention_mask=causal_mask, + position_ids=position_ids, + past_key_values=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + # add hidden states from the last decoder layer (but before reprojecting to stay consistent with layer output) + if output_hidden_states: + all_hidden_states += (hidden_states,) + + # Per-layer inputs to single output + target_magnitude = torch.mean(hidden_states[0] ** 2, dim=-1, keepdim=True) ** 0.5 + temp_hidden_states = [hidden_states[0]] + for i in range(1, self.config.altup_num_inputs): + # altup_unembed_projections adapted from jax.numpy.einsum("btp,pd->btd", ...) 
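+ # Mirror of the expansion above: each side stream is mapped back through its
+ # unembed projection and RMS-matched to stream 0 before all streams are stacked
+ # and averaged into a single hidden state below.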
+ altup_unemb_proj: torch.Tensor = self.altup_unembed_projections[i - 1](hidden_states[i]) + current_hidden_state = altup_unemb_proj.to(dtype=hidden_states_0.dtype, device=target_magnitude.device) + new_magnitude = torch.mean(current_hidden_state**2, dim=-1, keepdim=True) + new_magnitude = torch.sqrt(torch.maximum(new_magnitude, epsilon_tensor.to(target_magnitude.device))) + current_hidden_state = current_hidden_state * target_magnitude / new_magnitude + temp_hidden_states.append(current_hidden_state) + + hidden_states = torch.stack(temp_hidden_states) + hidden_states = torch.mean(hidden_states, dim=0) + hidden_states = self.norm(hidden_states) + + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=past_key_values, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +@auto_docstring(custom_intro="The base Gemma 3n language model with a language modeling head.") +class Gemma3nForCausalLM(Gemma3ForCausalLM): + _checkpoint_conversion_mapping = {"model.language_model": "model"} + base_model_prefix = "model" + + +class Gemma3nMultimodalEmbedder(nn.Module): + """Embeds token ids or soft tokens for multimodal content into language model space.""" + + def __init__( + self, + multimodal_config: Union[Gemma3nAudioConfig, Gemma3nVisionConfig], + text_config: Gemma3nTextConfig, + ): + super().__init__() + + self.multimodal_hidden_size = multimodal_config.hidden_size + self.eps = multimodal_config.rms_norm_eps + self.vocab_offset = multimodal_config.vocab_offset + self.vocab_size = multimodal_config.vocab_size + self.text_hidden_size = text_config.hidden_size + + self.embedding = nn.Embedding(self.vocab_size, self.multimodal_hidden_size) + self.hard_embedding_norm = Gemma3nRMSNorm(self.multimodal_hidden_size, eps=self.eps) + self.soft_embedding_norm = Gemma3nRMSNorm(self.multimodal_hidden_size, eps=self.eps) + self.embedding_projection = nn.Linear(self.multimodal_hidden_size, self.text_hidden_size, bias=False) + self.embedding_post_projection_norm = Gemma3nRMSNorm(self.text_hidden_size, eps=self.eps, with_scale=False) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + """Embeds token ids or soft tokens for multimodal content into language model space. + + Args: + input_ids: A torch.LongTensor containing the token ids to embed. Values should be in the range + `[vocab_offset, vocab_offset + vocab_size)`. + inputs_embeds: A torch.Tensor containing the soft tokens to embed. + + Returns: + A torch.Tensor of embeddings with shape `[batch_size, seq_len, self.config.text_config.hidden_size]`. + """ + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if inputs_embeds is not None: + emb_norm = self.soft_embedding_norm(inputs_embeds) + else: + hard_emb = self.embedding(input_ids - self.vocab_offset) + emb_norm = self.hard_embedding_norm(hard_emb) + + emb_norm_proj = self.embedding_projection(emb_norm) + return self.embedding_post_projection_norm(emb_norm_proj) + + +@auto_docstring( + custom_intro=""" + The base Gemma 3n model comprising a vision backbone, an audio backbone, and a language model without a + language modeling head. 
+ """ +) +class Gemma3nModel(PaliGemmaModel): + _checkpoint_conversion_mapping = {} + + def __init__(self, config: Gemma3nConfig): + super().__init__(config) + del self.multi_modal_projector # Replaced by Gemma3nVisionEmbedder + del self.text_config_dtype + self.vocab_size_per_layer_input = config.text_config.vocab_size_per_layer_input + self.audio_tower = AutoModel.from_config(config.audio_config) + self.embed_vision = Gemma3nMultimodalEmbedder(config.vision_config, config.text_config) + self.embed_audio = Gemma3nMultimodalEmbedder(config.audio_config, config.text_config) + + def get_image_features(self, pixel_values: torch.Tensor) -> torch.Tensor: + """ + Projects the last hidden state from the vision model into language model space. + + Args: + pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`) + The tensors corresponding to the input images. + + Returns: + image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`). + """ + vision_outputs = self.vision_tower( + pixel_values=pixel_values, do_pooling=False, return_dict=True + ).last_hidden_state + # Convert from (batch, channels, height, width) to (batch, height * width, channels) where: + # height == width and height * width == Gemma3nConfig.vision_soft_tokens_per_image. + vision_outputs = vision_outputs.reshape( + vision_outputs.shape[0], + self.config.vision_config.hidden_size, + self.config.vision_soft_tokens_per_image, + ).permute(0, 2, 1) + # Normalize and embed the soft tokens into language model space. + vision_outputs *= self.config.vision_config.hidden_size**0.5 + return self.embed_vision(inputs_embeds=vision_outputs) + + def get_placeholder_mask( + self, + input_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + image_features: Optional[torch.FloatTensor] = None, + audio_features: Optional[torch.FloatTensor] = None, + ): + """ + Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is + equal to the length of multimodal features. If the lengths are different, an error is raised. 
+ """ + if input_ids is None: + special_image_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + special_image_mask = special_image_mask.all(-1) + special_audio_mask = ( + inputs_embeds + == self.get_input_embeddings()( + torch.tensor(self.config.audio_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + ).all(-1) + else: + special_image_mask = input_ids == self.config.image_token_id + special_audio_mask = input_ids == self.config.audio_token_id + + n_image_tokens = special_image_mask.sum() + special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + if image_features is not None and inputs_embeds[special_image_mask].numel() != image_features.numel(): + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {image_features.shape[0] * image_features.shape[1]}" + ) + + n_audio_tokens = special_audio_mask.sum() + special_audio_mask = special_audio_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + if audio_features is not None and inputs_embeds[special_audio_mask].numel() != audio_features.numel(): + raise ValueError( + f"Audio features and image tokens do not match: tokens: {n_audio_tokens}, features {audio_features.shape[0] * audio_features.shape[1]}" + ) + + return special_image_mask, special_audio_mask + + @can_return_tuple + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, # text inputs + pixel_values: Optional[torch.FloatTensor] = None, # vision inputs + input_features: Optional[torch.FloatTensor] = None, # audio inputs + attention_mask: Optional[torch.Tensor] = None, + input_features_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + token_type_ids: Optional[torch.LongTensor] = None, + cache_position: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + **lm_kwargs, + ) -> Gemma3nCausalLMOutputWithPast: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`. + + Example: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, Gemma3nForConditionalGeneration + + >>> model = Gemma3nForConditionalGeneration.from_pretrained("google/gemma3n2-3b-mix-224") + >>> processor = AutoProcessor.from_pretrained("google/gemma3n2-3b-mix-224") + + >>> prompt = "Where is the cat standing?" 
+ >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, text=prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(**inputs,) + >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Where is the cat standing?\nsnow" + ``` + """ + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + if input_ids is not None: + inputs_embeds = self.get_input_embeddings()(input_ids) + + # Prepare per-layer inputs from inputs_ids + per_layer_inputs_mask = torch.logical_and(input_ids >= 0, input_ids < self.vocab_size_per_layer_input) + per_layer_inputs_tokens = torch.where(per_layer_inputs_mask, input_ids, torch.zeros_like(input_ids)) + per_layer_inputs = self.language_model.get_per_layer_inputs(per_layer_inputs_tokens) + + # Handle vision tokens (>= embed_vision.vocab_offset and < embed_audio.vocab_offset) + vision_mask = torch.logical_and( + input_ids >= self.embed_vision.vocab_offset, input_ids < self.embed_audio.vocab_offset + ) + dummy_vision_token_id = self.embed_vision.vocab_offset + self.embed_vision.vocab_size - 1 + vision_input_ids = torch.where(vision_mask, input_ids, dummy_vision_token_id).to(inputs_embeds.device) + vision_embeds = self.embed_vision(input_ids=vision_input_ids) + expanded_vision_mask = vision_mask.unsqueeze(-1).expand_as(inputs_embeds) + inputs_embeds = torch.where(expanded_vision_mask, vision_embeds, inputs_embeds) + + # Handle audio tokens (>= embed_audio.vocab_offset) + audio_mask = input_ids >= self.embed_audio.vocab_offset + dummy_audio_token_id = self.embed_audio.vocab_offset + self.embed_audio.vocab_size - 1 + audio_input_ids = torch.where(audio_mask, input_ids, dummy_audio_token_id).to(inputs_embeds.device) + audio_embeds = self.embed_audio(input_ids=audio_input_ids) + expanded_audio_mask = audio_mask.unsqueeze(-1).expand_as(inputs_embeds) + inputs_embeds = torch.where(expanded_audio_mask, audio_embeds, inputs_embeds) + else: + per_layer_inputs = None + + # Merge text and images + if pixel_values is not None: + image_features = self.get_image_features(pixel_values) + image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) + special_image_mask, _ = self.get_placeholder_mask( + input_ids, inputs_embeds=inputs_embeds, image_features=image_features + ) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) + + # Merge text and audio + if input_features is not None and input_features_mask is not None: + audio_features, audio_mask = self.get_audio_features(input_features, ~input_features_mask) + + # The Gemma3nProcessor expects all audio will be 30s in length and inserts 188 audio soft tokens into the + # text to account for this. However, the audio preprocessing and encoder do not gurarantee they will + # produce 188 soft tokens; they will produce at most that many tokens, but they may produce fewer tokens + # depending on the length of the longest audio input in the batch. 
When we encounter this situation, we pad + # the audio feature out to 188 soft tokens with the emebedding of the last token in the embed_audio vocab. + audio_padding_toks = torch.tensor([[self.vocab_size - 1]], dtype=torch.long, device=audio_features.device) + audio_padding_embs = self.embed_audio(input_ids=audio_padding_toks) + audio_features = torch.where(audio_mask.unsqueeze(-1), audio_padding_embs, audio_features) + + audio_batch_size, audio_seq_len, audio_embed_dim = audio_features.shape + extra_padding_tokens = self.config.audio_soft_tokens_per_image - audio_seq_len + extra_padding_features = audio_padding_embs.expand(audio_batch_size, extra_padding_tokens, audio_embed_dim) + + audio_features = torch.cat((audio_features, extra_padding_features), dim=1) + audio_features = audio_features.to(inputs_embeds.device, inputs_embeds.dtype) + _, special_audio_mask = self.get_placeholder_mask( + input_ids, inputs_embeds=inputs_embeds, audio_features=audio_features + ) + inputs_embeds = inputs_embeds.masked_scatter(special_audio_mask, audio_features) + + outputs = self.language_model( + input_ids=None, + per_layer_inputs=per_layer_inputs, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + cache_position=cache_position, + **lm_kwargs, + ) + + return Gemma3nModelOutputWithPast( + last_hidden_state=outputs.last_hidden_state, + past_key_values=outputs.past_key_values if use_cache else None, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + image_hidden_states=image_features if pixel_values is not None else None, + audio_hidden_states=audio_features if input_features is not None else None, + ) + + def get_audio_features( + self, input_features: torch.Tensor, input_features_mask: torch.Tensor + ) -> tuple[torch.Tensor, torch.Tensor]: + """ + Projects the last hidden state from the audio encoder into language model space. + + Args: + input_features (`torch.FloatTensor]` of shape `(num_images, seq_length, num_features)`): + The tensors corresponding to the input audio. + input_features_mask (`torch.FloatTensor]` of shape `(num_images, seq_length)`): + The attention mask for the input audio. + + Returns: + audio_features (`torch.Tensor`): Audio feature tensor of shape `(num_images, audio_length, embed_dim)`). + """ + audio_outputs, audio_mask = self.audio_tower(input_features, input_features_mask) + return self.embed_audio(inputs_embeds=audio_outputs), audio_mask + + def _update_causal_mask(self, **super_kwargs): + raise AttributeError("We don't want to inherit it") + + +@auto_docstring( + custom_intro=""" + The base Gemma 3n model comprising a vision backbone, an audio backbone, a language model, and a language modeling + head. 
+ """ +) +class Gemma3nForConditionalGeneration(PaliGemmaForConditionalGeneration): + _checkpoint_conversion_mapping = {} + base_model_prefix = "model" + + @property + def audio_tower(self): + return self.model.audio_tower + + @property + def multi_modal_projector(self): + raise AttributeError("Use embed_vision instead of multi_modal_projector.") + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, # text inputs + pixel_values: Optional[torch.FloatTensor] = None, # vision inputs + input_features: Optional[torch.FloatTensor] = None, # audio inputs + attention_mask: Optional[torch.Tensor] = None, + input_features_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + token_type_ids: Optional[torch.LongTensor] = None, + cache_position: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **lm_kwargs, + ) -> Gemma3nCausalLMOutputWithPast: + r""" + input_features_mask (torch.Tensor, *optional*, defaults to None): + The attention mask for the input audio. + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are + ignored (masked), the loss is only computed for the tokens with labels in + `[0, ..., config.text_config.vocab_size]`. + + Example: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration + + >>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma-3-4b-it") + >>> processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it") + + >>> messages = [ + ... { + ... "role": "system", + ... "content": [ + ... {"type": "text", "text": "You are a helpful assistant."} + ... ] + ... }, + ... { + ... "role": "user", "content": [ + ... {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"}, + ... {"type": "text", "text": "Where is the cat standing?"}, + ... ] + ... }, + ... ] + + >>> inputs = processor.apply_chat_template( + ... messages, + ... tokenizer=True, + ... return_dict=True, + ... return_tensors="pt", + ... add_generation_prompt=True + ... ) + >>> # Generate + >>> generate_ids = model.generate(**inputs) + >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "user\nYou are a helpful assistant.\n\n\n\n\n\nWhere is the cat standing?\nmodel\nBased on the image, the cat is standing in a snowy area, likely outdoors. 
It appears to" + ``` + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + outputs = self.model( + input_ids=input_ids, + pixel_values=pixel_values, + input_features=input_features, + attention_mask=attention_mask, + input_features_mask=input_features_mask, + position_ids=position_ids, + past_key_values=past_key_values, + token_type_ids=token_type_ids, + cache_position=cache_position, + inputs_embeds=inputs_embeds, + labels=labels, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + **lm_kwargs, + ) + + hidden_states = outputs.last_hidden_state + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) + if (final_logit_softcapping := self.config.get_text_config().final_logit_softcapping) is not None: + logits = logits / final_logit_softcapping + logits = torch.tanh(logits) + logits = logits * final_logit_softcapping + + loss = None + if labels is not None: + # Upcast to float if we need to compute the loss to avoid potential precision issues + logits = logits.float() + shift_logits = logits[..., :-1, :] + shift_labels = labels[..., 1:] + if attention_mask is not None: + # we use the input attention mask to shift the logits and labels, because it is 2D. + # we also crop attn mask in case it is longer, which happens in PrefixTuning with peft + shift_attention_mask = attention_mask[:, -shift_logits.shape[1] :].to(logits.device) + shift_logits = shift_logits[shift_attention_mask.to(logits.device) != 0].contiguous() + shift_labels = shift_labels[shift_attention_mask.to(shift_labels.device) != 0].contiguous() + else: + shift_logits = shift_logits.contiguous() + shift_labels = shift_labels.contiguous() + # Flatten the tokens + loss_fct = nn.CrossEntropyLoss() + + flat_logits = shift_logits.view(-1, self.config.text_config.vocab_size) + flat_labels = shift_labels.view(-1).to(shift_logits.device) + loss = loss_fct(flat_logits, flat_labels) + + return Gemma3nCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + image_hidden_states=outputs.image_hidden_states, + audio_hidden_states=outputs.audio_hidden_states, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + inputs_embeds=None, + cache_position=None, + position_ids=None, + pixel_values=None, + input_features=None, + attention_mask=None, + input_features_mask=None, + token_type_ids=None, + use_cache=True, + logits_to_keep=None, + labels=None, + **kwargs, + ): + # Overwritten -- custom `position_ids` and `pixel_values` handling + model_inputs = super().prepare_inputs_for_generation( + input_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + position_ids=position_ids, + cache_position=cache_position, + use_cache=use_cache, + logits_to_keep=logits_to_keep, + token_type_ids=token_type_ids, + **kwargs, + ) + + # If we're in cached decoding stage, multimodal inputs should be None because input ids do not contain special + # tokens anymore. 
Otherwise multimodal inputs should be passed to model. + # NOTE: use_cache=False always needs pixel_values, input_features, and input_features_mask + if cache_position[0] == 0: + model_inputs["pixel_values"] = pixel_values + model_inputs["input_features"] = input_features + model_inputs["input_features_mask"] = input_features_mask + + return model_inputs + + def _prepare_4d_causal_attention_mask_with_cache_position(self, **super_kwargs): + raise AttributeError("Do not inherit _prepare_4d_causal_attention_mask_with_cache_position from PaliGemma") + + +__all__ = [ + "Gemma3nAudioConfig", + "Gemma3nAudioEncoder", + "Gemma3nConfig", + "Gemma3nForCausalLM", + "Gemma3nForConditionalGeneration", + "Gemma3nModel", + "Gemma3nPreTrainedModel", + "Gemma3nTextConfig", + "Gemma3nTextModel", + "Gemma3nVisionConfig", +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/gemma3n/processing_gemma3n.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/gemma3n/processing_gemma3n.py new file mode 100644 index 0000000000000000000000000000000000000000..e2c2c3ae10f8ab79f2c18d95028f57c28f6f9150 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/gemma3n/processing_gemma3n.py @@ -0,0 +1,165 @@ +# coding=utf-8 +# Copyright 2025 Google Inc. HuggingFace Inc. team. All rights reserved. +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Optional, Union + +import numpy as np + +from ...feature_extraction_utils import BatchFeature +from ...image_utils import ImageInput, make_nested_list_of_images +from ...processing_utils import AudioKwargs, ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack +from ...tokenization_utils_base import PreTokenizedInput, TextInput + + +class Gemma3nImagesKwargs(ImagesKwargs): + do_convert_rgb: Optional[bool] + + +class Gemma3nProcessorKwargs(ProcessingKwargs, total=False): + audio_kwargs: AudioKwargs + images_kwargs: Gemma3nImagesKwargs + _defaults = { + "text_kwargs": { + "padding": False, + }, + } + + +class Gemma3nProcessor(ProcessorMixin): + """ + A processor for Gemma 3n, wrapping the full capabilities of a feature extractor, image processor, and tokenizer + into a single processor. + + Args: + feature_extractor (`Gemma3nAudioFeatureExtractor`): + Feature extractor that converts raw audio waveforms into MEL spectrograms for the audio encoder. This + should return a `BatchFeature` with `input_features` and `input_features_mask` features. + image_processor (`SiglipImageProcessorFast`): + Image processor that prepares batches of images for the vision encoder. This should return a `BatchFeature` + with a `pixel_values` feature. + tokenizer (`GemmaTokenizerFast`): + The text tokenizer for the model. + chat_template (`string`, *optional*): + A Jinja template for generating text prompts from a set of messages. 
+ audio_seq_length (int, *optional*, defaults to 188): + The number of audio soft tokens that will be added to the text prompt + image_seq_length (int, *optional*, defaults to 256): + The number of image soft tokens that should be added to + """ + + attributes = ["feature_extractor", "image_processor", "tokenizer"] + feature_extractor_class = "AutoFeatureExtractor" + image_processor_class = "AutoImageProcessor" + tokenizer_class = "AutoTokenizer" + + def __init__( + self, + feature_extractor, + image_processor, + tokenizer, + chat_template=None, + audio_seq_length: int = 188, + image_seq_length: int = 256, + **kwargs, + ): + self.audio_seq_length = audio_seq_length + self.audio_token_id = tokenizer.audio_token_id + self.boa_token = tokenizer.boa_token + self.audio_token = tokenizer.audio_token + audio_tokens_expanded = "".join([tokenizer.audio_token] * audio_seq_length) + self.full_audio_sequence = f"\n\n{tokenizer.boa_token}{audio_tokens_expanded}{tokenizer.eoa_token}\n\n" + + self.image_seq_length = image_seq_length + self.image_token_id = tokenizer.image_token_id + self.boi_token = tokenizer.boi_token + self.image_token = tokenizer.image_token + image_tokens_expanded = "".join([tokenizer.image_token] * image_seq_length) + self.full_image_sequence = f"\n\n{tokenizer.boi_token}{image_tokens_expanded}{tokenizer.eoi_token}\n\n" + + super().__init__( + feature_extractor=feature_extractor, + image_processor=image_processor, + tokenizer=tokenizer, + chat_template=chat_template, + **kwargs, + ) + + def __call__( + self, + images: Optional[ImageInput] = None, + text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, + audio: Optional[Union[np.ndarray, list[float], list[np.ndarray], list[list[float]]]] = None, + videos=None, + **kwargs: Unpack[Gemma3nProcessorKwargs], + ) -> BatchFeature: + if text is None and images is None and audio is None: + raise ValueError("Provide at least one of `text`, `images`, or `audio`.") + + output_kwargs = self._merge_kwargs( + Gemma3nProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + + if isinstance(text, str): + text = [text] + elif not isinstance(text, list) and not isinstance(text[0], str): + raise ValueError("Invalid input text. Please provide a string, or a list of strings") + + if audio is not None: + audio_inputs = self.feature_extractor(audio, **output_kwargs["audio_kwargs"]) + + if not text: + text = [self.audio_token for _ in audio] + + # Expand placeholder audio tokens to the full audio token sequence + text = [prompt.replace(self.audio_token, self.full_audio_sequence) for prompt in text] + else: + audio_inputs = {} + + if images is not None: + images = self.image_processor.fetch_images(images) + batched_images = make_nested_list_of_images(images) + image_inputs = self.image_processor(batched_images, **output_kwargs["images_kwargs"]) + + # Create empty text to be replaced with placeholders + if not text: + text = [" ".join([self.image_token] * len(images)) for images in batched_images] + + if len(batched_images) != len(text): + raise ValueError( + f"Received inconsistently sized batches of images ({len(batched_images)}) and text ({len(text)})." 
+ ) + + # Expand placeholder image tokens to the full image token sequence + text = [prompt.replace(self.image_token, self.full_image_sequence) for prompt in text] + else: + image_inputs = {} + + return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) + text_inputs = self.tokenizer(text=text, **output_kwargs["text_kwargs"], return_tensors="np") + self._check_special_mm_tokens(text, text_inputs, modalities=["image"]) + + # Add token type ids manually, as tokenizer can't do arbitrary position token types + array_ids = text_inputs["input_ids"] + token_type_ids = np.zeros_like(array_ids) + token_type_ids[array_ids == self.image_token_id] = 1 + token_type_ids[array_ids == self.audio_token_id] = 3 + text_inputs = {k: v.tolist() for k, v in text_inputs.items()} # in case user requested list inputs + text_inputs["token_type_ids"] = token_type_ids.tolist() + return BatchFeature(data={**text_inputs, **image_inputs, **audio_inputs}, tensor_type=return_tensors) + + +__all__ = ["Gemma3nProcessor"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/git/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/git/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..06e3e86927ab7901f1302a87882c4f841f35865d --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/git/__init__.py @@ -0,0 +1,28 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_git import * + from .modeling_git import * + from .processing_git import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/git/configuration_git.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/git/configuration_git.py new file mode 100644 index 0000000000000000000000000000000000000000..86c85854ff98bed91cc926db1daf48fcdb1ffb0e --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/git/configuration_git.py @@ -0,0 +1,222 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
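As a quick aside before the GIT configuration code below: a minimal, self-contained sketch of the placeholder expansion and token-type-id assignment performed by `Gemma3nProcessor.__call__` above. The token strings, ids, and lengths used here are illustrative assumptions, not values taken from the library.

```python
# Sketch only: mirrors how a single image placeholder in the prompt is expanded into the full
# image-token sequence and how token_type_ids are derived afterwards (image -> 1, audio -> 3).
import numpy as np

IMAGE_TOKEN = "<image_soft_token>"          # assumed placeholder string
BOI, EOI = "<start_of_image>", "<end_of_image>"
IMAGE_SEQ_LENGTH = 256                      # matches the processor's image_seq_length default

full_image_sequence = f"\n\n{BOI}{IMAGE_TOKEN * IMAGE_SEQ_LENGTH}{EOI}\n\n"
prompt = f"{IMAGE_TOKEN} Where is the cat standing?"
expanded = prompt.replace(IMAGE_TOKEN, full_image_sequence)

# Toy ids standing in for the tokenizer output; 262145 is an illustrative image_token_id.
image_token_id = 262145
input_ids = np.array([[image_token_id, 5, 7, image_token_id, 9]])
token_type_ids = np.zeros_like(input_ids)
token_type_ids[input_ids == image_token_id] = 1

print(expanded[:40], token_type_ids.tolist())
```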
+ + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class GitVisionConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`GitVisionModel`]. It is used to instantiate a GIT + vision encoder according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the vision encoder of the GIT + [microsoft/git-base](https://huggingface.co/microsoft/git-base) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 16): + The size (resolution) of each patch. + hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported. + layer_norm_eps (`float`, *optional*, defaults to 1e-5): + The epsilon used by the layer normalization layers. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + + Example: + + ```python + >>> from transformers import GitVisionConfig, GitVisionModel + + >>> # Initializing a GitVisionConfig with microsoft/git-base style configuration + >>> configuration = GitVisionConfig() + + >>> # Initializing a GitVisionModel (with random weights) from the microsoft/git-base style configuration + >>> model = GitVisionModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "git_vision_model" + base_config_key = "vision_config" + + def __init__( + self, + hidden_size=768, + intermediate_size=3072, + num_hidden_layers=12, + num_attention_heads=12, + num_channels=3, + image_size=224, + patch_size=16, + hidden_act="quick_gelu", + layer_norm_eps=1e-5, + attention_dropout=0.0, + initializer_range=0.02, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_channels = num_channels + self.patch_size = patch_size + self.image_size = image_size + self.initializer_range = initializer_range + self.attention_dropout = attention_dropout + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + + +class GitConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`GitModel`]. 
It is used to instantiate a GIT model + according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the GIT + [microsoft/git-base](https://huggingface.co/microsoft/git-base) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vision_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`GitVisionConfig`]. + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the GIT model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`GitModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 6): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 1024): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For + positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to + [Self-Attention with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155). + For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models + with Better Relative Position Embeddings (Huang et al.)](https://huggingface.co/papers/2009.13658). + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). + num_image_with_embedding (`int`, *optional*): + The number of temporal embeddings to add, in case the model is used for video captioning/VQA. 
+ + Examples: + + ```python + >>> from transformers import GitConfig, GitModel + + >>> # Initializing a GIT microsoft/git-base style configuration + >>> configuration = GitConfig() + + >>> # Initializing a model (with random weights) from the microsoft/git-base style configuration + >>> model = GitModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "git" + sub_configs = {"vision_config": GitVisionConfig} + + def __init__( + self, + vision_config=None, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=6, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=1024, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + position_embedding_type="absolute", + use_cache=True, + tie_word_embeddings=False, + bos_token_id=101, + eos_token_id=102, + num_image_with_embedding=None, + **kwargs, + ): + super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, pad_token_id=pad_token_id, **kwargs) + + if vision_config is None: + vision_config = {} + logger.info("vision_config is None. initializing the GitVisionConfig with default values.") + + self.vision_config = GitVisionConfig(**vision_config) + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.position_embedding_type = position_embedding_type + self.use_cache = use_cache + self.tie_word_embeddings = tie_word_embeddings + self.num_image_with_embedding = num_image_with_embedding + + self.bos_token_id = bos_token_id + self.eos_token_id = eos_token_id + + +__all__ = ["GitConfig", "GitVisionConfig"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/git/modeling_git.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/git/modeling_git.py new file mode 100644 index 0000000000000000000000000000000000000000..bc037912c5c52b869099fd4c35ff1bd4ee0fd1c8 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/git/modeling_git.py @@ -0,0 +1,1460 @@ +# coding=utf-8 +# Copyright 2022 Microsoft Research and The HuggingFace Inc. team. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
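Before the GIT modeling code that follows, a small hedged example of how `GitConfig` consumes a nested `vision_config` dictionary: as shown in the `__init__` above, the dict is converted into a `GitVisionConfig` (or defaults are used when it is omitted). The override values here are arbitrary.

```python
from transformers import GitConfig, GitVisionConfig

# Passing a plain dict for vision_config; GitConfig turns it into a GitVisionConfig instance.
config = GitConfig(vision_config={"image_size": 384, "patch_size": 16}, num_hidden_layers=6)
assert isinstance(config.vision_config, GitVisionConfig)
print(config.vision_config.image_size, config.num_hidden_layers)  # 384 6
```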
+"""PyTorch GIT model.""" + +import math +from dataclasses import dataclass +from typing import Callable, Optional, Union + +import torch +from torch import nn + +from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache +from ...generation import GenerationMixin +from ...modeling_attn_mask_utils import _prepare_4d_attention_mask +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPast, + BaseModelOutputWithPooling, + CausalLMOutputWithPast, +) +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import ( + ModelOutput, + auto_docstring, + can_return_tuple, + logging, + torch_int, +) +from ...utils.deprecation import deprecate_kwarg +from .configuration_git import GitConfig, GitVisionConfig + + +logger = logging.get_logger(__name__) + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states. + """ +) +# Copied from transformers.models.clip.modeling_clip.CLIPVisionModelOutput with CLIP->Git +class GitVisionModelOutput(ModelOutput): + r""" + image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): + The image embeddings obtained by applying the projection layer to the pooler_output. + """ + + image_embeds: Optional[torch.FloatTensor] = None + last_hidden_state: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None + attentions: Optional[tuple[torch.FloatTensor, ...]] = None + + +class GitEmbeddings(nn.Module): + """Construct the embeddings from word and position embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + past_key_values_length: int = 0, + ) -> torch.Tensor: + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] + + if inputs_embeds is None: + embeddings = self.word_embeddings(input_ids) + else: + embeddings = inputs_embeds + + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings 
= self.dropout(embeddings) + return embeddings + + +class GitSelfAttention(nn.Module): + def __init__(self, config, position_embedding_type=None, layer_idx=None): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " + "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.image_patch_tokens = int((config.vision_config.image_size / config.vision_config.patch_size) ** 2 + 1) + if config.num_image_with_embedding is not None: + self.image_patch_tokens *= config.num_image_with_embedding + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = position_embedding_type or getattr( + config, "position_embedding_type", "absolute" + ) + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + pixel_values_present: Optional[bool] = False, + ) -> tuple[torch.Tensor]: + batch_size, seq_length, _ = hidden_states.shape + query_layer = ( + self.query(hidden_states) + .view(batch_size, -1, self.num_attention_heads, self.attention_head_size) + .transpose(1, 2) + ) + + cutoff = self.image_patch_tokens if pixel_values_present else 0 + key_layer = ( + self.key(hidden_states) + .view(batch_size, -1, self.num_attention_heads, self.attention_head_size) + .transpose(1, 2) + ) + value_layer = ( + self.value(hidden_states) + .view(batch_size, -1, self.num_attention_heads, self.attention_head_size) + .transpose(1, 2) + ) + if past_key_values is not None: + # NOTE: like in other caches, we store the text component. In GIT it means we discard the image component. + key_layer_past, value_layer_past = past_key_values.update( + key_layer[:, :, cutoff:, :], value_layer[:, :, cutoff:, :], self.layer_idx + ) + key_layer = torch.cat([key_layer[:, :, :cutoff, :], key_layer_past], dim=2) + value_layer = torch.cat([value_layer[:, :, :cutoff, :], value_layer_past], dim=2) + + # Take the dot product between "query" and "key" to get the raw attention scores. 
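+        # query_layer is [batch, num_heads, q_len, head_dim]; key_layer/value_layer include the cached text
+        # tokens (and, when pixel_values are present, the image-patch prefix), so the resulting score matrix
+        # is [batch, num_heads, q_len, k_len] and is scaled by sqrt(head_dim) before the mask and softmax.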
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + query_length, key_length = query_layer.shape[2], key_layer.shape[2] + if past_key_values is not None: + position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view( + -1, 1 + ) + else: + position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in GitModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + return context_layer, attention_probs + + +# Copied from transformers.models.bert.modeling_bert.BertSelfOutput +class GitSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +GIT_SELF_ATTENTION_CLASSES = { + "eager": GitSelfAttention, +} + + +class GitAttention(nn.Module): + def __init__(self, config, position_embedding_type=None, layer_idx=None): + super().__init__() + self.self = GIT_SELF_ATTENTION_CLASSES[config._attn_implementation]( + config, position_embedding_type=position_embedding_type, layer_idx=layer_idx + ) + self.output = GitSelfOutput(config) + self.pruned_heads = set() + + # Copied from transformers.models.bert.modeling_bert.BertAttention.prune_heads + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + pixel_values_present: Optional[bool] = False, + ) -> tuple[torch.Tensor]: + attn_output, self_attn_weights = self.self( + hidden_states, + attention_mask, + head_mask, + past_key_values, + output_attentions, + pixel_values_present, + ) + attention_output = self.output(attn_output, hidden_states) + return attention_output, self_attn_weights + + +# Copied from transformers.models.bert.modeling_bert.BertIntermediate +class GitIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = 
self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertOutput +class GitOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class GitLayer(GradientCheckpointingLayer): + def __init__(self, config, layer_idx=None): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = GitAttention(config, layer_idx=layer_idx) + self.intermediate = GitIntermediate(config) + self.output = GitOutput(config) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + pixel_values_present: Optional[bool] = False, + ) -> tuple[torch.Tensor]: + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + attention_output, self_attention_weights = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_values=past_key_values, + pixel_values_present=pixel_values_present, + ) + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + return layer_output, self_attention_weights + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class GitEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList([GitLayer(config, i) for i in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Union[Cache, tuple[tuple[torch.FloatTensor]]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + pixel_values_present: Optional[bool] = False, + return_dict: Optional[bool] = True, + ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPast]: + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + if use_cache and past_key_values is None: + past_key_values = DynamicCache(config=self.config) + + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + past_key_values, + output_attentions, + pixel_values_present, + ) + + hidden_states = layer_outputs[0] + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + past_key_values, + all_hidden_states, + all_self_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=past_key_values, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +@auto_docstring +class GitPreTrainedModel(PreTrainedModel): + config: GitConfig + base_model_prefix = "git" + supports_gradient_checkpointing = True + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, GitVisionEmbeddings): + nn.init.normal_(module.class_embedding, mean=0.0, std=self.config.initializer_range) + nn.init.normal_(module.patch_embedding.weight, std=self.config.initializer_range) + nn.init.normal_(module.position_embedding.weight, std=self.config.initializer_range) + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +# Copied from transformers.models.clip.modeling_clip.CLIPVisionEmbeddings with CLIP->Git +class GitVisionEmbeddings(nn.Module): + def __init__(self, config: GitVisionConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.class_embedding = nn.Parameter(torch.randn(self.embed_dim)) + + self.patch_embedding = nn.Conv2d( + in_channels=config.num_channels, + out_channels=self.embed_dim, + kernel_size=self.patch_size, + stride=self.patch_size, + bias=False, + ) + + self.num_patches = (self.image_size // self.patch_size) ** 2 + self.num_positions = self.num_patches + 1 + self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) + self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False) + + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution + images. This method is also adapted to support torch.jit tracing. 
+ + Adapted from: + - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and + - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 + """ + + num_patches = embeddings.shape[1] - 1 + position_embedding = self.position_embedding.weight.unsqueeze(0) + num_positions = position_embedding.shape[1] - 1 + + # always interpolate when tracing to ensure the exported model works for dynamic input shapes + if not torch.jit.is_tracing() and num_patches == num_positions and height == width: + return self.position_embedding(self.position_ids) + + class_pos_embed = position_embedding[:, :1] + patch_pos_embed = position_embedding[:, 1:] + + dim = embeddings.shape[-1] + + new_height = height // self.patch_size + new_width = width // self.patch_size + + sqrt_num_positions = torch_int(num_positions**0.5) + patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed, + size=(new_height, new_width), + mode="bicubic", + align_corners=False, + ) + + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + + return torch.cat((class_pos_embed, patch_pos_embed), dim=1) + + def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor: + batch_size, _, height, width = pixel_values.shape + if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size): + raise ValueError( + f"Input image size ({height}*{width}) doesn't match model ({self.image_size}*{self.image_size})." + ) + target_dtype = self.patch_embedding.weight.dtype + patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid] + patch_embeds = patch_embeds.flatten(2).transpose(1, 2) + + class_embeds = self.class_embedding.expand(batch_size, 1, -1) + embeddings = torch.cat([class_embeds, patch_embeds], dim=1) + if interpolate_pos_encoding: + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) + else: + embeddings = embeddings + self.position_embedding(self.position_ids) + return embeddings + + +class GitVisionMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) + self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +# Copied from transformers.models.siglip.modeling_siglip.eager_attention_forward +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + dropout: float = 0.0, + **kwargs, +): + attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling + if attention_mask is not None: + attn_weights = attn_weights + attention_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + + attn_output = torch.matmul(attn_weights, value) + attn_output = 
attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +class GitVisionAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." + ) + self.scale = self.head_dim**-0.5 + self.dropout = config.attention_dropout + self.is_causal = False + + self.k_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.v_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.q_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.out_proj = nn.Linear(self.embed_dim, self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + causal_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + """Input shape: Batch x Time x Channel""" + + batch_size, seq_length, embed_dim = hidden_states.shape + + queries = self.q_proj(hidden_states) + keys = self.k_proj(hidden_states) + values = self.v_proj(hidden_states) + + queries = queries.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2) + keys = keys.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2) + values = values.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2) + # CLIP text model uses both `causal_attention_mask` and `attention_mask` + # in case FA2 kernel is called, `is_causal` should be inferred from `causal_attention_mask` + if self.config._attn_implementation != "flash_attention_2": + if attention_mask is not None and causal_attention_mask is not None: + attention_mask = attention_mask + causal_attention_mask + elif causal_attention_mask is not None: + attention_mask = causal_attention_mask + else: + self.is_causal = causal_attention_mask is not None + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + queries, + keys, + values, + attention_mask, + is_causal=self.is_causal, + scaling=self.scale, + dropout=0.0 if not self.training else self.dropout, + ) + + attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous() + attn_output = self.out_proj(attn_output) + if not output_attentions: + attn_weights = None + return attn_output, attn_weights + + +# Copied from transformers.models.altclip.modeling_altclip.AltCLIPEncoderLayer with AltCLIP->GitVision +class GitVisionEncoderLayer(GradientCheckpointingLayer): + def __init__(self, config: GitVisionConfig): + super().__init__() + self.embed_dim = config.hidden_size + self.self_attn = GitVisionAttention(config) + self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) + self.mlp = GitVisionMLP(config) + self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + causal_attention_mask: torch.Tensor, + output_attentions: Optional[bool] = False, + ) -> tuple[torch.FloatTensor]: + """ + Args: + 
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + `(config.encoder_attention_heads,)`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + causal_attention_mask=causal_attention_mask, + output_attentions=output_attentions, + ) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +# Copied from transformers.models.altclip.modeling_altclip.AltCLIPEncoder with AltCLIP->GitVision, CLIPConfig +class GitVisionEncoder(nn.Module): + """ + Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a + [`GitVisionEncoderLayer`]. + + Args: + config: GitVisionConfig + """ + + def __init__(self, config: GitVisionConfig): + super().__init__() + self.config = config + self.layers = nn.ModuleList([GitVisionEncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + @can_return_tuple + def forward( + self, + inputs_embeds, + attention_mask: Optional[torch.Tensor] = None, + causal_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, BaseModelOutput]: + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Causal mask for the text model. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + hidden_states = inputs_embeds + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + causal_attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +class GitVisionTransformer(nn.Module): + # Copied from transformers.models.altclip.modeling_altclip.AltCLIPVisionTransformer.__init__ with AltCLIPEncoder->GitVisionEncoder, AltCLIP->Git + def __init__(self, config: GitVisionConfig): + super().__init__() + self.config = config + embed_dim = config.hidden_size + + self.embeddings = GitVisionEmbeddings(config) + self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + self.encoder = GitVisionEncoder(config) + self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + + @auto_docstring + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: Optional[bool] = False, + return_dict: Optional[bool] = None, + ) -> Union[tuple, BaseModelOutput]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) + hidden_states = self.pre_layrnorm(hidden_states) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + + last_hidden_state = self.post_layernorm(last_hidden_state) + + if not return_dict: + return (last_hidden_state,) + encoder_outputs[1:] + + return BaseModelOutput( + last_hidden_state=last_hidden_state, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +@auto_docstring( + custom_intro=""" + The vision model from CLIP, used in GIT, without any head or projection on top. 
+ """ +) +class GitVisionModel(GitPreTrainedModel): + config: GitVisionConfig + main_input_name = "pixel_values" + + # Copied from transformers.models.clip.modeling_clip.CLIPVisionModel.__init__ with CLIP->Git + def __init__(self, config: GitVisionConfig): + super().__init__(config) + self.vision_model = GitVisionTransformer(config) + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> nn.Module: + return self.vision_model.embeddings.patch_embedding + + @auto_docstring + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + return_dict: Optional[bool] = None, + ) -> Union[tuple, BaseModelOutput]: + r""" + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, GitVisionModel + + >>> processor = AutoProcessor.from_pretrained("microsoft/git-base") + >>> model = GitVisionModel.from_pretrained("microsoft/git-base") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + return self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, + return_dict=return_dict, + ) + + +class GitProjection(nn.Module): + def __init__(self, config: GitConfig): + super().__init__() + self.config = config + self.visual_projection = nn.Sequential( + nn.Linear(config.vision_config.hidden_size, config.hidden_size), + nn.LayerNorm(config.hidden_size, eps=config.vision_config.layer_norm_eps), + ) + + def forward(self, embeddings: torch.Tensor) -> torch.Tensor: + return self.visual_projection(embeddings) + + +@auto_docstring( + custom_intro=""" + The bare GIT Model transformer consisting of a CLIP image encoder and text decoder outputting raw hidden-states + """ +) +class GitModel(GitPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.config = config + + self.embeddings = GitEmbeddings(config) + self.image_encoder = GitVisionModel(config.vision_config) + self.encoder = GitEncoder(config) + + self.visual_projection = GitProjection(config) + + if config.num_image_with_embedding is not None: + self.img_temporal_embedding = nn.ParameterList( + nn.Parameter(torch.zeros(1, 1, config.vision_config.hidden_size)) + for _ in range(config.num_image_with_embedding) + ) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def _generate_future_mask(self, size: int, dtype: torch.dtype, device: torch.device) -> torch.Tensor: + # Default mask is for forward direction. Flip for backward direction. 
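+        # torch.triu(..., diagonal=1) below marks the strictly upper-triangular entries,
+        # which are then filled with -inf so that, additively, position i can only attend
+        # to positions j <= i. For size=3 the resulting mask is:
+        #   [[0., -inf, -inf],
+        #    [0.,   0., -inf],
+        #    [0.,   0.,   0.]]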
+ mask = torch.triu(torch.ones(size, size, device=device, dtype=dtype), diagonal=1) + mask = mask.masked_fill(mask == 1, float("-inf")) + return mask + + def create_attention_mask(self, tgt, memory, tgt_mask, past_key_values_length, memory_key_padding_mask=None): + num_tgt = tgt.shape[1] + num_memory = memory.shape[1] + device = tgt.device + dtype = tgt.dtype + top_left = torch.zeros((num_memory, num_memory), device=device, dtype=dtype) + top_right = torch.full( + (num_memory, num_tgt + past_key_values_length), + float("-inf"), + device=tgt.device, + dtype=dtype, + ) + bottom_left = torch.zeros( + (num_tgt, num_memory), + dtype=dtype, + device=tgt_mask.device, + ) + + if past_key_values_length > 0: + tgt_mask = torch.zeros( + (tgt_mask.shape[0], tgt_mask.shape[0] + past_key_values_length), + dtype=dtype, + device=tgt_mask.device, + ) + + left = torch.cat((top_left, bottom_left), dim=0) + right = torch.cat((top_right, tgt_mask.to(dtype)), dim=0) + + full_attention_mask = torch.cat((left, right), dim=1)[None, :] + + if memory_key_padding_mask is None: + memory_key_padding_mask = torch.full((memory.shape[0], memory.shape[1]), fill_value=False, device=device) + # if it is False, it means valid. That is, it is not a padding + if memory_key_padding_mask.dtype != torch.bool: + raise ValueError("Memory key padding mask must be a boolean tensor.") + zero_negative_infinity = torch.zeros_like(memory_key_padding_mask, dtype=tgt.dtype) + zero_negative_infinity[memory_key_padding_mask] = float("-inf") + full_attention_mask = full_attention_mask.expand( + (memory_key_padding_mask.shape[0], num_memory + num_tgt, num_memory + past_key_values_length + num_tgt) + ) + full_attention_mask = full_attention_mask.clone() + origin_left = full_attention_mask[:, :, :num_memory] + update = zero_negative_infinity[:, None, :] + full_attention_mask[:, :, :num_memory] = origin_left + update + + # add axis for multi-head + full_attention_mask = full_attention_mask[:, None, :, :] + + return full_attention_mask + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + pixel_values: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPooling]: + r""" + Examples: + + ```python + >>> from transformers import AutoProcessor, AutoModel + >>> import requests + >>> from PIL import Image + + >>> processor = AutoProcessor.from_pretrained("microsoft/git-base") + >>> model = AutoModel.from_pretrained("microsoft/git-base") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> text = "this is an image of two cats" + + >>> inputs = processor(images=image, text=text, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = 
use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + seq_length = input_shape[1] + + # past_key_values_length + past_key_values_length = 0 + if past_key_values is not None: + past_key_values_length = ( + past_key_values.get_seq_length() + if not isinstance(past_key_values, Cache) + else past_key_values.get_seq_length() + ) + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + projected_visual_features = None + if pixel_values is not None: + if pixel_values.ndim == 4: + # here we assume pixel_values is of shape (batch_size, num_channels, height, width) + visual_features = self.image_encoder( + pixel_values, interpolate_pos_encoding=interpolate_pos_encoding + ).last_hidden_state + + elif pixel_values.ndim == 5: + # here we assume pixel_values is of shape (batch_size, num_frames, num_channels, height, width) + visual_features = [] + for frame_idx in range(pixel_values.shape[1]): + visual_features_frame = self.image_encoder( + pixel_values[:, frame_idx, :, :], interpolate_pos_encoding=interpolate_pos_encoding + ).last_hidden_state + visual_features_frame += self.img_temporal_embedding[frame_idx] + visual_features.append(visual_features_frame) + + # finally, concatenate all features along sequence dimension + visual_features = torch.cat(visual_features, dim=1) + + else: + raise ValueError("pixel_values must be of rank 4 or 5") + + projected_visual_features = self.visual_projection(visual_features) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + + if projected_visual_features is None: + projected_visual_features = torch.zeros( + (embedding_output.shape[0], 0, embedding_output.shape[2]), + dtype=embedding_output.dtype, + device=embedding_output.device, + ) + + # Repeat visual features to match embedding batch size. + projected_visual_features = projected_visual_features.repeat( + embedding_output.size(0) // projected_visual_features.size(0), 1, 1 + ) + + # concatenate patch token and text token embeddings + hidden_states = torch.cat((projected_visual_features, embedding_output), dim=1) + + # By default, an additive causal mask is created + # for masking the future (one direction). 
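+        # The combined mask assembled below is block-structured: projected image ("memory")
+        # tokens attend bidirectionally to each other and never to text tokens, while text
+        # tokens attend to every image token and causally (left-to-right) to earlier text
+        # tokens. `create_attention_mask` above builds these blocks from the causal `tgt_mask`.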
+ tgt_mask = self._generate_future_mask(seq_length, embedding_output.dtype, embedding_output.device) + + # Create an attention mask of shape (batch_size, 1, tgt_seq_len, src_seq_len) + combined_attention_mask = self.create_attention_mask( + tgt=embedding_output, + memory=projected_visual_features, + tgt_mask=tgt_mask, + past_key_values_length=past_key_values_length, + ) + + if attention_mask is not None: + # if the user provides an attention mask, we add it to the default one + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + expanded_attn_mask = _prepare_4d_attention_mask( + attention_mask, embedding_output.dtype, tgt_len=input_shape[-1] + ).to(embedding_output.device) + if past_key_values_length > 0: + expanded_attn_mask = expanded_attn_mask[:, :, -past_key_values_length:, :] + else: + combined_attention_mask[:, :, -input_shape[1] :, -input_shape[1] :] += expanded_attn_mask + + encoder_outputs = self.encoder( + hidden_states, + attention_mask=combined_attention_mask, + head_mask=head_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + pixel_values_present=pixel_values is not None, + ) + sequence_output = encoder_outputs[0] + + if not return_dict: + return (sequence_output,) + encoder_outputs[1:] + + return BaseModelOutputWithPast( + last_hidden_state=sequence_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +@auto_docstring( + custom_intro=""" + GIT Model with a `language modeling` head on top for autoregressive language modeling. + """ +) +class GitForCausalLM(GitPreTrainedModel, GenerationMixin): + _tied_weights_keys = ["output.weight"] + + def __init__(self, config): + super().__init__(config) + + self.git = GitModel(config) + self.output = nn.Linear(config.hidden_size, config.vocab_size) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.output + + def set_output_embeddings(self, new_embeddings): + self.output = new_embeddings + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + pixel_values: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + past_key_values: Optional[Union[Cache, list[torch.Tensor]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + return_dict: Optional[bool] = None, + **kwargs, + ) -> Union[tuple[torch.Tensor], CausalLMOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the left-to-right language modeling loss (next word prediction). 
Indices should be in + `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are + ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]` + + Examples: + + Image captioning example: + + ```python + >>> from transformers import AutoProcessor, AutoModelForCausalLM + >>> import requests + >>> from PIL import Image + + >>> processor = AutoProcessor.from_pretrained("microsoft/git-base-coco") + >>> model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-coco") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> pixel_values = processor(images=image, return_tensors="pt").pixel_values + + >>> generated_ids = model.generate(pixel_values=pixel_values, max_length=50) + >>> generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] + >>> print(generated_caption) + two cats sleeping on a pink blanket next to remotes. + ``` + + Visual question answering (VQA) example: + + ```python + >>> from transformers import AutoProcessor, AutoModelForCausalLM + >>> from huggingface_hub import hf_hub_download + >>> from PIL import Image + + >>> processor = AutoProcessor.from_pretrained("microsoft/git-base-textvqa") + >>> model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-textvqa") + + >>> file_path = hf_hub_download(repo_id="nielsr/textvqa-sample", filename="bus.png", repo_type="dataset") + >>> image = Image.open(file_path).convert("RGB") + + >>> pixel_values = processor(images=image, return_tensors="pt").pixel_values + + >>> question = "what does the front of the bus say at the top?" + + >>> input_ids = processor(text=question, add_special_tokens=False).input_ids + >>> input_ids = [processor.tokenizer.cls_token_id] + input_ids + >>> input_ids = torch.tensor(input_ids).unsqueeze(0) + + >>> generated_ids = model.generate(pixel_values=pixel_values, input_ids=input_ids, max_length=50) + >>> print(processor.batch_decode(generated_ids, skip_special_tokens=True)) + ['what does the front of the bus say at the top? special'] + ``` + + Video captioning example: + + ```python + >>> import av + >>> import numpy as np + >>> from PIL import Image + >>> from huggingface_hub import hf_hub_download + >>> from transformers import AutoProcessor, AutoModelForCausalLM + + >>> processor = AutoProcessor.from_pretrained("microsoft/git-base-vatex") + >>> model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-vatex") + + >>> # set seed for reproducibility + >>> np.random.seed(45) + + + >>> def read_video_pyav(container, indices): + ... ''' + ... Decode the video with PyAV decoder. + ... Args: + ... container (`av.container.input.InputContainer`): PyAV container. + ... indices (`list[int]`): List of frame indices to decode. + ... Returns: + ... result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3). + ... ''' + ... frames = [] + ... container.seek(0) + ... start_index = indices[0] + ... end_index = indices[-1] + ... for i, frame in enumerate(container.decode(video=0)): + ... if i > end_index: + ... break + ... if i >= start_index and i in indices: + ... frames.append(frame) + ... return np.stack([x.to_ndarray(format="rgb24") for x in frames]) + + + >>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len): + ... ''' + ... Sample a given number of frame indices from the video. + ... Args: + ... clip_len (`int`): Total number of frames to sample. + ... 
frame_sample_rate (`int`): Sample every n-th frame. + ... seg_len (`int`): Maximum allowed index of sample's last frame. + ... Returns: + ... indices (`list[int]`): List of sampled frame indices + ... ''' + ... converted_len = int(clip_len * frame_sample_rate) + ... end_idx = np.random.randint(converted_len, seg_len) + ... start_idx = end_idx - converted_len + ... indices = np.linspace(start_idx, end_idx, num=clip_len) + ... indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64) + ... return indices + + + >>> # load video + >>> file_path = hf_hub_download( + ... repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset" + ... ) + >>> container = av.open(file_path) + + >>> # sample frames + >>> num_frames = model.config.num_image_with_embedding + >>> indices = sample_frame_indices( + ... clip_len=num_frames, frame_sample_rate=4, seg_len=container.streams.video[0].frames + ... ) + >>> frames = read_video_pyav(container, indices) + + >>> pixel_values = processor(images=list(frames), return_tensors="pt").pixel_values + + >>> generated_ids = model.generate(pixel_values=pixel_values, max_length=50) + + >>> print("Generated caption:", processor.batch_decode(generated_ids, skip_special_tokens=True)) + Generated caption: ['a woman is sitting at a table and she is talking about the food she is holding.'] + ``` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + outputs = self.git( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + pixel_values=pixel_values, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + logits = self.output(sequence_output) + + loss = None + if labels is not None: + # we are doing next-token prediction; shift prediction scores and input ids by one + num_image_tokens = self.git.encoder.layer[0].attention.self.image_patch_tokens + shifted_logits = logits[:, num_image_tokens:-1, :].contiguous() + labels = labels[:, 1:].contiguous() + loss = self.loss_function( + shifted_logits.view(-1, self.config.vocab_size), + labels.view(-1), + vocab_size=self.config.vocab_size, + **kwargs, + ) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, **kwargs + ): + # Overwritten -- `git` has special cache handling and doesn't support generating from `inputs_embeds` atm + + # cut decoder_input_ids if past_key_values is used + if past_key_values is not None: + past_length = past_key_values.get_seq_length() + + # Some generation methods already pass only the last input ID + if input_ids.shape[1] > past_length: + remove_prefix_length = past_length + else: + # Default to old behavior: keep only final ID + remove_prefix_length = input_ids.shape[1] - 1 + + input_ids = input_ids[:, remove_prefix_length:] + + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + input_shape = 
input_ids.shape + if attention_mask is None: + attention_mask = input_ids.new_ones(input_shape) + + model_inputs = { + "input_ids": input_ids, + "attention_mask": attention_mask, + "pixel_values": kwargs.get("pixel_values"), + "past_key_values": past_key_values, + "use_cache": use_cache, + } + + # Forward ALL kwargs that are uninitialized (e.g. `use_cache`). + for key, value in kwargs.items(): + if key not in model_inputs: + model_inputs[key] = value + + return model_inputs + + +__all__ = ["GitForCausalLM", "GitModel", "GitPreTrainedModel", "GitVisionModel"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/git/processing_git.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/git/processing_git.py new file mode 100644 index 0000000000000000000000000000000000000000..20fae1a9f36ba5ac0d0821a862667fa60940d713 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/git/processing_git.py @@ -0,0 +1,45 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Image/Text processor class for GIT +""" + +from ...processing_utils import ProcessorMixin + + +class GitProcessor(ProcessorMixin): + r""" + Constructs a GIT processor which wraps a CLIP image processor and a BERT tokenizer into a single processor. + + [`GitProcessor`] offers all the functionalities of [`CLIPImageProcessor`] and [`BertTokenizerFast`]. See the + [`~GitProcessor.__call__`] and [`~GitProcessor.decode`] for more information. + + Args: + image_processor ([`AutoImageProcessor`]): + The image processor is a required input. + tokenizer ([`AutoTokenizer`]): + The tokenizer is a required input. 
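+
+    Examples:
+
+    A minimal usage sketch (mirroring the image/text call shown in the [`GitModel`]
+    docstring; the checkpoint name and image URL are the same ones used there):
+
+    ```python
+    >>> from PIL import Image
+    >>> import requests
+    >>> from transformers import AutoProcessor
+
+    >>> processor = AutoProcessor.from_pretrained("microsoft/git-base")
+
+    >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+    >>> image = Image.open(requests.get(url, stream=True).raw)
+
+    >>> inputs = processor(images=image, text="this is an image of two cats", return_tensors="pt")
+    ```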
+ """ + + attributes = ["image_processor", "tokenizer"] + image_processor_class = "AutoImageProcessor" + tokenizer_class = "AutoTokenizer" + + def __init__(self, image_processor, tokenizer): + super().__init__(image_processor, tokenizer) + self.current_processor = self.image_processor + + +__all__ = ["GitProcessor"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glm4/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glm4/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b4e9aa5993398278788c77c4304fd9add4dbce53 Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glm4/__pycache__/__init__.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glm4/__pycache__/configuration_glm4.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glm4/__pycache__/configuration_glm4.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f987527046bd4ada6819cce2a06fcd4f0e47b600 Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glm4/__pycache__/configuration_glm4.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glm4/__pycache__/modeling_glm4.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glm4/__pycache__/modeling_glm4.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ef9acd01f69bcb41c37ad7aa48e0ac69cd712d6e Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glm4/__pycache__/modeling_glm4.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glm4/__pycache__/modular_glm4.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glm4/__pycache__/modular_glm4.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b7a829d339052616b3ad442aebd12894fb91a3c1 Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glm4/__pycache__/modular_glm4.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glm4v/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glm4v/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4216c137fbe200d424bf9283b0da2f6e60c2817c --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glm4v/__init__.py @@ -0,0 +1,28 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
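+#
+# Note: like other model packages in transformers, this __init__ uses lazy imports.
+# Under `TYPE_CHECKING` the submodules are imported eagerly so static type checkers
+# and IDEs see the real symbols; at runtime the module object in `sys.modules` is
+# replaced by a `_LazyModule`, which only imports `configuration_glm4v` /
+# `modeling_glm4v` / `processing_glm4v` when one of their attributes is first accessed.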
+from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_glm4v import * + from .modeling_glm4v import * + from .processing_glm4v import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glm4v/configuration_glm4v.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glm4v/configuration_glm4v.py new file mode 100644 index 0000000000000000000000000000000000000000..4c417020fa84666613e09bac4f4d4d428865a279 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glm4v/configuration_glm4v.py @@ -0,0 +1,353 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/glm4v/modular_glm4v.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_glm4v.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2025 The ZhipuAI Inc. team and HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from ...configuration_utils import PretrainedConfig +from ...modeling_rope_utils import rope_config_validation + + +class Glm4vVisionConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Glm4vVisionModel`]. It is used to instantiate an Glm4vVisionModel + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield + a similar configuration to that of + GLM-4.1V-9B-Thinking [THUDM/GLM-4.1V-9B-Thinking](https://huggingface.co/THUDM/GLM-4.1V-9B-Thinking). + + Args: + hidden_size (`int`, *optional*, defaults to 1536): + Dimensionality of the encoder layers and the pooler layer. + depth (`int`, *optional*, defaults to 24): + Number of layers (depth) in the model. + attention_bias (`bool`, *optional*, defaults to `False`): + Whether to add a bias to the queries, keys and values. + intermediate_size (`int`, *optional*, defaults to 13696): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"selu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (`float`, *optional*, defaults to 0.0): + Dropout probability for attention weights. 
+ projection_dropout (`float`, *optional*, defaults to 0.0): + Dropout probability for the projection layer. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + image_size (`int` or `list[int]`, *optional*, defaults to `[336, 336]`): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to `14`): + The size (resolution) of each patch. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + out_hidden_size (`int`, *optional*, defaults to 4096): + The output hidden size of the vision model. + rms_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the rms normalization layers. + spatial_merge_size (`int`, *optional*, defaults to 2): + The size used for merging spatial dimensions. + temporal_patch_size (`int`, *optional*, defaults to 2): + The size used for patches along the temporal dimension. + Example: + + ```python + >>> from transformers import Glm4vVisionConfig, Glm4vVisionModel + + >>> # Initializing a Glm4vVisionConfig GLM-4.1V-9B style configuration + >>> configuration = Glm4vVisionConfig() + + >>> # Initializing a model (with random weights) from the GLM-4.1V-9B configuration + >>> model = Glm4vVisionModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "glm4v" + base_config_key = "vision_config" + + def __init__( + self, + depth=24, + hidden_size=1536, + hidden_act="silu", + attention_bias=False, + attention_dropout=0.0, + num_heads=12, + in_channels=3, + image_size=336, + patch_size=14, + rms_norm_eps=1e-05, + spatial_merge_size=2, + temporal_patch_size=2, + out_hidden_size=4096, + intermediate_size=13696, + initializer_range=0.02, + **kwargs, + ): + super().__init__(**kwargs) + + self.depth = depth + self.hidden_size = hidden_size + self.hidden_act = hidden_act + self.num_heads = num_heads + self.in_channels = in_channels + self.image_size = image_size + self.patch_size = patch_size + self.spatial_merge_size = spatial_merge_size + self.temporal_patch_size = temporal_patch_size + self.out_hidden_size = out_hidden_size + self.intermediate_size = intermediate_size + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + + +class Glm4vTextConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Glm4vModel`]. It is used to instantiate a + GLM-4.1V model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of + GLM-4.1V-9B-Thinking [THUDM/GLM-4.1V-9B-Thinking](https://huggingface.co/THUDM/GLM-4.1V-9B-Thinking). + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 151552): + Vocabulary size of the Glm4v model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`Glm4vModel`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 13696): + Dimension of the MLP representations. 
+ num_hidden_layers (`int`, *optional*, defaults to 40): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer encoder. + num_key_value_heads (`int`, *optional*, defaults to 2): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `32`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 32768): + The maximum sequence length that this model might ever be used with. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether the model's input and output word embeddings should be tied. + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + rope_scaling (`Dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type + and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value + accordingly. + Expected contents: + `rope_type` (`str`): + The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', + 'llama3'], with 'default' being the original RoPE implementation. + `factor` (`float`, *optional*): + Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In + most scaling types, a `factor` of x will enable the model to handle sequences of length x * + original maximum pre-trained length. + `original_max_position_embeddings` (`int`, *optional*): + Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during + pretraining. + `attention_factor` (`float`, *optional*): + Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention + computation. If unspecified, it defaults to value recommended by the implementation, using the + `factor` field to infer the suggested value. + image_token_id (`int`, *optional*): + Token index used as placeholder for image embeddings. + video_token_id (`int`, *optional*): + Token index used as placeholder for video embeddings. 
+ + ```python + >>> from transformers import Glm4vTextModel, Glm4vConfig + + >>> # Initializing a GLM-4.1V style configuration + >>> configuration = Glm4vConfig() + + >>> # Initializing a model from the GLM-4.1V style configuration + >>> model = Glm4vTextModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "glm4v_text" + base_config_key = "text_config" + keys_to_ignore_at_inference = ["past_key_values"] + # Default tensor parallel plan for base model `Glm4v` + base_model_tp_plan = { + "layers.*.self_attn.q_proj": "colwise", + "layers.*.self_attn.k_proj": "colwise", + "layers.*.self_attn.v_proj": "colwise", + "layers.*.self_attn.o_proj": "rowwise", + "layers.*.mlp.gate_up_proj": "colwise_rep", # we need to replicate here due to the `chunk` operation + "layers.*.mlp.down_proj": "rowwise_rep", # we need to replicate here due to the `chunk` operation + } + base_model_pp_plan = { + "embed_tokens": (["input_ids"], ["inputs_embeds"]), + "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), + "norm": (["hidden_states"], ["hidden_states"]), + } + + def __init__( + self, + vocab_size=151552, + hidden_size=4096, + intermediate_size=13696, + num_hidden_layers=40, + num_attention_heads=32, + num_key_value_heads=2, + hidden_act="silu", + max_position_embeddings=32768, + initializer_range=0.02, + rms_norm_eps=1e-05, + use_cache=True, + tie_word_embeddings=False, + rope_theta=10000.0, + attention_dropout=0.0, + rope_scaling=None, + image_token_id=None, + video_token_id=None, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.attention_dropout = attention_dropout + self.rope_scaling = rope_scaling + + # Validate the correctness of rotary position embeddings parameters + # BC: if there is a 'type' field, move it to 'rope_type'. + if self.rope_scaling is not None and "type" in self.rope_scaling: + self.rope_scaling["rope_type"] = self.rope_scaling["type"] + rope_config_validation(self, ignore_keys={"mrope_section"}) + self.image_token_id = image_token_id + self.video_token_id = video_token_id + + super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) + + +class Glm4vConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Glm4vModel`]. It is used to instantiate a + GLM-4.1V model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of + GLM-4.1V-9B-Thinking [THUDM/GLM-4.1V-9B-Thinking](https://huggingface.co/THUDM/GLM-4.1V-9B-Thinking). + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + text_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `Glm4vTextConfig`): + The config object or dictionary of the text backbone. 
+ vision_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `Glm4vVisionConfig`): + The config object or dictionary of the vision backbone. + image_token_id (`int`, *optional*, defaults to 151343): + The image token index to encode the image prompt. + video_token_id (`int`, *optional*, defaults to 151344): + The video token index to encode the image prompt. + image_start_token_id (`int`, *optional*, defaults to 151339): + The image start token index to encode the start of image. + image_end_token_id (`int`, *optional*, defaults to 151340): + The image end token index to encode the end of image. + video_start_token_id (`int`, *optional*, defaults to 151341): + The video start token index to encode the start of video. + video_end_token_id (`int`, *optional*, defaults to 151342): + The video end token index to encode the end of video. + + ```python + >>> from transformers import Glm4vForConditionalGeneration, Glm4vConfig + + >>> # Initializing a GLM-4.1V style configuration + >>> configuration = Glm4vConfig() + + >>> # Initializing a model from the GLM-4.1V style configuration + >>> model = Glm4vForConditionalGeneration(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "glm4v" + sub_configs = {"vision_config": Glm4vVisionConfig, "text_config": Glm4vTextConfig} + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + text_config=None, + vision_config=None, + image_token_id=151343, + video_token_id=151344, + image_start_token_id=151339, + image_end_token_id=151340, + video_start_token_id=151341, + video_end_token_id=151342, + **kwargs, + ): + if isinstance(vision_config, dict): + self.vision_config = self.sub_configs["vision_config"](**vision_config) + elif vision_config is None: + self.vision_config = self.sub_configs["vision_config"]() + + if isinstance(text_config, dict): + self.text_config = self.sub_configs["text_config"](**text_config) + elif text_config is None: + self.text_config = self.sub_configs["text_config"](**kwargs) + + self.image_token_id = image_token_id + self.video_token_id = video_token_id + self.video_start_token_id = video_start_token_id + self.video_end_token_id = video_end_token_id + self.image_start_token_id = image_start_token_id + self.image_end_token_id = image_end_token_id + + super().__init__(**kwargs) + + +__all__ = ["Glm4vConfig", "Glm4vTextConfig"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glm4v/image_processing_glm4v.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glm4v/image_processing_glm4v.py new file mode 100644 index 0000000000000000000000000000000000000000..8293545deee239d1b5d19355086cb58fb06b8b1a --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glm4v/image_processing_glm4v.py @@ -0,0 +1,472 @@ +# coding=utf-8 +# Copyright 2025 The ZhipuAI Inc. team and HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
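+#
+# Note on the `smart_resize` helper defined below: it rounds height and width to
+# multiples of `factor` (28 by default) and the frame count to a multiple of
+# `temporal_factor`, then shrinks or enlarges the spatial sides so that the total
+# frames * height * width stays within [min_pixels, max_pixels]. For example, with
+# the defaults a 2-frame 480x640 clip is snapped to 476x644 (round(480/28)*28 = 476,
+# round(640/28)*28 = 644), which already falls inside the pixel budget, so no
+# further scaling is applied.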
+"""Image processor class for GLM-4.1V.""" + +import math +from typing import Optional, Union + +import numpy as np + +from ...image_processing_utils import BaseImageProcessor, BatchFeature +from ...image_transforms import ( + convert_to_rgb, + resize, + to_channel_dimension_format, +) +from ...image_utils import ( + OPENAI_CLIP_MEAN, + OPENAI_CLIP_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + get_image_size, + infer_channel_dimension_format, + is_scaled_image, + make_flat_list_of_images, + to_numpy_array, + valid_images, + validate_preprocess_arguments, +) +from ...utils import TensorType, logging +from ...video_utils import VideoInput + + +logger = logging.get_logger(__name__) + + +def smart_resize( + num_frames: int, + height: int, + width: int, + temporal_factor: int = 2, + factor: int = 28, + min_pixels: int = 112 * 112, + max_pixels: int = 14 * 14 * 2 * 2 * 2 * 6144, +): + if num_frames < temporal_factor: + raise ValueError(f"t:{num_frames} must be larger than temporal_factor:{temporal_factor}") + if height < factor or width < factor: + raise ValueError(f"height:{height} or width:{width} must be larger than factor:{factor}") + elif max(height, width) / min(height, width) > 200: + raise ValueError( + f"absolute aspect ratio must be smaller than 200, got {max(height, width) / min(height, width)}" + ) + h_bar = round(height / factor) * factor + w_bar = round(width / factor) * factor + t_bar = round(num_frames / temporal_factor) * temporal_factor + + if t_bar * h_bar * w_bar > max_pixels: + beta = math.sqrt((num_frames * height * width) / max_pixels) + h_bar = max(factor, math.floor(height / beta / factor) * factor) + w_bar = max(factor, math.floor(width / beta / factor) * factor) + elif t_bar * h_bar * w_bar < min_pixels: + beta = math.sqrt(min_pixels / (num_frames * height * width)) + h_bar = math.ceil(height * beta / factor) * factor + w_bar = math.ceil(width * beta / factor) * factor + + return h_bar, w_bar + + +class Glm4vImageProcessor(BaseImageProcessor): + r""" + Constructs a GLM-4V image processor that dynamically resizes images based on the original images. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions. + size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 112 * 112, "longest_edge": 28 * 28 * 15000}`): + Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter + in the `preprocess` method. Available options are: + - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. + Do NOT keep the aspect ratio. + - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting + the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge + less or equal to `longest_edge`. + - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the + aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to + `max_width`. + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): + Resampling filter to use when resizing the image. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. 
+ do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`): + Mean to use if normalizing the image. This is a float or list of floats for each channel in the image. + image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`): + Standard deviation to use if normalizing the image. This is a float or list of floats for each channel in the image. + do_convert_rgb (`bool`, *optional*, defaults to `True`): + Whether to convert the image to RGB. + patch_size (`int`, *optional*, defaults to 14): + The spatial patch size of the vision encoder. + temporal_patch_size (`int`, *optional*, defaults to 2): + The temporal patch size of the vision encoder. + merge_size (`int`, *optional*, defaults to 2): + The merge size of the vision encoder to llm encoder. + """ + + model_input_names = ["pixel_values", "image_grid_thw"] + + def __init__( + self, + do_resize: bool = True, + size: Optional[dict[str, int]] = None, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + do_convert_rgb: bool = True, + patch_size: int = 14, + temporal_patch_size: int = 2, + merge_size: int = 2, + **kwargs, + ) -> None: + super().__init__(**kwargs) + if size is not None and ("shortest_edge" not in size or "longest_edge" not in size): + raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.") + elif size is None: + size = {"shortest_edge": 112 * 112, "longest_edge": 28 * 28 * 15000} + self.size = size + + self.do_resize = do_resize + self.resample = resample + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN + self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD + + self.patch_size = patch_size + self.temporal_patch_size = temporal_patch_size + self.merge_size = merge_size + self.do_convert_rgb = do_convert_rgb + + def _preprocess( + self, + images: Union[ImageInput, VideoInput], + do_resize: Optional[bool] = None, + size: Optional[dict[str, int]] = None, + resample: Optional[PILImageResampling] = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + patch_size: Optional[int] = None, + temporal_patch_size: Optional[int] = None, + merge_size: Optional[int] = None, + do_convert_rgb: Optional[bool] = None, + data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ): + """ + Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`. + + Args: + images (`ImageInput`): + Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255. If pixel values range from 0 to 1, set `do_rescale=False`. + vision_info (`List[Dict]`, *optional*): + Optional list of dictionaries containing additional information about vision inputs. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. 
+ size (`Dict[str, int]`, *optional*, defaults to `self.size`): + Size of the image after resizing. `shortest_edge` and `longest_edge` keys must be present. + resample (`PILImageResampling`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Scale factor to use if rescaling the image. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): + Mean to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image. + image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): + Standard deviation to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image. + patch_size (`int`, *optional*, defaults to `self.patch_size`): + The spatial patch size of the vision encoder. + temporal_patch_size (`int`, *optional*, defaults to `self.temporal_patch_size`): + The temporal patch size of the vision encoder. + merge_size (`int`, *optional*, defaults to `self.merge_size`): + The merge size of the vision encoder to llm encoder. + do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): + Whether to convert the image to RGB. + data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + """ + images = make_flat_list_of_images(images) + + if do_convert_rgb: + images = [convert_to_rgb(image) for image in images] + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if do_rescale and is_scaled_image(images[0]): + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + if input_data_format is None: + # We assume that all images have the same channel dimension format.
+ input_data_format = infer_channel_dimension_format(images[0]) + + height, width = get_image_size(images[0], channel_dim=input_data_format) + resized_height, resized_width = height, width + processed_images = [] + for image in images: + if do_resize: + resized_height, resized_width = smart_resize( + num_frames=temporal_patch_size, + height=height, + width=width, + temporal_factor=temporal_patch_size, + factor=patch_size * merge_size, + min_pixels=size["shortest_edge"], + max_pixels=size["longest_edge"], + ) + image = resize( + image, size=(resized_height, resized_width), resample=resample, input_data_format=input_data_format + ) + + if do_rescale: + image = self.rescale(image, scale=rescale_factor, input_data_format=input_data_format) + + if do_normalize: + image = self.normalize( + image=image, mean=image_mean, std=image_std, input_data_format=input_data_format + ) + + image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) + processed_images.append(image) + + patches = np.array(processed_images) + if data_format == ChannelDimension.LAST: + patches = patches.transpose(0, 3, 1, 2) + if patches.shape[0] % temporal_patch_size != 0: + repeats = np.repeat( + patches[-1][np.newaxis], temporal_patch_size - (patches.shape[0] % temporal_patch_size), axis=0 + ) + patches = np.concatenate([patches, repeats], axis=0) + channel = patches.shape[1] + grid_t = patches.shape[0] // temporal_patch_size + grid_h, grid_w = resized_height // patch_size, resized_width // patch_size + patches = patches.reshape( + grid_t, + temporal_patch_size, + channel, + grid_h // merge_size, + merge_size, + patch_size, + grid_w // merge_size, + merge_size, + patch_size, + ) + patches = patches.transpose(0, 3, 6, 4, 7, 2, 1, 5, 8) + flatten_patches = patches.reshape( + grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size + ) + + return flatten_patches, (grid_t, grid_h, grid_w) + + def preprocess( + self, + images: ImageInput, + videos: Optional[VideoInput] = None, + do_resize: Optional[bool] = None, + size: Optional[dict[str, int]] = None, + resample: Optional[PILImageResampling] = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + patch_size: Optional[int] = None, + temporal_patch_size: Optional[int] = None, + merge_size: Optional[int] = None, + do_convert_rgb: Optional[bool] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ): + """ + Args: + images (`ImageInput`): + Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_rescale=False`. + videos (`VideoInput`): + Video to preprocess. Expects a single or batch of videos with pixel values ranging from 0 to 255. If + passing in videos with pixel values between 0 and 1, set `do_rescale=False`. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to `self.size`): + Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with + the longest edge resized to keep the input aspect ratio. 
+ resample (`int`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only + has an effect if `do_resize` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): + Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`. + image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to + `True`. + patch_size (`int`, *optional*, defaults to `self.patch_size`): + The spatial patch size of the vision encoder. + temporal_patch_size (`int`, *optional*, defaults to `self.temporal_patch_size`): + The temporal patch size of the vision encoder. + merge_size (`int`, *optional*, defaults to `self.merge_size`): + The merge size of the vision encoder to llm encoder. + do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): + Whether to convert the image to RGB. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
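+
+ Example (illustrative; the shapes below assume the default `patch_size=14`, `temporal_patch_size=2` and `merge_size=2`):
+
+ ```python
+ >>> import numpy as np
+ >>> from transformers import Glm4vImageProcessor
+
+ >>> processor = Glm4vImageProcessor()
+ >>> image = np.random.randint(0, 256, (224, 336, 3), dtype=np.uint8)  # H x W x C
+ >>> outputs = processor(images=image, return_tensors="np")
+ >>> outputs["pixel_values"].shape  # (grid_t * grid_h * grid_w, channels * temporal_patch_size * patch_size**2)
+ (384, 1176)
+ >>> outputs["image_grid_thw"]  # 224x336 is already a multiple of 28, giving a 16x24 grid of 14-pixel patches
+ array([[ 1, 16, 24]])
+ ```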
+ + """ + # Try to use config values if set, otherwise fallback to global defaults + size = size if size is not None else self.size + if size is not None and ("shortest_edge" not in size or "longest_edge" not in size): + raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.") + elif size is None: + size = {"shortest_edge": 112 * 112, "longest_edge": 28 * 28 * 15000} + + do_resize = do_resize if do_resize is not None else self.do_resize + resample = resample if resample is not None else self.resample + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + patch_size = patch_size if patch_size is not None else self.patch_size + temporal_patch_size = temporal_patch_size if temporal_patch_size is not None else self.temporal_patch_size + merge_size = merge_size if merge_size is not None else self.merge_size + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + + if images is not None: + images = self.fetch_images(images) + images = make_flat_list_of_images(images) + + if images is not None and not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + + validate_preprocess_arguments( + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_resize=do_resize, + size=size, + resample=resample, + ) + + data = {} + if images is not None: + pixel_values, vision_grid_thws = [], [] + for image in images: + patches, image_grid_thw = self._preprocess( + image, + do_resize=do_resize, + size=size, + resample=resample, + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + patch_size=patch_size, + temporal_patch_size=temporal_patch_size, + merge_size=merge_size, + data_format=data_format, + do_convert_rgb=do_convert_rgb, + input_data_format=input_data_format, + ) + pixel_values.extend(patches) + vision_grid_thws.append(image_grid_thw) + pixel_values = np.array(pixel_values) + vision_grid_thws = np.array(vision_grid_thws) + data.update({"pixel_values": pixel_values, "image_grid_thw": vision_grid_thws}) + + return BatchFeature(data=data, tensor_type=return_tensors) + + def get_number_of_image_patches(self, height: int, width: int, images_kwargs=None): + """ + A utility that returns number of image patches for a given image size. + + Args: + height (`int`): + Height of the input image. + width (`int`): + Width of the input image. + images_kwargs (`dict`, *optional*) + Any kwargs to override defaults of the image processor. + Returns: + `int`: Number of image patches per image. 
+ """ + patch_size = images_kwargs.get("patch_size", self.patch_size) + merge_size = images_kwargs.get("merge_size", self.merge_size) + size = images_kwargs.get("size", {"shortest_edge": 112 * 112, "longest_edge": 28 * 28 * 15000}) + + factor = patch_size * merge_size + resized_height, resized_width = smart_resize( + num_frames=self.temporal_patch_size, + height=height, + width=width, + factor=factor, + min_pixels=size["shortest_edge"], + max_pixels=size["longest_edge"], + temporal_factor=self.temporal_patch_size, + ) + grid_h, grid_w = resized_height // patch_size, resized_width // patch_size + return grid_h * grid_w + + +__all__ = ["Glm4vImageProcessor"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glm4v/image_processing_glm4v_fast.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glm4v/image_processing_glm4v_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..8cdf31a437ae305fbfc399bd276476f9b453ed6f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glm4v/image_processing_glm4v_fast.py @@ -0,0 +1,213 @@ +# coding=utf-8 +# Copyright 2025 The ZhipuAI Inc. team and HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Fast Image processor class for GLM-4.1V.""" + +from typing import Optional, Union + +import torch +from torchvision.transforms.v2 import functional as F + +from ...image_processing_utils import ( + BatchFeature, +) +from ...image_processing_utils_fast import ( + BaseImageProcessorFast, + DefaultFastImageProcessorKwargs, + group_images_by_shape, + reorder_images, +) +from ...image_utils import ( + OPENAI_CLIP_MEAN, + OPENAI_CLIP_STD, + ImageInput, + PILImageResampling, + SizeDict, +) +from ...processing_utils import Unpack +from ...utils import ( + TensorType, + auto_docstring, + logging, +) +from .image_processing_glm4v import smart_resize + + +logger = logging.get_logger(__name__) + + +class Glm4vFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): + """ + patch_size (`int`, *optional*, defaults to 14): + The spatial patch size of the vision encoder. + temporal_patch_size (`int`, *optional*, defaults to 2): + The temporal patch size of the vision encoder. + merge_size (`int`, *optional*, defaults to 2): + The merge size of the vision encoder to llm encoder. 
+ """ + + patch_size: Optional[int] + temporal_patch_size: Optional[int] + merge_size: Optional[int] + + +@auto_docstring +class Glm4vImageProcessorFast(BaseImageProcessorFast): + do_resize = True + resample = PILImageResampling.BICUBIC + size = {"shortest_edge": 112 * 112, "longest_edge": 28 * 28 * 15000} + do_rescale = True + do_normalize = True + image_mean = OPENAI_CLIP_MEAN + image_std = OPENAI_CLIP_STD + do_convert_rgb = True + patch_size = 14 + temporal_patch_size = 2 + merge_size = 2 + valid_kwargs = Glm4vFastImageProcessorKwargs + model_input_names = ["pixel_values", "image_grid_thw"] + + def __init__(self, **kwargs: Unpack[Glm4vFastImageProcessorKwargs]): + super().__init__(**kwargs) + if self.size is not None and ( + self.size.get("shortest_edge", None) is None or self.size.get("longest_edge", None) is None + ): + raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.") + + def _further_process_kwargs( + self, + size: Optional[SizeDict] = None, + **kwargs, + ) -> dict: + """ + Update kwargs that need further processing before being validated + Can be overridden by subclasses to customize the processing of kwargs. + """ + if size is not None and ("shortest_edge" not in size or "longest_edge" not in size): + raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.") + + return super()._further_process_kwargs(size=size, **kwargs) + + def _preprocess( + self, + images: list["torch.Tensor"], + do_resize: bool, + size: SizeDict, + interpolation: Optional["F.InterpolationMode"], + do_rescale: bool, + rescale_factor: float, + do_normalize: bool, + image_mean: Optional[Union[float, list[float]]], + image_std: Optional[Union[float, list[float]]], + patch_size: int, + temporal_patch_size: int, + merge_size: int, + disable_grouping: Optional[bool], + return_tensors: Optional[Union[str, TensorType]], + **kwargs, + ) -> BatchFeature: + """ + Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`. 
+ """ + + grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping) + resized_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + height, width = stacked_images.shape[-2:] + if do_resize: + resized_height, resized_width = smart_resize( + num_frames=temporal_patch_size, + height=height, + width=width, + temporal_factor=temporal_patch_size, + factor=patch_size * merge_size, + min_pixels=size.shortest_edge, + max_pixels=size.longest_edge, + ) + stacked_images = self.resize( + stacked_images, + size=SizeDict(height=resized_height, width=resized_width), + interpolation=interpolation, + ) + resized_images_grouped[shape] = stacked_images + + resized_images = reorder_images(resized_images_grouped, grouped_images_index) + + grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping) + processed_images_grouped = {} + processed_grids = {} + + for shape, stacked_images in grouped_images.items(): + resized_height, resized_width = stacked_images.shape[-2:] + + patches = self.rescale_and_normalize( + stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std + ) + if patches.ndim == 4: # (B, C, H, W) + patches = patches.unsqueeze(1) # (B, T=1, C, H, W) + + if patches.shape[1] % temporal_patch_size != 0: + repeats = patches[:, -1:].repeat( + 1, temporal_patch_size - (patches.shape[1] % temporal_patch_size), 1, 1, 1 + ) + patches = torch.cat([patches, repeats], dim=1) + + batch_size, t_len, channel = patches.shape[:3] + grid_t = t_len // temporal_patch_size + grid_h, grid_w = resized_height // patch_size, resized_width // patch_size + + patches = patches.view( + batch_size, + grid_t, + temporal_patch_size, + channel, + grid_h // merge_size, + merge_size, + patch_size, + grid_w // merge_size, + merge_size, + patch_size, + ) + # (B, grid_t, gh, gw, mh, mw, C, tp, ph, pw) + patches = patches.permute(0, 1, 4, 7, 5, 8, 3, 2, 6, 9) + + flatten_patches = patches.reshape( + batch_size, + grid_t * grid_h * grid_w, + channel * temporal_patch_size * patch_size * patch_size, + ) + + processed_images_grouped[shape] = flatten_patches + processed_grids[shape] = [[grid_t, grid_h, grid_w]] * batch_size + + processed_images = reorder_images(processed_images_grouped, grouped_images_index) + processed_grids = reorder_images(processed_grids, grouped_images_index) + + pixel_values = torch.cat(processed_images, dim=0) + image_grid_thw = torch.tensor(processed_grids) + + return BatchFeature( + data={"pixel_values": pixel_values, "image_grid_thw": image_grid_thw}, tensor_type=return_tensors + ) + + @auto_docstring + def preprocess( + self, + images: ImageInput, + **kwargs: Unpack[Glm4vFastImageProcessorKwargs], + ) -> BatchFeature: + return super().preprocess(images, **kwargs) + + +__all__ = ["Glm4vImageProcessorFast"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glm4v/modeling_glm4v.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glm4v/modeling_glm4v.py new file mode 100644 index 0000000000000000000000000000000000000000..fbfeae9130e1e4553ffb572e8fabc0360ff74dad --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glm4v/modeling_glm4v.py @@ -0,0 +1,1636 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/glm4v/modular_glm4v.py. 
+# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_glm4v.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2025 The ZhipuAI Inc. team and HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import itertools +from dataclasses import dataclass +from typing import Any, Callable, Optional, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn import LayerNorm + +from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache +from ...generation import GenerationMixin +from ...integrations import use_kernel_forward_from_hub +from ...masking_utils import create_causal_mask +from ...modeling_flash_attention_utils import FlashAttentionKwargs +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import BaseModelOutputWithPast, ModelOutput +from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...processing_utils import Unpack +from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torchdynamo_compiling +from ...utils.generic import check_model_inputs +from .configuration_glm4v import Glm4vConfig, Glm4vTextConfig, Glm4vVisionConfig + + +@use_kernel_forward_from_hub("RMSNorm") +class Glm4vRMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + Glm4vRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + def extra_repr(self): + return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" + + +class Glm4VisionMlp(nn.Module): + def __init__(self, config, bias: bool = False): + super().__init__() + self.hidden_size = config.hidden_size + self.intermediate_size = config.out_hidden_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=bias) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=bias) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=bias) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, hidden_state): + return self.down_proj(self.act_fn(self.gate_proj(hidden_state)) * self.up_proj(hidden_state)) + + +class Glm4vVisionPatchEmbed(nn.Module): + def __init__(self, config: Glm4vVisionConfig) -> None: + super().__init__() + self.patch_size = config.patch_size + self.temporal_patch_size = config.temporal_patch_size + 
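+ # Each row produced by the image processor holds in_channels * temporal_patch_size * patch_size**2
+ # pixel values; the Conv3d below uses kernel_size == stride, so it projects every non-overlapping
+ # (temporal, height, width) patch block to a single embed_dim vector.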
self.in_channels = config.in_channels + self.embed_dim = config.hidden_size + + kernel_size = [self.temporal_patch_size, self.patch_size, self.patch_size] + self.proj = nn.Conv3d(self.in_channels, self.embed_dim, kernel_size=kernel_size, stride=kernel_size) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + target_dtype = self.proj.weight.dtype + hidden_states = hidden_states.view( + -1, self.in_channels, self.temporal_patch_size, self.patch_size, self.patch_size + ) + hidden_states = self.proj(hidden_states.to(dtype=target_dtype)).view(-1, self.embed_dim) + return hidden_states + + +class Glm4vVisionRotaryEmbedding(nn.Module): + inv_freq: torch.Tensor # fix linting for `register_buffer` + + def __init__(self, dim: int, theta: float = 10000.0) -> None: + super().__init__() + inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + def forward(self, seqlen: int) -> torch.Tensor: + seq = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype) + freqs = torch.outer(seq, self.inv_freq) + return freqs + + +class Glm4vVisionPatchMerger(nn.Module): + def __init__(self, dim: int, context_dim: int, hidden_act: str, bias: bool = False) -> None: + super().__init__() + self.proj = nn.Linear(dim, dim, bias=bias) + self.post_projection_norm = LayerNorm(dim) + self.gate_proj = nn.Linear(dim, context_dim, bias=bias) + self.up_proj = nn.Linear(dim, context_dim, bias=bias) + self.down_proj = nn.Linear(context_dim, dim, bias=bias) + self.act1 = nn.GELU() + self.act_fn = ACT2FN[hidden_act] + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + hidden_state = self.proj(hidden_state) + hidden_state = self.act1(self.post_projection_norm(hidden_state)) + return self.down_proj(self.act_fn(self.gate_proj(hidden_state)) * self.up_proj(hidden_state)) + + +class Glm4vVisionEmbeddings(nn.Module): + def __init__(self, config: Glm4vVisionConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.num_patches = (self.image_size // self.patch_size) ** 2 + self.num_positions = self.num_patches + self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) + self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False) + + def forward(self, embeddings, lengths, image_shapes, h_coords, w_coords) -> torch.Tensor: + """ + Forward pass with integrated position encoding adaptation using 2D interpolation. + + Args: + embeddings: Input embeddings tensor + lengths (torch.Tensor): Sequence lengths for each image in the batch. + image_shapes (torch.Tensor): Tensor of shape [batch_size, 3] representing the image shapes (t, h, w). + h_coords (torch.Tensor): Tensor of shape [total_seq] representing the h coordinate for each patch. + w_coords (torch.Tensor): Tensor of shape [total_seq] representing the w coordinate for each patch. + + Returns: + torch.Tensor: Embeddings with adapted position encoding added. 
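+
+ Note:
+ The learned position table covers a square grid of `image_size // patch_size` patches per side. It is
+ reshaped into a 2-D map and resampled with bicubic `grid_sample` at each patch's normalized coordinate,
+ `((coord + 0.5) / target) * 2 - 1`, so images whose patch grids differ from that square grid still
+ receive smooth position information.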
+ """ + # Get position embedding parameters + pos_embed_weight = self.position_embedding.weight + hidden_size = pos_embed_weight.shape[1] + total_seq = h_coords.shape[0] + device = pos_embed_weight.device + + # Move coordinates to correct device + h_coords, w_coords = h_coords.to(device), w_coords.to(device) + + # Handle empty sequence case + if total_seq == 0: + adapted_pos_embed = torch.empty(0, hidden_size, device=device, dtype=pos_embed_weight.dtype) + else: + # Convert inputs to tensors if needed + if isinstance(lengths, list): + lengths = torch.tensor(lengths, device=device, dtype=torch.long) + if not isinstance(image_shapes, torch.Tensor): + image_shapes = torch.tensor(image_shapes, device=device, dtype=torch.long) + + # Prepare 2D position embedding + orig_size_sq = pos_embed_weight.shape[0] + orig_size = int(orig_size_sq**0.5) + pos_embed_2d = ( + pos_embed_weight.view(orig_size, orig_size, hidden_size) + .permute(2, 0, 1) + .unsqueeze(0) + .to(device=device, dtype=torch.float32) + ) + + # Calculate target dimensions for each patch + target_h = torch.cat([image_shapes[i, 1].repeat(lengths[i]) for i in range(len(lengths))]).to( + device=device, dtype=torch.float32 + ) + target_w = torch.cat([image_shapes[i, 2].repeat(lengths[i]) for i in range(len(lengths))]).to( + device=device, dtype=torch.float32 + ) + + # Normalize coordinates to [-1, 1] range for grid_sample + h_coords = h_coords.to(device=device, dtype=torch.float32) + w_coords = w_coords.to(device=device, dtype=torch.float32) + norm_w = ((w_coords + 0.5) / target_w) * 2 - 1 + norm_h = ((h_coords + 0.5) / target_h) * 2 - 1 + + # Create sampling grid + grid = torch.stack((norm_w, norm_h), dim=-1).unsqueeze(0).unsqueeze(2) + + # Perform bicubic interpolation + interpolated_embed_fp32 = F.grid_sample( + pos_embed_2d, grid, mode="bicubic", align_corners=False, padding_mode="border" + ) + + # Reshape and convert back to original dtype + adapted_pos_embed_fp32 = interpolated_embed_fp32.squeeze(0).squeeze(-1).permute(1, 0) + adapted_pos_embed = adapted_pos_embed_fp32.to(pos_embed_weight.dtype).to(embeddings.device) + + # Add adapted position encoding to embeddings + embeddings = embeddings + adapted_pos_embed + return embeddings + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb_vision( + q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor +) -> tuple[torch.Tensor, torch.Tensor]: + orig_q_dtype = q.dtype + orig_k_dtype = k.dtype + q, k = q.float(), k.float() + cos, sin = cos.unsqueeze(-2).float(), sin.unsqueeze(-2).float() + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + q_embed = q_embed.to(orig_q_dtype) + k_embed = k_embed.to(orig_k_dtype) + return q_embed, k_embed + + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). 
The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + dropout: float = 0.0, + **kwargs: Unpack[TransformersKwargs], +): + key_states = repeat_kv(key, module.num_key_value_groups) + value_states = repeat_kv(value, module.num_key_value_groups) + + attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling + if attention_mask is not None: + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + attn_output = torch.matmul(attn_weights, value_states) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +class Glm4vVisionAttention(nn.Module): + def __init__(self, config: Glm4vVisionConfig) -> None: + super().__init__() + self.dim = config.hidden_size + self.num_heads = config.num_heads + self.head_dim = self.dim // self.num_heads + self.num_key_value_groups = 1 # needed for eager attention + self.qkv = nn.Linear(config.hidden_size, config.hidden_size * 3, bias=config.attention_bias) + self.proj = nn.Linear(config.hidden_size, config.hidden_size, bias=False) + self.scaling = self.head_dim**-0.5 + self.config = config + self.attention_dropout = config.attention_dropout + self.is_causal = False + + def forward( + self, + hidden_states: torch.Tensor, + cu_seqlens: torch.Tensor, + rotary_pos_emb: Optional[torch.Tensor] = None, + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, + **kwargs, + ) -> torch.Tensor: + seq_length = hidden_states.shape[0] + query_states, key_states, value_states = ( + self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0) + ) + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb_vision(query_states, key_states, cos, sin) + + query_states = query_states.transpose(0, 1).unsqueeze(0) + key_states = key_states.transpose(0, 1).unsqueeze(0) + value_states = value_states.transpose(0, 1).unsqueeze(0) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + if self.config._attn_implementation == "flash_attention_2": + # Flash Attention 2: Use cu_seqlens for variable length attention + max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max() + attn_output, _ = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask=None, + scaling=self.scaling, + dropout=0.0 if not self.training else self.attention_dropout, + cu_seq_lens_q=cu_seqlens, + cu_seq_lens_k=cu_seqlens, + max_length_q=max_seqlen, + max_length_k=max_seqlen, + is_causal=False, + **kwargs, + ) + else: + # Other implementations: Process each chunk separately + lengths = cu_seqlens[1:] - cu_seqlens[:-1] + splits = [ + 
torch.split(tensor, lengths.tolist(), dim=2) for tensor in (query_states, key_states, value_states) + ] + + attn_outputs = [ + attention_interface( + self, + q, + k, + v, + attention_mask=None, + scaling=self.scaling, + dropout=0.0 if not self.training else self.attention_dropout, + is_causal=False, + **kwargs, + )[0] + for q, k, v in zip(*splits) + ] + attn_output = torch.cat(attn_outputs, dim=1) + + attn_output = attn_output.reshape(seq_length, -1).contiguous() + attn_output = self.proj(attn_output) + return attn_output + + +class Glm4vVisionBlock(GradientCheckpointingLayer): + def __init__(self, config) -> None: + super().__init__() + self.norm1 = Glm4vRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.norm2 = Glm4vRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.attn = Glm4vVisionAttention(config) + self.mlp = Glm4VisionMlp(config, bias=False) + + def forward( + self, + hidden_states: torch.Tensor, + cu_seqlens: torch.Tensor, + rotary_pos_emb: Optional[torch.Tensor] = None, + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, + **kwargs, + ) -> torch.Tensor: + hidden_states = hidden_states + self.attn( + self.norm1(hidden_states), + cu_seqlens=cu_seqlens, + rotary_pos_emb=rotary_pos_emb, + position_embeddings=position_embeddings, + **kwargs, + ) + hidden_states = hidden_states + self.mlp(self.norm2(hidden_states)) + return hidden_states + + +class Glm4vTextRotaryEmbedding(nn.Module): + inv_freq: torch.Tensor # fix linting for `register_buffer` + + def __init__(self, config: Glm4vTextConfig, device=None): + super().__init__() + # BC: "rope_type" was originally "type" + if hasattr(config, "rope_scaling") and config.rope_scaling is not None: + self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) + else: + self.rope_type = "default" + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + + inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = self.inv_freq + + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) + def forward(self, x, position_ids): + # In contrast to other models, Glm4vText has different position ids for the grids + # So we expand the inv_freq to shape (3, ...) + inv_freq_expanded = self.inv_freq[None, None, :, None].float().expand(3, position_ids.shape[1], -1, 1) + position_ids_expanded = position_ids[:, :, None, :].float() # shape (3, bs, 1, positions) + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(2, 3) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() * self.attention_scaling + sin = emb.sin() * self.attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +def rotate_half_llm(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., 0::2] + x2 = x[..., 1::2] + return torch.stack((-x2, x1), dim=-1).flatten(-2) + + +def apply_multimodal_rotary_pos_emb(q, k, cos, sin, mrope_section, unsqueeze_dim=1): + """Applies Rotary Position Embedding with Multimodal Sections to the query and key tensors (https://qwenlm.github.io/blog/qwen2-vl/). 
+ + Explanation: + Multimodal 3D rotary position embedding is an extension to 1D rotary position embedding. The input embedding + sequence contains vision (images / videos) embedding and text embedding or just contains text embedding. For + vision embedding part, we apply rotary position embedding on temporal, height and width dimension separately. + Here we split the channel dimension to 3 chunks for the temporal, height and width rotary position embedding. + For text embedding part, we just apply 1D rotary position embedding. The three rotary position index (temporal, + height and width) of text embedding is always the same, so the text embedding rotary position embedding has no + difference with modern LLMs. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + mrope_section(`List(int)`): + Multimodal rope section is for channel dimension of temporal, height and width in rope calculation. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + mrope_section = mrope_section * 2 + cos = torch.cat([m[i % 3] for i, m in enumerate(cos.split(mrope_section, dim=-1))], dim=-1).unsqueeze( + unsqueeze_dim + ) + sin = torch.cat([m[i % 3] for i, m in enumerate(sin.split(mrope_section, dim=-1))], dim=-1).unsqueeze( + unsqueeze_dim + ) + + # Interleave them instead of usual shape + cos = cos[..., : cos.shape[-1] // 2].repeat_interleave(2, dim=-1) + sin = sin[..., : sin.shape[-1] // 2].repeat_interleave(2, dim=-1) + + # Keep half or full tensor for later concatenation + rotary_dim = cos.shape[-1] + q_rot, q_pass = q[..., :rotary_dim], q[..., rotary_dim:] + k_rot, k_pass = k[..., :rotary_dim], k[..., rotary_dim:] + + # Apply rotary embeddings on the first half or full tensor + q_embed = (q_rot * cos) + (rotate_half_llm(q_rot) * sin) + k_embed = (k_rot * cos) + (rotate_half_llm(k_rot) * sin) + + # Concatenate back to full shape + q_embed = torch.cat([q_embed, q_pass], dim=-1) + k_embed = torch.cat([k_embed, k_pass], dim=-1) + + return q_embed, k_embed + + +class Glm4vTextAttention(nn.Module): + """ + Multi-headed attention from 'Attention Is All You Need' paper. + and "Generating Long Sequences with Sparse Transformers". 
+ """ + + def __init__(self, config: Glm4vTextConfig, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.is_causal = True + self.attention_dropout = config.attention_dropout + self.rope_scaling = config.rope_scaling + self.scaling = self.head_dim**-0.5 + + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True) + self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True) + self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True) + self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) + + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[FlashAttentionKwargs], + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_multimodal_rotary_pos_emb( # diff with Llama + query_states, key_states, cos, sin, self.rope_scaling["mrope_section"] + ) + + if past_key_values is not None: + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models + key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=0.0 if not self.training else self.attention_dropout, + scaling=self.scaling, + **kwargs, + ) + + attn_output = attn_output.reshape(bsz, q_len, -1).contiguous() + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights + + +class Glm4vTextMLP(nn.Module): + def __init__(self, config): + super().__init__() + + self.config = config + self.gate_up_proj = nn.Linear(config.hidden_size, 2 * config.intermediate_size, bias=False) + self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False) + self.activation_fn = ACT2FN[config.hidden_act] + + def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor: + up_states = self.gate_up_proj(hidden_states) + + gate, up_states = up_states.chunk(2, dim=-1) + up_states = up_states * self.activation_fn(gate) + + return self.down_proj(up_states) + + +class Glm4vTextDecoderLayer(GradientCheckpointingLayer): + def __init__(self, config: Glm4vTextConfig, layer_idx: int): + 
super().__init__() + self.hidden_size = config.hidden_size + self.self_attn = Glm4vTextAttention(config, layer_idx) + self.mlp = Glm4vTextMLP(config) + self.input_layernorm = Glm4vRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = Glm4vRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_self_attn_layernorm = Glm4vRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_mlp_layernorm = Glm4vRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, + ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, _ = self.self_attn( + hidden_states=hidden_states, + position_embeddings=position_embeddings, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = self.post_self_attn_layernorm(hidden_states) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = self.post_mlp_layernorm(hidden_states) + hidden_states = residual + hidden_states + + return hidden_states + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for Llava outputs, with hidden states and attentions. + """ +) +class Glm4vModelOutputWithPast(ModelOutput): + r""" + past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache). + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*): + The rope index difference between sequence length and multimodal rope. 
+ """ + + last_hidden_state: Optional[torch.FloatTensor] = None + past_key_values: Optional[Cache] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + rope_deltas: Optional[torch.LongTensor] = None + + +@auto_docstring +class Glm4vPreTrainedModel(PreTrainedModel): + config: Glm4vConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["Glm4vTextDecoderLayer", "Glm4vVisionBlock"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn = True + _supports_sdpa = True + + _can_compile_fullgraph = True + _supports_attention_backend = True + _can_record_outputs = { + "hidden_states": Glm4vTextDecoderLayer, + "attentions": Glm4vTextAttention, + } + + +class Glm4vVisionModel(Glm4vPreTrainedModel): + config: Glm4vVisionConfig + _no_split_modules = ["Glm4vVisionBlock"] + + def __init__(self, config) -> None: + super().__init__(config) + self.spatial_merge_size = config.spatial_merge_size + self.patch_size = config.patch_size + + self.embeddings = Glm4vVisionEmbeddings(config) + self.patch_embed = Glm4vVisionPatchEmbed(config) + + head_dim = config.hidden_size // config.num_heads + self.rotary_pos_emb = Glm4vVisionRotaryEmbedding(head_dim // 2) + + self.blocks = nn.ModuleList([Glm4vVisionBlock(config) for _ in range(config.depth)]) + self.merger = Glm4vVisionPatchMerger( + dim=config.out_hidden_size, context_dim=config.intermediate_size, hidden_act=config.hidden_act + ) + + self.post_conv_layernorm = Glm4vRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.downsample = nn.Conv2d( + in_channels=config.hidden_size, + out_channels=config.out_hidden_size, + kernel_size=config.spatial_merge_size, + stride=config.spatial_merge_size, + ) + self.post_layernorm = Glm4vRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.gradient_checkpointing = False + self.post_init() + + def rot_pos_emb(self, grid_thw): + pos_ids = [] + for t, h, w in grid_thw: + hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w) + hpos_ids = hpos_ids.reshape( + h // self.spatial_merge_size, + self.spatial_merge_size, + w // self.spatial_merge_size, + self.spatial_merge_size, + ) + hpos_ids = hpos_ids.permute(0, 2, 1, 3) + hpos_ids = hpos_ids.flatten() + + wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1) + wpos_ids = wpos_ids.reshape( + h // self.spatial_merge_size, + self.spatial_merge_size, + w // self.spatial_merge_size, + self.spatial_merge_size, + ) + wpos_ids = wpos_ids.permute(0, 2, 1, 3) + wpos_ids = wpos_ids.flatten() + pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)) + pos_ids = torch.cat(pos_ids, dim=0) + max_grid_size = grid_thw[:, 1:].max() + rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size) + rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1) + return rotary_pos_emb, pos_ids + + def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor) -> torch.Tensor: + """ + Args: + hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`): + The final hidden states of the model. + grid_thw (`torch.Tensor` of shape `(num_images_or_videos, 3)`): + The temporal, height and width of feature shape of each image in LLM. + + Returns: + `torch.Tensor`: hidden_states. 
+ """ + hidden_states = self.patch_embed(hidden_states) + hidden_states = self.post_conv_layernorm(hidden_states) + + rotary_pos_emb, image_type_ids = self.rot_pos_emb(grid_thw) + emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1) + position_embeddings = (emb.cos(), emb.sin()) + + cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum( + dim=0, + # Select dtype based on the following factors: + # - FA2 requires that cu_seqlens_q must have dtype int32 + # - torch.onnx.export requires that cu_seqlens_q must have same dtype as grid_thw + # See https://github.com/huggingface/transformers/pull/34852 for more information + dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32, + ) + cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0) + seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() + hidden_states = self.embeddings(hidden_states, seqlens, grid_thw, image_type_ids[:, 0], image_type_ids[:, 1]) + + for blk in self.blocks: + hidden_states = blk( + hidden_states, + cu_seqlens=cu_seqlens, + position_embeddings=position_embeddings, + ) + + hidden_states = self.post_layernorm(hidden_states) + + hidden_states = hidden_states.view( + -1, self.spatial_merge_size, self.spatial_merge_size, hidden_states.shape[-1] + ) + hidden_states = hidden_states.permute(0, 3, 1, 2) + hidden_states = self.downsample(hidden_states).view(-1, self.config.out_hidden_size) + + hidden_states = self.merger(hidden_states) + return hidden_states + + +@auto_docstring +class Glm4vTextModel(Glm4vPreTrainedModel): + config: Glm4vTextConfig + + def __init__(self, config: Glm4vTextConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList( + [Glm4vTextDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.norm = Glm4vRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.rotary_emb = Glm4vTextRotaryEmbedding(config=config) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + @check_model_inputs + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[FlashAttentionKwargs], + ) -> Union[tuple, BaseModelOutputWithPast]: + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + # torch.jit.trace() doesn't support cache objects in the output + if use_cache and past_key_values is None and not torch.jit.is_tracing(): + past_key_values = DynamicCache(config=self.config) + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if cache_position is None: + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + cache_position = torch.arange( + past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device + ) + + # the hard coded `3` is for temporal, height and width. 
+ if position_ids is None: + position_ids = cache_position.view(1, 1, -1).expand(3, inputs_embeds.shape[0], -1) + elif position_ids.dim() == 2: + position_ids = position_ids[None, ...].expand(3, position_ids.shape[0], -1) + + causal_mask = create_causal_mask( + config=self.config, + input_embeds=inputs_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + past_key_values=past_key_values, + position_ids=position_ids, + ) + + hidden_states = inputs_embeds + + # create position embeddings to be shared across the decoder layers + position_embeddings = self.rotary_emb(hidden_states, position_ids) + + for decoder_layer in self.layers: + layer_outputs = decoder_layer( + hidden_states, + position_embeddings=position_embeddings, + attention_mask=causal_mask, + position_ids=position_ids, + past_key_values=past_key_values, + cache_position=cache_position, + **kwargs, + ) + hidden_states = layer_outputs + + hidden_states = self.norm(hidden_states) + + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=past_key_values, + ) + + +@auto_docstring +class Glm4vModel(Glm4vPreTrainedModel): + base_model_prefix = "" + _checkpoint_conversion_mapping = {} + # Reference: fix gemma3 grad acc #37208 + accepts_loss_kwargs = False + config: Glm4vConfig + _no_split_modules = ["Glm4vTextDecoderLayer", "Glm4vVisionBlock"] + + def __init__(self, config): + super().__init__(config) + self.visual = Glm4vVisionModel._from_config(config.vision_config) + self.language_model = Glm4vTextModel._from_config(config.text_config) + self.rope_deltas = None # cache rope_deltas here + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.language_model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.language_model.set_input_embeddings(value) + + def set_decoder(self, decoder): + self.language_model = decoder + + def get_decoder(self): + return self.language_model + + def get_rope_index( + self, + input_ids: Optional[torch.LongTensor] = None, + image_grid_thw: Optional[torch.LongTensor] = None, + video_grid_thw: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + """ + Calculate the 3D rope index based on image and video's temporal, height and width in LLM. + + Explanation: + Each embedding sequence contains vision embedding and text embedding or just contains text embedding. + + For pure text embedding sequence, the rotary position embedding has no difference with modern LLMs. + Examples: + input_ids: [T T T T T], here T is for text. + temporal position_ids: [0, 1, 2, 3, 4] + height position_ids: [0, 1, 2, 3, 4] + width position_ids: [0, 1, 2, 3, 4] + + For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part + and 1D rotary position embedding for text part. + Examples: + Temporal (Time): 3 patches, representing different segments of the video in time. + Height: 2 patches, dividing each frame vertically. + Width: 2 patches, dividing each frame horizontally. + We also have some important parameters: + fps (Frames Per Second): The video's frame rate, set to 1. This means one frame is processed each second. + tokens_per_second: This is a crucial parameter. It dictates how many "time-steps" or "temporal tokens" are conceptually packed into a one-second interval of the video. In this case, we have 25 tokens per second. 
So each second of the video will be represented with 25 separate time points. It essentially defines the temporal granularity. + temporal_patch_size: The number of frames that compose one temporal patch. Here, it's 2 frames. + interval: The step size for the temporal position IDs, calculated as tokens_per_second * temporal_patch_size / fps. In this case, 25 * 2 / 1 = 50. This means that each temporal patch will have a difference of 50 in the temporal position IDs. + input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision. + vision temporal position_ids: [0, 0, 0, 0, 50, 50, 50, 50, 100, 100, 100, 100] + vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1] + vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1] + text temporal position_ids: [101, 102, 103, 104, 105] + text height position_ids: [101, 102, 103, 104, 105] + text width position_ids: [101, 102, 103, 104, 105] + Here we calculate the text start position_ids as the max vision position_ids plus 1. + + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*): + The temporal, height and width of feature shape of each image in LLM. + video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*): + The temporal, height and width of feature shape of each video in LLM. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + Returns: + position_ids (`torch.LongTensor` of shape `(3, batch_size, sequence_length)`) + mrope_position_deltas (`torch.Tensor` of shape `(batch_size)`) + """ + + spatial_merge_size = self.config.vision_config.spatial_merge_size + image_token_id = self.config.image_token_id + video_start_token_id = self.config.video_start_token_id + video_end_token_id = self.config.video_end_token_id + + mrope_position_deltas = [] + if input_ids is not None and (image_grid_thw is not None or video_grid_thw is not None): + total_input_ids = input_ids + if attention_mask is None: + attention_mask = torch.ones_like(total_input_ids) + position_ids = torch.ones( + 3, + input_ids.shape[0], + input_ids.shape[1], + dtype=input_ids.dtype, + device=input_ids.device, + ) + image_index, video_index = 0, 0 + video_group_index = 0 + attention_mask = attention_mask.to(total_input_ids.device) + for i, input_ids in enumerate(total_input_ids): + input_ids = input_ids[attention_mask[i] == 1] + input_tokens = input_ids.tolist() + + input_token_type = [] + video_check_flg = False + for token in input_tokens: + if token == video_start_token_id: + video_check_flg = True + elif token == video_end_token_id: + video_check_flg = False + + if token == image_token_id and not video_check_flg: + input_token_type.append("image") + elif token == image_token_id and video_check_flg: + input_token_type.append("video") + else: + input_token_type.append("text") + + input_type_group = [] + for key, group in itertools.groupby(enumerate(input_token_type), lambda x: x[1]): + group = list(group) + start_index = group[0][0] + end_index = group[-1][0] + 1 + input_type_group.append((key, start_index, end_index)) + + llm_pos_ids_list = [] + video_frame_num = 1 + for modality_type, start_idx, end_idx in
input_type_group: + st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0 + + if modality_type == "image": + t, h, w = ( + image_grid_thw[image_index][0], + image_grid_thw[image_index][1], + image_grid_thw[image_index][2], + ) + llm_grid_t, llm_grid_h, llm_grid_w = ( + t.item(), + h.item() // spatial_merge_size, + w.item() // spatial_merge_size, + ) + + t_index = torch.arange(llm_grid_t).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w).flatten() + h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(llm_grid_t, -1, llm_grid_w).flatten() + w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(llm_grid_t, llm_grid_h, -1).flatten() + llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + st_idx) + + image_index += 1 + video_frame_num = 1 + + elif modality_type == "video": + t, h, w = ( + video_frame_num, + video_grid_thw[video_index][1], + video_grid_thw[video_index][2], + ) + + llm_grid_t, llm_grid_h, llm_grid_w = ( + t, + h.item() // spatial_merge_size, + w.item() // spatial_merge_size, + ) + + for t_idx in range(llm_grid_t): + t_index = torch.tensor(t_idx).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w).flatten() + + h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(1, -1, llm_grid_w).flatten() + w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(1, llm_grid_h, -1).flatten() + llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + st_idx) + + video_group_index += 1 + + if video_group_index >= video_grid_thw[video_index][0]: + video_index += 1 + video_group_index = 0 + + video_frame_num += 1 + + else: + text_len = end_idx - start_idx + llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx) + + video_frame_num = 1 + + llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1) + position_ids[..., i, attention_mask[i] == 1] = llm_positions.to(position_ids.device) + mrope_position_deltas.append(llm_positions.max() + 1 - len(total_input_ids[i])) + mrope_position_deltas = torch.tensor(mrope_position_deltas, device=input_ids.device).unsqueeze(1) + return position_ids, mrope_position_deltas + else: + if attention_mask is not None: + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + position_ids = position_ids.unsqueeze(0).expand(3, -1, -1).to(attention_mask.device) + max_position_ids = position_ids.max(0, keepdim=False)[0].max(-1, keepdim=True)[0] + mrope_position_deltas = max_position_ids + 1 - attention_mask.shape[-1] + else: + position_ids = ( + torch.arange(input_ids.shape[1], device=input_ids.device) + .view(1, 1, -1) + .expand(3, input_ids.shape[0], -1) + ) + mrope_position_deltas = torch.zeros( + [input_ids.shape[0], 1], + device=input_ids.device, + dtype=input_ids.dtype, + ) + + return position_ids, mrope_position_deltas + + def get_video_features( + self, pixel_values_videos: torch.FloatTensor, video_grid_thw: Optional[torch.LongTensor] = None + ): + """ + Encodes videos into continuous embeddings that can be forwarded to the language model. + + Args: + pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): + The tensors corresponding to the input videos. + video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*): + The temporal, height and width of feature shape of each video in LLM. 
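+ + Returns: + A tuple of `torch.FloatTensor`, one entry per video, each of shape + `(t * h * w // spatial_merge_size**2, out_hidden_size)`, obtained by splitting the flattened visual + features with `video_grid_thw.prod(-1) // spatial_merge_size**2`.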
+ """ + pixel_values_videos = pixel_values_videos.type(self.visual.dtype) + # reshape video_grid_thw -> [b, 3] -> [1, h, w] * frames + temp_frames_hw = [] + for t, h, w in video_grid_thw: + repeated_row = torch.tensor([1, h.item(), w.item()]).unsqueeze(0).repeat(t, 1) + temp_frames_hw.append(repeated_row) + flattened_video_grid_thw = torch.cat(temp_frames_hw, dim=0) + video_embeds = self.visual(pixel_values_videos, grid_thw=flattened_video_grid_thw) + split_sizes = (video_grid_thw.prod(-1) // self.visual.spatial_merge_size**2).tolist() + video_embeds = torch.split(video_embeds, split_sizes) + return video_embeds + + def get_image_features(self, pixel_values: torch.FloatTensor, image_grid_thw: Optional[torch.LongTensor] = None): + """ + Encodes images into continuous embeddings that can be forwarded to the language model. + + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): + The tensors corresponding to the input images. + image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*): + The temporal, height and width of feature shape of each image in LLM. + """ + pixel_values = pixel_values.type(self.visual.dtype) + image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw) + split_sizes = (image_grid_thw.prod(-1) // self.visual.spatial_merge_size**2).tolist() + image_embeds = torch.split(image_embeds, split_sizes) + return image_embeds + + def get_placeholder_mask( + self, + input_ids: torch.LongTensor, + inputs_embeds: torch.FloatTensor, + image_features: Optional[torch.FloatTensor] = None, + video_features: Optional[torch.FloatTensor] = None, + ): + """ + Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is + equal to the length of multimodal features. If the lengths are different, an error is raised. 
+ """ + if input_ids is None: + special_image_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + special_image_mask = special_image_mask.all(-1) + special_video_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.video_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + special_video_mask = special_video_mask.all(-1) + else: + # GLM-4.1V and GLM-4.5V special_video_mask is special_image_mask + special_image_mask = input_ids == self.config.image_token_id + special_video_mask = input_ids == self.config.image_token_id + + n_image_tokens = special_image_mask.sum() + special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + if image_features is not None and inputs_embeds[special_image_mask].numel() != image_features.numel(): + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {image_features.shape[0]}" + ) + + n_video_tokens = special_video_mask.sum() + special_video_mask = special_video_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + if video_features is not None and inputs_embeds[special_video_mask].numel() != video_features.numel(): + raise ValueError( + f"Videos features and video tokens do not match: tokens: {n_video_tokens}, features {video_features.shape[0]}" + ) + + return special_image_mask, special_video_mask + + @auto_docstring + @can_return_tuple + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + pixel_values: Optional[torch.Tensor] = None, + pixel_values_videos: Optional[torch.FloatTensor] = None, + image_grid_thw: Optional[torch.LongTensor] = None, + video_grid_thw: Optional[torch.LongTensor] = None, + rope_deltas: Optional[torch.LongTensor] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> Union[tuple, Glm4vModelOutputWithPast]: + r""" + image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*): + The temporal, height and width of feature shape of each image in LLM. + video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*): + The temporal, height and width of feature shape of each video in LLM. + rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*): + The rope index difference between sequence length and multimodal rope. 
+ """ + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings()(input_ids) + + if pixel_values is not None: + image_embeds = self.get_image_features(pixel_values, image_grid_thw) + image_embeds = torch.cat(image_embeds, dim=0).to(inputs_embeds.device, inputs_embeds.dtype) + image_mask, _ = self.get_placeholder_mask(input_ids, inputs_embeds, image_features=image_embeds) + inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds) + + if pixel_values_videos is not None: + video_embeds = self.get_video_features(pixel_values_videos, video_grid_thw) + video_embeds = torch.cat(video_embeds, dim=0).to(inputs_embeds.device, inputs_embeds.dtype) + _, video_mask = self.get_placeholder_mask(input_ids, inputs_embeds, video_features=video_embeds) + inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds) + + if position_ids is None: + attention_mask_tensor = ( + attention_mask if not isinstance(attention_mask, dict) else attention_mask["full_attention"] + ) + if attention_mask_tensor is not None and attention_mask_tensor.ndim == 4: + attention_mask_tensor = torch.diagonal(attention_mask_tensor[:, 0], dim1=1, dim2=2) + # Only apply conversion for floating point tensors (inverted masks) + if attention_mask_tensor.dtype.is_floating_point: + attention_mask_tensor = attention_mask_tensor / torch.finfo(attention_mask_tensor.dtype).min + attention_mask_tensor = (1.0 - attention_mask_tensor).int() + + # Calculate RoPE index once per generation in the pre-fill stage only. + # When compiling, we can't check tensor values thus we check only input length + # It is safe to assume that `length!=1` means we're in pre-fill because compiled + # models currently cannot do asssisted decoding + prefill_compiled_stage = is_torchdynamo_compiling() and ( + (input_ids is not None and input_ids.shape[1] != 1) + or (inputs_embeds is not None and inputs_embeds.shape[1] != 1) + ) + prefill_noncompiled_stage = not is_torchdynamo_compiling() and ( + (cache_position is not None and cache_position[0] == 0) + or (past_key_values is None or past_key_values.get_seq_length() == 0) + ) + if (prefill_compiled_stage or prefill_noncompiled_stage) or self.rope_deltas is None: + position_ids, rope_deltas = self.get_rope_index( + input_ids, + image_grid_thw, + video_grid_thw, + attention_mask=attention_mask_tensor, + ) + self.rope_deltas = rope_deltas + # then use the prev pre-calculated rope-deltas to get the correct position ids + else: + batch_size, seq_length, _ = inputs_embeds.shape + delta = ( + (cache_position[0] + self.rope_deltas).to(inputs_embeds.device) + if cache_position is not None + else 0 + ) + position_ids = torch.arange(seq_length, device=inputs_embeds.device) + position_ids = position_ids.view(1, -1).expand(batch_size, -1) + if cache_position is not None: # otherwise `deltas` is an int `0` + delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=0) + position_ids = position_ids.add(delta) + position_ids = position_ids.unsqueeze(0).expand(3, -1, -1) + + outputs = self.language_model( + input_ids=None, + position_ids=position_ids, + attention_mask=attention_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + cache_position=cache_position, + **kwargs, + ) + + return Glm4vModelOutputWithPast( + last_hidden_state=outputs.last_hidden_state, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, 
+ attentions=outputs.attentions, + rope_deltas=self.rope_deltas, + ) + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for Glm4v causal language model (or autoregressive) outputs. + """ +) +class Glm4vCausalLMOutputWithPast(ModelOutput): + r""" + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache). + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*): + The rope index difference between sequence length and multimodal rope. + """ + + loss: Optional[torch.FloatTensor] = None + logits: Optional[torch.FloatTensor] = None + past_key_values: Optional[Cache] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + rope_deltas: Optional[torch.LongTensor] = None + + +class Glm4vForConditionalGeneration(Glm4vPreTrainedModel, GenerationMixin): + _checkpoint_conversion_mapping = {} + _tied_weights_keys = ["lm_head.weight"] + # Reference: fix gemma3 grad acc #37208 + accepts_loss_kwargs = False + + def __init__(self, config): + super().__init__(config) + self.model = Glm4vModel(config) + self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False) + + self.post_init() + + def get_input_embeddings(self): + return self.model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.model.set_input_embeddings(value) + + def set_decoder(self, decoder): + self.model.set_decoder(decoder) + + def get_decoder(self): + return self.model.get_decoder() + + def get_video_features( + self, pixel_values_videos: torch.FloatTensor, video_grid_thw: Optional[torch.LongTensor] = None + ): + return self.model.get_video_features(pixel_values_videos, video_grid_thw) + + def get_image_features(self, pixel_values: torch.FloatTensor, image_grid_thw: Optional[torch.LongTensor] = None): + return self.model.get_image_features(pixel_values, image_grid_thw) + + # Make modules available through conditional class for BC + @property + def language_model(self): + return self.model.language_model + + @property + def visual(self): + return self.model.visual + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.Tensor] = None, + pixel_values_videos: Optional[torch.FloatTensor] = None, + image_grid_thw: Optional[torch.LongTensor] = None, + video_grid_thw: Optional[torch.LongTensor] = None, + rope_deltas: Optional[torch.LongTensor] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **kwargs: 
Unpack[TransformersKwargs], + ) -> Union[tuple, Glm4vCausalLMOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*): + The temporal, height and width of feature shape of each image in LLM. + video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*): + The temporal, height and width of feature shape of each video in LLM. + rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*): + The rope index difference between sequence length and multimodal rope. + + Example: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, Glm4vForConditionalGeneration + + >>> model = Glm4vForConditionalGeneration.from_pretrained("THUDM/GLM-4.1V-9B-Thinking") + >>> processor = AutoProcessor.from_pretrained("THUDM/GLM-4.1V-9B-Thinking") + + >>> messages = [ + { + "role": "user", + "content": [ + {"type": "image"}, + {"type": "text", "text": "What is shown in this image?"}, + ], + }, + ] + >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) + >>> inputs = processor(text=[text], images=[image], return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "The image shows a street scene with a red stop sign in the foreground. In the background, there is a large red gate with Chinese characters ..."
+ ```""" + outputs = self.model( + input_ids=input_ids, + pixel_values=pixel_values, + pixel_values_videos=pixel_values_videos, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + position_ids=position_ids, + attention_mask=attention_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs[0] + + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) + + loss = None + if labels is not None: + loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size) + + return Glm4vCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + rope_deltas=outputs.rope_deltas, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + attention_mask=None, + inputs_embeds=None, + cache_position=None, + position_ids=None, + use_cache=True, + pixel_values=None, + pixel_values_videos=None, + image_grid_thw=None, + video_grid_thw=None, + **kwargs, + ): + # Overwritten -- in specific circumstances we don't want to forward image inputs to the model + + model_inputs = super().prepare_inputs_for_generation( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + cache_position=cache_position, + position_ids=position_ids, + pixel_values=pixel_values, + pixel_values_videos=pixel_values_videos, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + use_cache=use_cache, + **kwargs, + ) + + # GLM-4.1V position_ids are prepareed with rope_deltas in forward + model_inputs["position_ids"] = None + + if cache_position[0] != 0: + model_inputs["pixel_values"] = None + model_inputs["pixel_values_videos"] = None + + return model_inputs + + def _get_image_nums_and_video_nums( + self, + input_ids: Optional[torch.LongTensor], + inputs_embeds: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + """ + Get the number of images and videos for each sample to calculate the separation length of the sample tensor. + These parameters are not passed through the processor to avoid unpredictable impacts from interface modifications. + + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. 
+ + Returns: + image_nums (`torch.LongTensor` of shape `(batch_size, num_images_sample)`) + video_nums (`torch.LongTensor` of shape `(batch_size, num_videos_sample)`) + """ + + if inputs_embeds is not None: + is_image = ( + inputs_embeds + == self.get_input_embeddings()( + torch.tensor(self.config.image_start_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + )[..., 0] + is_video_start = ( + inputs_embeds + == self.get_input_embeddings()( + torch.tensor(self.config.video_start_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + )[..., 0] + is_video_end = ( + inputs_embeds + == self.get_input_embeddings()( + torch.tensor(self.config.video_end_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + )[..., 0] + else: + is_image = input_ids == self.config.image_start_token_id + is_video_start = input_ids == self.config.video_start_token_id + is_video_end = input_ids == self.config.video_end_token_id + + # Cumulative sum to track if we're inside a video span + # We'll assume well-formed video tags (i.e. matching starts and ends) + video_level = torch.cumsum(is_video_start.int() - is_video_end.int(), dim=1) + inside_video = video_level > 0 # shape (batch_size, seq_length) + + # Mask out image tokens that are inside video spans + standalone_images = is_image & (~inside_video) + + # Count per batch + image_counts = standalone_images.sum(dim=1) + video_counts = is_video_start.sum(dim=1) + + return image_counts, video_counts + + def _expand_inputs_for_generation( + self, + expand_size: int = 1, + is_encoder_decoder: bool = False, + input_ids: Optional[torch.LongTensor] = None, + **model_kwargs, + ) -> tuple[torch.LongTensor, dict[str, Any]]: + # Overwritten -- Support for expanding tensors without a batch size dimension + # e.g., pixel_values, image_grid_thw, pixel_values_videos, video_grid_thw, second_per_grid_t + # pixel_values.shape[0] is sum(seqlen_images for samples) + # image_grid_thw.shape[0] is sum(num_images for samples) + + if expand_size == 1: + return input_ids, model_kwargs + + visual_keys = ["pixel_values", "image_grid_thw", "pixel_values_videos", "video_grid_thw", "second_per_grid_ts"] + + def _expand_dict_for_generation_visual(dict_to_expand): + image_grid_thw = model_kwargs.get("image_grid_thw", None) + video_grid_thw = model_kwargs.get("video_grid_thw", None) + image_nums, video_nums = self._get_image_nums_and_video_nums( + input_ids, inputs_embeds=model_kwargs.get("inputs_embeds", None) + ) + + def _repeat_interleave_samples(x, lengths, repeat_times): + samples = torch.split(x, lengths) + repeat_args = [repeat_times] + [1] * (x.dim() - 1) + result = torch.cat([sample.repeat(*repeat_args) for sample in samples], dim=0) + return result + + for key in dict_to_expand: + if key == "pixel_values": + # split images into samples + samples = torch.split(image_grid_thw, list(image_nums)) + # compute the sequence length of images for each sample + lengths = [torch.prod(sample, dim=1).sum() for sample in samples] + dict_to_expand[key] = _repeat_interleave_samples( + dict_to_expand[key], lengths=lengths, repeat_times=expand_size + ) + elif key == "image_grid_thw": + # get the num of images for each sample + lengths = list(image_nums) + dict_to_expand[key] = _repeat_interleave_samples( + dict_to_expand[key], lengths=lengths, repeat_times=expand_size + ) + elif key == "pixel_values_videos": + samples = torch.split(video_grid_thw, list(video_nums)) + lengths = [torch.prod(sample, dim=1).sum() for sample in samples] + dict_to_expand[key] = _repeat_interleave_samples( 
+ dict_to_expand[key], lengths=lengths, repeat_times=expand_size + ) + elif key == "video_grid_thw": + lengths = list(video_nums) + dict_to_expand[key] = _repeat_interleave_samples( + dict_to_expand[key], lengths=lengths, repeat_times=expand_size + ) + elif key == "second_per_grid_ts": + dict_to_expand[key] = _repeat_interleave_samples( + dict_to_expand[key], lengths=list(video_nums), repeat_times=expand_size + ) + return dict_to_expand + + def _expand_dict_for_generation(dict_to_expand): + for key in dict_to_expand: + if ( + key != "cache_position" + and dict_to_expand[key] is not None + and isinstance(dict_to_expand[key], torch.Tensor) + and key not in visual_keys + ): + dict_to_expand[key] = dict_to_expand[key].repeat_interleave(expand_size, dim=0) + return dict_to_expand + + model_kwargs = _expand_dict_for_generation_visual(model_kwargs) + + if input_ids is not None: + input_ids = input_ids.repeat_interleave(expand_size, dim=0) + + model_kwargs = _expand_dict_for_generation(model_kwargs) + + if is_encoder_decoder: + if model_kwargs.get("encoder_outputs") is None: + raise ValueError("If `is_encoder_decoder` is True, make sure that `encoder_outputs` is defined.") + model_kwargs["encoder_outputs"] = _expand_dict_for_generation(model_kwargs["encoder_outputs"]) + + return input_ids, model_kwargs + + +__all__ = ["Glm4vForConditionalGeneration", "Glm4vModel", "Glm4vPreTrainedModel", "Glm4vTextModel"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glm4v/modular_glm4v.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glm4v/modular_glm4v.py new file mode 100644 index 0000000000000000000000000000000000000000..3f870db9db05b633d0a552b7576b44d27942493d --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glm4v/modular_glm4v.py @@ -0,0 +1,1699 @@ +# coding=utf-8 +# Copyright 2025 The ZhipuAI Inc. team and HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
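+# NOTE: this is the modular source for GLM-4V; the standalone modeling_glm4v.py above is generated +# from it by the transformers modular conversion tooling, composing building blocks imported from +# the GLM-4 and Qwen2.5-VL implementations.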
+import itertools +from typing import Callable, Optional, Union + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn import LayerNorm + +from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache +from ...configuration_utils import PretrainedConfig +from ...feature_extraction_utils import BatchFeature +from ...image_utils import ImageInput +from ...masking_utils import create_causal_mask +from ...modeling_flash_attention_utils import FlashAttentionKwargs +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import BaseModelOutputWithPast +from ...modeling_rope_utils import rope_config_validation +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS +from ...processing_utils import ImagesKwargs, Unpack +from ...tokenization_utils_base import PreTokenizedInput, TextInput +from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torchdynamo_compiling, logging +from ...utils.generic import check_model_inputs +from ...video_utils import VideoInput +from ..glm4.modeling_glm4 import Glm4MLP, Glm4RMSNorm, eager_attention_forward +from ..qwen2_5_vl.modeling_qwen2_5_vl import ( + Qwen2_5_VisionPatchEmbed, + Qwen2_5_VisionRotaryEmbedding, + Qwen2_5_VLCausalLMOutputWithPast, + Qwen2_5_VLForConditionalGeneration, + Qwen2_5_VLMLP, + Qwen2_5_VLModel, + Qwen2_5_VLModelOutputWithPast, + Qwen2_5_VLPreTrainedModel, + Qwen2_5_VLRotaryEmbedding, + Qwen2_5_VLTextModel, + Qwen2_5_VLVisionAttention, + Qwen2_5_VLVisionBlock, +) +from ..qwen2_5_vl.processing_qwen2_5_vl import Qwen2_5_VLVideosProcessorKwargs +from ..qwen2_vl.processing_qwen2_vl import ( + Qwen2_VLProcessor, + Qwen2_VLProcessorKwargs, +) + + +logger = logging.get_logger(__name__) + + +class Glm4vVisionConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Glm4vVisionModel`]. It is used to instantiate a Glm4vVisionModel + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield + a similar configuration to that of + GLM-4.1V-9B-Thinking [THUDM/GLM-4.1V-9B-Thinking](https://huggingface.co/THUDM/GLM-4.1V-9B-Thinking). + + Args: + hidden_size (`int`, *optional*, defaults to 1536): + Dimensionality of the encoder layers and the pooler layer. + depth (`int`, *optional*, defaults to 24): + Number of layers (depth) in the model. + attention_bias (`bool`, *optional*, defaults to `False`): + Whether to add a bias to the queries, keys and values. + intermediate_size (`int`, *optional*, defaults to 13696): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (`float`, *optional*, defaults to 0.0): + Dropout probability for attention weights. + projection_dropout (`float`, *optional*, defaults to 0.0): + Dropout probability for the projection layer. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ image_size (`int` or `list[int]`, *optional*, defaults to `[336, 336]`): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to `14`): + The size (resolution) of each patch. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + out_hidden_size (`int`, *optional*, defaults to 4096): + The output hidden size of the vision model. + rms_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the rms normalization layers. + spatial_merge_size (`int`, *optional*, defaults to 2): + The size used for merging spatial dimensions. + temporal_patch_size (`int`, *optional*, defaults to 2): + The size used for patches along the temporal dimension. + Example: + + ```python + >>> from transformers import Glm4vVisionConfig, Glm4vVisionModel + + >>> # Initializing a Glm4vVisionConfig GLM-4.1V-9B style configuration + >>> configuration = Glm4vVisionConfig() + + >>> # Initializing a model (with random weights) from the GLM-4.1V-9B configuration + >>> model = Glm4vVisionModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "glm4v" + base_config_key = "vision_config" + + def __init__( + self, + depth=24, + hidden_size=1536, + hidden_act="silu", + attention_bias=False, + attention_dropout=0.0, + num_heads=12, + in_channels=3, + image_size=336, + patch_size=14, + rms_norm_eps=1e-05, + spatial_merge_size=2, + temporal_patch_size=2, + out_hidden_size=4096, + intermediate_size=13696, + initializer_range=0.02, + **kwargs, + ): + super().__init__(**kwargs) + + self.depth = depth + self.hidden_size = hidden_size + self.hidden_act = hidden_act + self.num_heads = num_heads + self.in_channels = in_channels + self.image_size = image_size + self.patch_size = patch_size + self.spatial_merge_size = spatial_merge_size + self.temporal_patch_size = temporal_patch_size + self.out_hidden_size = out_hidden_size + self.intermediate_size = intermediate_size + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + + +class Glm4vTextConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Glm4vModel`]. It is used to instantiate a + GLM-4.1V model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of + GLM-4.1V-9B-Thinking [THUDM/GLM-4.1V-9B-Thinking](https://huggingface.co/THUDM/GLM-4.1V-9B-Thinking). + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 151552): + Vocabulary size of the Glm4v model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`Glm4vModel`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 13696): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 40): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer encoder. 
+ num_key_value_heads (`int`, *optional*, defaults to 2): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `32`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 32768): + The maximum sequence length that this model might ever be used with. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether the model's input and output word embeddings should be tied. + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + rope_scaling (`Dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type + and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value + accordingly. + Expected contents: + `rope_type` (`str`): + The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', + 'llama3'], with 'default' being the original RoPE implementation. + `factor` (`float`, *optional*): + Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In + most scaling types, a `factor` of x will enable the model to handle sequences of length x * + original maximum pre-trained length. + `original_max_position_embeddings` (`int`, *optional*): + Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during + pretraining. + `attention_factor` (`float`, *optional*): + Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention + computation. If unspecified, it defaults to value recommended by the implementation, using the + `factor` field to infer the suggested value. + image_token_id (`int`, *optional*): + Token index used as placeholder for image embeddings. + video_token_id (`int`, *optional*): + Token index used as placeholder for video embeddings. 
+ + ```python + >>> from transformers import Glm4vTextModel, Glm4vConfig + + >>> # Initializing a GLM-4.1V style configuration + >>> configuration = Glm4vConfig() + + >>> # Initializing a model from the GLM-4.1V style configuration + >>> model = Glm4vTextModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "glm4v_text" + base_config_key = "text_config" + keys_to_ignore_at_inference = ["past_key_values"] + # Default tensor parallel plan for base model `Glm4v` + base_model_tp_plan = { + "layers.*.self_attn.q_proj": "colwise", + "layers.*.self_attn.k_proj": "colwise", + "layers.*.self_attn.v_proj": "colwise", + "layers.*.self_attn.o_proj": "rowwise", + "layers.*.mlp.gate_up_proj": "colwise_rep", # we need to replicate here due to the `chunk` operation + "layers.*.mlp.down_proj": "rowwise_rep", # we need to replicate here due to the `chunk` operation + } + base_model_pp_plan = { + "embed_tokens": (["input_ids"], ["inputs_embeds"]), + "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), + "norm": (["hidden_states"], ["hidden_states"]), + } + + def __init__( + self, + vocab_size=151552, + hidden_size=4096, + intermediate_size=13696, + num_hidden_layers=40, + num_attention_heads=32, + num_key_value_heads=2, + hidden_act="silu", + max_position_embeddings=32768, + initializer_range=0.02, + rms_norm_eps=1e-05, + use_cache=True, + tie_word_embeddings=False, + rope_theta=10000.0, + attention_dropout=0.0, + rope_scaling=None, + image_token_id=None, + video_token_id=None, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.attention_dropout = attention_dropout + self.rope_scaling = rope_scaling + + # Validate the correctness of rotary position embeddings parameters + # BC: if there is a 'type' field, move it to 'rope_type'. + if self.rope_scaling is not None and "type" in self.rope_scaling: + self.rope_scaling["rope_type"] = self.rope_scaling["type"] + rope_config_validation(self, ignore_keys={"mrope_section"}) + self.image_token_id = image_token_id + self.video_token_id = video_token_id + + super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) + + +class Glm4vConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Glm4vModel`]. It is used to instantiate a + GLM-4.1V model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of + GLM-4.1V-9B-Thinking [THUDM/GLM-4.1V-9B-Thinking](https://huggingface.co/THUDM/GLM-4.1V-9B-Thinking). + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + text_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `Glm4vTextConfig`): + The config object or dictionary of the text backbone. 
+ vision_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `Glm4vVisionConfig`): + The config object or dictionary of the vision backbone. + image_token_id (`int`, *optional*, defaults to 151343): + The image token index to encode the image prompt. + video_token_id (`int`, *optional*, defaults to 151344): + The video token index to encode the image prompt. + image_start_token_id (`int`, *optional*, defaults to 151339): + The image start token index to encode the start of image. + image_end_token_id (`int`, *optional*, defaults to 151340): + The image end token index to encode the end of image. + video_start_token_id (`int`, *optional*, defaults to 151341): + The video start token index to encode the start of video. + video_end_token_id (`int`, *optional*, defaults to 151342): + The video end token index to encode the end of video. + + ```python + >>> from transformers import Glm4vForConditionalGeneration, Glm4vConfig + + >>> # Initializing a GLM-4.1V style configuration + >>> configuration = Glm4vConfig() + + >>> # Initializing a model from the GLM-4.1V style configuration + >>> model = Glm4vForConditionalGeneration(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "glm4v" + sub_configs = {"vision_config": Glm4vVisionConfig, "text_config": Glm4vTextConfig} + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + text_config=None, + vision_config=None, + image_token_id=151343, + video_token_id=151344, + image_start_token_id=151339, + image_end_token_id=151340, + video_start_token_id=151341, + video_end_token_id=151342, + **kwargs, + ): + if isinstance(vision_config, dict): + self.vision_config = self.sub_configs["vision_config"](**vision_config) + elif vision_config is None: + self.vision_config = self.sub_configs["vision_config"]() + + if isinstance(text_config, dict): + self.text_config = self.sub_configs["text_config"](**text_config) + elif text_config is None: + self.text_config = self.sub_configs["text_config"](**kwargs) + + self.image_token_id = image_token_id + self.video_token_id = video_token_id + self.video_start_token_id = video_start_token_id + self.video_end_token_id = video_end_token_id + self.image_start_token_id = image_start_token_id + self.image_end_token_id = image_end_token_id + + super().__init__(**kwargs) + + +# Will be used for both Text and Vision modalities +class Glm4vRMSNorm(Glm4RMSNorm): + pass + + +class Glm4VisionMlp(Qwen2_5_VLMLP): + def __init__(self, config, bias: bool = False): + super().__init__(config, bias) + self.intermediate_size = config.out_hidden_size + + +class Glm4vVisionPatchEmbed(Qwen2_5_VisionPatchEmbed): + def __init__(self, config: Glm4vVisionConfig) -> None: + nn.Module.__init__(self) + self.patch_size = config.patch_size + self.temporal_patch_size = config.temporal_patch_size + self.in_channels = config.in_channels + self.embed_dim = config.hidden_size + + kernel_size = [self.temporal_patch_size, self.patch_size, self.patch_size] + self.proj = nn.Conv3d(self.in_channels, self.embed_dim, kernel_size=kernel_size, stride=kernel_size) + + +class Glm4vVisionRotaryEmbedding(Qwen2_5_VisionRotaryEmbedding): + pass + + +class Glm4vVisionPatchMerger(nn.Module): + def __init__(self, dim: int, context_dim: int, hidden_act: str, bias: bool = False) -> None: + super().__init__() + self.proj = nn.Linear(dim, dim, bias=bias) + self.post_projection_norm = LayerNorm(dim) + self.gate_proj = nn.Linear(dim, context_dim, bias=bias) + self.up_proj = 
nn.Linear(dim, context_dim, bias=bias) + self.down_proj = nn.Linear(context_dim, dim, bias=bias) + self.act1 = nn.GELU() + self.act_fn = ACT2FN[hidden_act] + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + hidden_state = self.proj(hidden_state) + hidden_state = self.act1(self.post_projection_norm(hidden_state)) + return self.down_proj(self.act_fn(self.gate_proj(hidden_state)) * self.up_proj(hidden_state)) + + +class Glm4vVisionEmbeddings(nn.Module): + def __init__(self, config: Glm4vVisionConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.num_patches = (self.image_size // self.patch_size) ** 2 + self.num_positions = self.num_patches + self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) + self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False) + + def forward(self, embeddings, lengths, image_shapes, h_coords, w_coords) -> torch.Tensor: + """ + Forward pass with integrated position encoding adaptation using 2D interpolation. + + Args: + embeddings: Input embeddings tensor + lengths (torch.Tensor): Sequence lengths for each image in the batch. + image_shapes (torch.Tensor): Tensor of shape [batch_size, 3] representing the image shapes (t, h, w). + h_coords (torch.Tensor): Tensor of shape [total_seq] representing the h coordinate for each patch. + w_coords (torch.Tensor): Tensor of shape [total_seq] representing the w coordinate for each patch. + + Returns: + torch.Tensor: Embeddings with adapted position encoding added. + """ + # Get position embedding parameters + pos_embed_weight = self.position_embedding.weight + hidden_size = pos_embed_weight.shape[1] + total_seq = h_coords.shape[0] + device = pos_embed_weight.device + + # Move coordinates to correct device + h_coords, w_coords = h_coords.to(device), w_coords.to(device) + + # Handle empty sequence case + if total_seq == 0: + adapted_pos_embed = torch.empty(0, hidden_size, device=device, dtype=pos_embed_weight.dtype) + else: + # Convert inputs to tensors if needed + if isinstance(lengths, list): + lengths = torch.tensor(lengths, device=device, dtype=torch.long) + if not isinstance(image_shapes, torch.Tensor): + image_shapes = torch.tensor(image_shapes, device=device, dtype=torch.long) + + # Prepare 2D position embedding + orig_size_sq = pos_embed_weight.shape[0] + orig_size = int(orig_size_sq**0.5) + pos_embed_2d = ( + pos_embed_weight.view(orig_size, orig_size, hidden_size) + .permute(2, 0, 1) + .unsqueeze(0) + .to(device=device, dtype=torch.float32) + ) + + # Calculate target dimensions for each patch + target_h = torch.cat([image_shapes[i, 1].repeat(lengths[i]) for i in range(len(lengths))]).to( + device=device, dtype=torch.float32 + ) + target_w = torch.cat([image_shapes[i, 2].repeat(lengths[i]) for i in range(len(lengths))]).to( + device=device, dtype=torch.float32 + ) + + # Normalize coordinates to [-1, 1] range for grid_sample + h_coords = h_coords.to(device=device, dtype=torch.float32) + w_coords = w_coords.to(device=device, dtype=torch.float32) + norm_w = ((w_coords + 0.5) / target_w) * 2 - 1 + norm_h = ((h_coords + 0.5) / target_h) * 2 - 1 + + # Create sampling grid + grid = torch.stack((norm_w, norm_h), dim=-1).unsqueeze(0).unsqueeze(2) + + # Perform bicubic interpolation + interpolated_embed_fp32 = F.grid_sample( + pos_embed_2d, grid, mode="bicubic", align_corners=False, padding_mode="border" + ) + + # Reshape and 
convert back to original dtype + adapted_pos_embed_fp32 = interpolated_embed_fp32.squeeze(0).squeeze(-1).permute(1, 0) + adapted_pos_embed = adapted_pos_embed_fp32.to(pos_embed_weight.dtype).to(embeddings.device) + + # Add adapted position encoding to embeddings + embeddings = embeddings + adapted_pos_embed + return embeddings + + +class Glm4vVisionAttention(Qwen2_5_VLVisionAttention): + def __init__(self, config: Glm4vVisionConfig) -> None: + super().__init__(config) + self.attention_dropout = config.attention_dropout + self.qkv = nn.Linear(config.hidden_size, config.hidden_size * 3, bias=config.attention_bias) + self.proj = nn.Linear(config.hidden_size, config.hidden_size, bias=False) + + +class Glm4vVisionBlock(Qwen2_5_VLVisionBlock): + def __init__(self, config) -> None: + super().__init__(config) + self.norm1 = Glm4vRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.norm2 = Glm4vRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.attn = Glm4vVisionAttention(config) + self.mlp = Glm4VisionMlp(config, bias=False) + + +class Glm4vTextRotaryEmbedding(Qwen2_5_VLRotaryEmbedding): + pass + + +def rotate_half_llm(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., 0::2] + x2 = x[..., 1::2] + return torch.stack((-x2, x1), dim=-1).flatten(-2) + + +def apply_multimodal_rotary_pos_emb(q, k, cos, sin, mrope_section, unsqueeze_dim=1): + """Applies Rotary Position Embedding with Multimodal Sections to the query and key tensors (https://qwenlm.github.io/blog/qwen2-vl/). + + Explanation: + Multimodal 3D rotary position embedding is an extension to 1D rotary position embedding. The input embedding + sequence contains vision (images / videos) embedding and text embedding or just contains text embedding. For + vision embedding part, we apply rotary position embedding on temporal, height and width dimension separately. + Here we split the channel dimension to 3 chunks for the temporal, height and width rotary position embedding. + For text embedding part, we just apply 1D rotary position embedding. The three rotary position index (temporal, + height and width) of text embedding is always the same, so the text embedding rotary position embedding has no + difference with modern LLMs. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + mrope_section(`List(int)`): + Multimodal rope section is for channel dimension of temporal, height and width in rope calculation. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
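+
+     Example (an illustrative sketch only: the shapes and the `mrope_section` values below are
+     assumed for demonstration and do not come from a released checkpoint):
+
+     ```python
+     >>> import torch
+     >>> q = torch.randn(1, 32, 10, 128)   # (batch, query heads, seq_len, head_dim)
+     >>> k = torch.randn(1, 2, 10, 128)    # (batch, key/value heads, seq_len, head_dim)
+     >>> cos = torch.randn(3, 1, 10, 128)  # leading dim 3 = temporal / height / width
+     >>> sin = torch.randn(3, 1, 10, 128)
+     >>> # the three section sizes must sum to head_dim // 2 (16 + 24 + 24 = 64 here)
+     >>> q_embed, k_embed = apply_multimodal_rotary_pos_emb(q, k, cos, sin, mrope_section=[16, 24, 24])
+     >>> q_embed.shape, k_embed.shape
+     (torch.Size([1, 32, 10, 128]), torch.Size([1, 2, 10, 128]))
+     ```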
+ """ + mrope_section = mrope_section * 2 + cos = torch.cat([m[i % 3] for i, m in enumerate(cos.split(mrope_section, dim=-1))], dim=-1).unsqueeze( + unsqueeze_dim + ) + sin = torch.cat([m[i % 3] for i, m in enumerate(sin.split(mrope_section, dim=-1))], dim=-1).unsqueeze( + unsqueeze_dim + ) + + # Interleave them instead of usual shape + cos = cos[..., : cos.shape[-1] // 2].repeat_interleave(2, dim=-1) + sin = sin[..., : sin.shape[-1] // 2].repeat_interleave(2, dim=-1) + + # Keep half or full tensor for later concatenation + rotary_dim = cos.shape[-1] + q_rot, q_pass = q[..., :rotary_dim], q[..., rotary_dim:] + k_rot, k_pass = k[..., :rotary_dim], k[..., rotary_dim:] + + # Apply rotary embeddings on the first half or full tensor + q_embed = (q_rot * cos) + (rotate_half_llm(q_rot) * sin) + k_embed = (k_rot * cos) + (rotate_half_llm(k_rot) * sin) + + # Concatenate back to full shape + q_embed = torch.cat([q_embed, q_pass], dim=-1) + k_embed = torch.cat([k_embed, k_pass], dim=-1) + + return q_embed, k_embed + + +class Glm4vTextAttention(nn.Module): + """ + Multi-headed attention from 'Attention Is All You Need' paper. + and "Generating Long Sequences with Sparse Transformers". + """ + + def __init__(self, config: Glm4vTextConfig, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.is_causal = True + self.attention_dropout = config.attention_dropout + self.rope_scaling = config.rope_scaling + self.scaling = self.head_dim**-0.5 + + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True) + self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True) + self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True) + self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) + + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[FlashAttentionKwargs], + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_multimodal_rotary_pos_emb( # diff with Llama + query_states, key_states, cos, sin, self.rope_scaling["mrope_section"] + ) + + if past_key_values is not None: + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models + key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + 
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=0.0 if not self.training else self.attention_dropout, + scaling=self.scaling, + **kwargs, + ) + + attn_output = attn_output.reshape(bsz, q_len, -1).contiguous() + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights + + +class Glm4vTextMLP(Glm4MLP): + pass + + +class Glm4vTextDecoderLayer(GradientCheckpointingLayer): + def __init__(self, config: Glm4vTextConfig, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + self.self_attn = Glm4vTextAttention(config, layer_idx) + self.mlp = Glm4vTextMLP(config) + self.input_layernorm = Glm4vRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = Glm4vRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_self_attn_layernorm = Glm4vRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_mlp_layernorm = Glm4vRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, + ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, _ = self.self_attn( + hidden_states=hidden_states, + position_embeddings=position_embeddings, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = self.post_self_attn_layernorm(hidden_states) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = self.post_mlp_layernorm(hidden_states) + hidden_states = residual + hidden_states + + return hidden_states + + +class Glm4vModelOutputWithPast(Qwen2_5_VLModelOutputWithPast): + pass + + +class Glm4vPreTrainedModel(Qwen2_5_VLPreTrainedModel): + _no_split_modules = ["Glm4vTextDecoderLayer", "Glm4vVisionBlock"] + _can_record_outputs = { + "hidden_states": Glm4vTextDecoderLayer, + "attentions": Glm4vTextAttention, + } + + +class Glm4vVisionModel(Glm4vPreTrainedModel): + config: Glm4vVisionConfig + _no_split_modules = ["Glm4vVisionBlock"] + + def __init__(self, config) -> None: + super().__init__(config) + self.spatial_merge_size = config.spatial_merge_size + self.patch_size = config.patch_size + + self.embeddings = Glm4vVisionEmbeddings(config) + self.patch_embed = Glm4vVisionPatchEmbed(config) + + head_dim = config.hidden_size // config.num_heads + self.rotary_pos_emb = Glm4vVisionRotaryEmbedding(head_dim // 2) + + self.blocks = nn.ModuleList([Glm4vVisionBlock(config) for _ in range(config.depth)]) + self.merger = Glm4vVisionPatchMerger( + dim=config.out_hidden_size, context_dim=config.intermediate_size, hidden_act=config.hidden_act + ) + + self.post_conv_layernorm = Glm4vRMSNorm(config.hidden_size, 
eps=config.rms_norm_eps) + self.downsample = nn.Conv2d( + in_channels=config.hidden_size, + out_channels=config.out_hidden_size, + kernel_size=config.spatial_merge_size, + stride=config.spatial_merge_size, + ) + self.post_layernorm = Glm4vRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.gradient_checkpointing = False + self.post_init() + + def rot_pos_emb(self, grid_thw): + pos_ids = [] + for t, h, w in grid_thw: + hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w) + hpos_ids = hpos_ids.reshape( + h // self.spatial_merge_size, + self.spatial_merge_size, + w // self.spatial_merge_size, + self.spatial_merge_size, + ) + hpos_ids = hpos_ids.permute(0, 2, 1, 3) + hpos_ids = hpos_ids.flatten() + + wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1) + wpos_ids = wpos_ids.reshape( + h // self.spatial_merge_size, + self.spatial_merge_size, + w // self.spatial_merge_size, + self.spatial_merge_size, + ) + wpos_ids = wpos_ids.permute(0, 2, 1, 3) + wpos_ids = wpos_ids.flatten() + pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)) + pos_ids = torch.cat(pos_ids, dim=0) + max_grid_size = grid_thw[:, 1:].max() + rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size) + rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1) + return rotary_pos_emb, pos_ids + + def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor) -> torch.Tensor: + """ + Args: + hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`): + The final hidden states of the model. + grid_thw (`torch.Tensor` of shape `(num_images_or_videos, 3)`): + The temporal, height and width of feature shape of each image in LLM. + + Returns: + `torch.Tensor`: hidden_states. + """ + hidden_states = self.patch_embed(hidden_states) + hidden_states = self.post_conv_layernorm(hidden_states) + + rotary_pos_emb, image_type_ids = self.rot_pos_emb(grid_thw) + emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1) + position_embeddings = (emb.cos(), emb.sin()) + + cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum( + dim=0, + # Select dtype based on the following factors: + # - FA2 requires that cu_seqlens_q must have dtype int32 + # - torch.onnx.export requires that cu_seqlens_q must have same dtype as grid_thw + # See https://github.com/huggingface/transformers/pull/34852 for more information + dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32, + ) + cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0) + seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() + hidden_states = self.embeddings(hidden_states, seqlens, grid_thw, image_type_ids[:, 0], image_type_ids[:, 1]) + + for blk in self.blocks: + hidden_states = blk( + hidden_states, + cu_seqlens=cu_seqlens, + position_embeddings=position_embeddings, + ) + + hidden_states = self.post_layernorm(hidden_states) + + hidden_states = hidden_states.view( + -1, self.spatial_merge_size, self.spatial_merge_size, hidden_states.shape[-1] + ) + hidden_states = hidden_states.permute(0, 3, 1, 2) + hidden_states = self.downsample(hidden_states).view(-1, self.config.out_hidden_size) + + hidden_states = self.merger(hidden_states) + return hidden_states + + +class Glm4vTextModel(Qwen2_5_VLTextModel): + def __init__(self, config: Glm4vTextConfig): + super().__init__(config) + self.layers = nn.ModuleList( + [Glm4vTextDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.norm = Glm4vRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.rotary_emb = 
Glm4vTextRotaryEmbedding(config=config) + del self._attn_implementation + del self.has_sliding_layers + + @auto_docstring + @check_model_inputs + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[FlashAttentionKwargs], + ) -> Union[tuple, BaseModelOutputWithPast]: + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + # torch.jit.trace() doesn't support cache objects in the output + if use_cache and past_key_values is None and not torch.jit.is_tracing(): + past_key_values = DynamicCache(config=self.config) + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if cache_position is None: + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + cache_position = torch.arange( + past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device + ) + + # the hard coded `3` is for temporal, height and width. + if position_ids is None: + position_ids = cache_position.view(1, 1, -1).expand(3, inputs_embeds.shape[0], -1) + elif position_ids.dim() == 2: + position_ids = position_ids[None, ...].expand(3, position_ids.shape[0], -1) + + causal_mask = create_causal_mask( + config=self.config, + input_embeds=inputs_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + past_key_values=past_key_values, + position_ids=position_ids, + ) + + hidden_states = inputs_embeds + + # create position embeddings to be shared across the decoder layers + position_embeddings = self.rotary_emb(hidden_states, position_ids) + + for decoder_layer in self.layers: + layer_outputs = decoder_layer( + hidden_states, + position_embeddings=position_embeddings, + attention_mask=causal_mask, + position_ids=position_ids, + past_key_values=past_key_values, + cache_position=cache_position, + **kwargs, + ) + hidden_states = layer_outputs + + hidden_states = self.norm(hidden_states) + + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=past_key_values, + ) + + +class Glm4vModel(Qwen2_5_VLModel): + _checkpoint_conversion_mapping = {} + _no_split_modules = ["Glm4vTextDecoderLayer", "Glm4vVisionBlock"] + + def __init__(self, config): + super().__init__(config) + self.visual = Glm4vVisionModel._from_config(config.vision_config) + + def get_rope_index( + self, + input_ids: Optional[torch.LongTensor] = None, + image_grid_thw: Optional[torch.LongTensor] = None, + video_grid_thw: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + """ + Calculate the 3D rope index based on image and video's temporal, height and width in LLM. + + Explanation: + Each embedding sequence contains vision embedding and text embedding or just contains text embedding. + + For pure text embedding sequence, the rotary position embedding has no difference with modern LLMs. + Examples: + input_ids: [T T T T T], here T is for text. 
+ temporal position_ids: [0, 1, 2, 3, 4] + height position_ids: [0, 1, 2, 3, 4] + width position_ids: [0, 1, 2, 3, 4] + + For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part + and 1D rotary position embedding for text part. + Examples: + Temporal (Time): 3 patches, representing different segments of the video in time. + Height: 2 patches, dividing each frame vertically. + Width: 2 patches, dividing each frame horizontally. + We also have some important parameters: + fps (Frames Per Second): The video's frame rate, set to 1. This means one frame is processed each second. + tokens_per_second: This is a crucial parameter. It dictates how many "time-steps" or "temporal tokens" are conceptually packed into a one-second interval of the video. In this case, we have 25 tokens per second. So each second of the video will be represented with 25 separate time points. It essentially defines the temporal granularity. + temporal_patch_size: The number of frames that compose one temporal patch. Here, it's 2 frames. + interval: The step size for the temporal position IDs, calculated as tokens_per_second * temporal_patch_size / fps. In this case, 25 * 2 / 1 = 50. This means that each temporal patch will be have a difference of 50 in the temporal position IDs. + input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision. + vision temporal position_ids: [0, 0, 0, 0, 50, 50, 50, 50, 100, 100, 100, 100] + vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1] + vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1] + text temporal position_ids: [101, 102, 103, 104, 105] + text height position_ids: [101, 102, 103, 104, 105] + text width position_ids: [101, 102, 103, 104, 105] + Here we calculate the text start position_ids as the max vision position_ids plus 1. + + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*): + The temporal, height and width of feature shape of each image in LLM. + video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*): + The temporal, height and width of feature shape of each video in LLM. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. 
+ + Returns: + position_ids (`torch.LongTensor` of shape `(3, batch_size, sequence_length)`) + mrope_position_deltas (`torch.Tensor` of shape `(batch_size)`) + """ + + spatial_merge_size = self.config.vision_config.spatial_merge_size + image_token_id = self.config.image_token_id + video_start_token_id = self.config.video_start_token_id + video_end_token_id = self.config.video_end_token_id + + mrope_position_deltas = [] + if input_ids is not None and (image_grid_thw is not None or video_grid_thw is not None): + total_input_ids = input_ids + if attention_mask is None: + attention_mask = torch.ones_like(total_input_ids) + position_ids = torch.ones( + 3, + input_ids.shape[0], + input_ids.shape[1], + dtype=input_ids.dtype, + device=input_ids.device, + ) + image_index, video_index = 0, 0 + video_group_index = 0 + attention_mask = attention_mask.to(total_input_ids.device) + for i, input_ids in enumerate(total_input_ids): + input_ids = input_ids[attention_mask[i] == 1] + input_tokens = input_ids.tolist() + + input_token_type = [] + video_check_flg = False + for token in input_tokens: + if token == video_start_token_id: + video_check_flg = True + elif token == video_end_token_id: + video_check_flg = False + + if token == image_token_id and not video_check_flg: + input_token_type.append("image") + elif token == image_token_id and video_check_flg: + input_token_type.append("video") + else: + input_token_type.append("text") + + input_type_group = [] + for key, group in itertools.groupby(enumerate(input_token_type), lambda x: x[1]): + group = list(group) + start_index = group[0][0] + end_index = group[-1][0] + 1 + input_type_group.append((key, start_index, end_index)) + + llm_pos_ids_list = [] + video_frame_num = 1 + for modality_type, start_idx, end_idx in input_type_group: + st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0 + + if modality_type == "image": + t, h, w = ( + image_grid_thw[image_index][0], + image_grid_thw[image_index][1], + image_grid_thw[image_index][2], + ) + llm_grid_t, llm_grid_h, llm_grid_w = ( + t.item(), + h.item() // spatial_merge_size, + w.item() // spatial_merge_size, + ) + + t_index = torch.arange(llm_grid_t).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w).flatten() + h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(llm_grid_t, -1, llm_grid_w).flatten() + w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(llm_grid_t, llm_grid_h, -1).flatten() + llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + st_idx) + + image_index += 1 + video_frame_num = 1 + + elif modality_type == "video": + t, h, w = ( + video_frame_num, + video_grid_thw[video_index][1], + video_grid_thw[video_index][2], + ) + + llm_grid_t, llm_grid_h, llm_grid_w = ( + t, + h.item() // spatial_merge_size, + w.item() // spatial_merge_size, + ) + + for t_idx in range(llm_grid_t): + t_index = torch.tensor(t_idx).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w).flatten() + + h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(1, -1, llm_grid_w).flatten() + w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(1, llm_grid_h, -1).flatten() + llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + st_idx) + + video_group_index += 1 + + if video_group_index >= video_grid_thw[video_index][0]: + video_index += 1 + video_group_index = 0 + + video_frame_num += 1 + + else: + text_len = end_idx - start_idx + llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx) + + video_frame_num = 1 + + llm_positions = 
torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1) + position_ids[..., i, attention_mask[i] == 1] = llm_positions.to(position_ids.device) + mrope_position_deltas.append(llm_positions.max() + 1 - len(total_input_ids[i])) + mrope_position_deltas = torch.tensor(mrope_position_deltas, device=input_ids.device).unsqueeze(1) + return position_ids, mrope_position_deltas + else: + if attention_mask is not None: + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + position_ids = position_ids.unsqueeze(0).expand(3, -1, -1).to(attention_mask.device) + max_position_ids = position_ids.max(0, keepdim=False)[0].max(-1, keepdim=True)[0] + mrope_position_deltas = max_position_ids + 1 - attention_mask.shape[-1] + else: + position_ids = ( + torch.arange(input_ids.shape[1], device=input_ids.device) + .view(1, 1, -1) + .expand(3, input_ids.shape[0], -1) + ) + mrope_position_deltas = torch.zeros( + [input_ids.shape[0], 1], + device=input_ids.device, + dtype=input_ids.dtype, + ) + + return position_ids, mrope_position_deltas + + def get_video_features( + self, pixel_values_videos: torch.FloatTensor, video_grid_thw: Optional[torch.LongTensor] = None + ): + """ + Encodes videos into continuous embeddings that can be forwarded to the language model. + + Args: + pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): + The tensors corresponding to the input videos. + video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*): + The temporal, height and width of feature shape of each video in LLM. + """ + pixel_values_videos = pixel_values_videos.type(self.visual.dtype) + # reshape video_grid_thw -> [b, 3] -> [1, h, w] * frames + temp_frames_hw = [] + for t, h, w in video_grid_thw: + repeated_row = torch.tensor([1, h.item(), w.item()]).unsqueeze(0).repeat(t, 1) + temp_frames_hw.append(repeated_row) + flattened_video_grid_thw = torch.cat(temp_frames_hw, dim=0) + video_embeds = self.visual(pixel_values_videos, grid_thw=flattened_video_grid_thw) + split_sizes = (video_grid_thw.prod(-1) // self.visual.spatial_merge_size**2).tolist() + video_embeds = torch.split(video_embeds, split_sizes) + return video_embeds + + def get_placeholder_mask( + self, + input_ids: torch.LongTensor, + inputs_embeds: torch.FloatTensor, + image_features: Optional[torch.FloatTensor] = None, + video_features: Optional[torch.FloatTensor] = None, + ): + """ + Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is + equal to the length of multimodal features. If the lengths are different, an error is raised. 
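+
+         A minimal sketch of the masking step with toy ids (the real method uses the model's
+         configured `image_token_id`, 151343 by default, and the actual embedding tensors):
+
+         ```python
+         >>> import torch
+         >>> image_token_id = 151343
+         >>> input_ids = torch.tensor([[101, 151343, 151343, 102]])  # 101/102 are arbitrary text ids
+         >>> inputs_embeds = torch.zeros(1, 4, 8)
+         >>> special_image_mask = (input_ids == image_token_id).unsqueeze(-1).expand_as(inputs_embeds)
+         >>> special_image_mask.sum().item()  # 2 placeholder positions x 8 hidden dims
+         16
+         ```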
+ """ + if input_ids is None: + special_image_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + special_image_mask = special_image_mask.all(-1) + special_video_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.video_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + special_video_mask = special_video_mask.all(-1) + else: + # GLM-4.1V and GLM-4.5V special_video_mask is special_image_mask + special_image_mask = input_ids == self.config.image_token_id + special_video_mask = input_ids == self.config.image_token_id + + n_image_tokens = special_image_mask.sum() + special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + if image_features is not None and inputs_embeds[special_image_mask].numel() != image_features.numel(): + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {image_features.shape[0]}" + ) + + n_video_tokens = special_video_mask.sum() + special_video_mask = special_video_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + if video_features is not None and inputs_embeds[special_video_mask].numel() != video_features.numel(): + raise ValueError( + f"Videos features and video tokens do not match: tokens: {n_video_tokens}, features {video_features.shape[0]}" + ) + + return special_image_mask, special_video_mask + + @auto_docstring + @can_return_tuple + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + pixel_values: Optional[torch.Tensor] = None, + pixel_values_videos: Optional[torch.FloatTensor] = None, + image_grid_thw: Optional[torch.LongTensor] = None, + video_grid_thw: Optional[torch.LongTensor] = None, + rope_deltas: Optional[torch.LongTensor] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> Union[tuple, Glm4vModelOutputWithPast]: + r""" + image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*): + The temporal, height and width of feature shape of each image in LLM. + video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*): + The temporal, height and width of feature shape of each video in LLM. + rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*): + The rope index difference between sequence length and multimodal rope. 
+ """ + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings()(input_ids) + + if pixel_values is not None: + image_embeds = self.get_image_features(pixel_values, image_grid_thw) + image_embeds = torch.cat(image_embeds, dim=0).to(inputs_embeds.device, inputs_embeds.dtype) + image_mask, _ = self.get_placeholder_mask(input_ids, inputs_embeds, image_features=image_embeds) + inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds) + + if pixel_values_videos is not None: + video_embeds = self.get_video_features(pixel_values_videos, video_grid_thw) + video_embeds = torch.cat(video_embeds, dim=0).to(inputs_embeds.device, inputs_embeds.dtype) + _, video_mask = self.get_placeholder_mask(input_ids, inputs_embeds, video_features=video_embeds) + inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds) + + if position_ids is None: + attention_mask_tensor = ( + attention_mask if not isinstance(attention_mask, dict) else attention_mask["full_attention"] + ) + if attention_mask_tensor is not None and attention_mask_tensor.ndim == 4: + attention_mask_tensor = torch.diagonal(attention_mask_tensor[:, 0], dim1=1, dim2=2) + # Only apply conversion for floating point tensors (inverted masks) + if attention_mask_tensor.dtype.is_floating_point: + attention_mask_tensor = attention_mask_tensor / torch.finfo(attention_mask_tensor.dtype).min + attention_mask_tensor = (1.0 - attention_mask_tensor).int() + + # Calculate RoPE index once per generation in the pre-fill stage only. + # When compiling, we can't check tensor values thus we check only input length + # It is safe to assume that `length!=1` means we're in pre-fill because compiled + # models currently cannot do asssisted decoding + prefill_compiled_stage = is_torchdynamo_compiling() and ( + (input_ids is not None and input_ids.shape[1] != 1) + or (inputs_embeds is not None and inputs_embeds.shape[1] != 1) + ) + prefill_noncompiled_stage = not is_torchdynamo_compiling() and ( + (cache_position is not None and cache_position[0] == 0) + or (past_key_values is None or past_key_values.get_seq_length() == 0) + ) + if (prefill_compiled_stage or prefill_noncompiled_stage) or self.rope_deltas is None: + position_ids, rope_deltas = self.get_rope_index( + input_ids, + image_grid_thw, + video_grid_thw, + attention_mask=attention_mask_tensor, + ) + self.rope_deltas = rope_deltas + # then use the prev pre-calculated rope-deltas to get the correct position ids + else: + batch_size, seq_length, _ = inputs_embeds.shape + delta = ( + (cache_position[0] + self.rope_deltas).to(inputs_embeds.device) + if cache_position is not None + else 0 + ) + position_ids = torch.arange(seq_length, device=inputs_embeds.device) + position_ids = position_ids.view(1, -1).expand(batch_size, -1) + if cache_position is not None: # otherwise `deltas` is an int `0` + delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=0) + position_ids = position_ids.add(delta) + position_ids = position_ids.unsqueeze(0).expand(3, -1, -1) + + outputs = self.language_model( + input_ids=None, + position_ids=position_ids, + attention_mask=attention_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + cache_position=cache_position, + **kwargs, + ) + + return Glm4vModelOutputWithPast( + last_hidden_state=outputs.last_hidden_state, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, 
+ attentions=outputs.attentions, + rope_deltas=self.rope_deltas, + ) + + +class Glm4vCausalLMOutputWithPast(Qwen2_5_VLCausalLMOutputWithPast): + pass + + +class Glm4vForConditionalGeneration(Qwen2_5_VLForConditionalGeneration): + _checkpoint_conversion_mapping = {} + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.Tensor] = None, + pixel_values_videos: Optional[torch.FloatTensor] = None, + image_grid_thw: Optional[torch.LongTensor] = None, + video_grid_thw: Optional[torch.LongTensor] = None, + rope_deltas: Optional[torch.LongTensor] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **kwargs: Unpack[TransformersKwargs], + ) -> Union[tuple, Glm4vCausalLMOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*): + The temporal, height and width of feature shape of each image in LLM. + video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*): + The temporal, height and width of feature shape of each video in LLM. + rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*): + The rope index difference between sequence length and multimodal rope. + + Example: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, Glm4vForConditionalGeneration + + >>> model = Glm4vForConditionalGeneration.from_pretrained("THUDM/GLM-4.1V-9B-Thinking") + >>> processor = AutoProcessor.from_pretrained("THUDM/GLM-4.1V-9B-Thinking") + + >>> messages = [ + { + "role": "user", + "content": [ + {"type": "image"}, + {"type": "text", "text": "What is shown in this image?"}, + ], + }, + ] + >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) + >>> inputs = processor(text=[text], images=[image], vision_infos=[vision_infos]) + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "The image shows a street scene with a red stop sign in the foreground. In the background, there is a large red gate with Chinese characters ..." 
+ ```""" + outputs = self.model( + input_ids=input_ids, + pixel_values=pixel_values, + pixel_values_videos=pixel_values_videos, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + position_ids=position_ids, + attention_mask=attention_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs[0] + + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) + + loss = None + if labels is not None: + loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size) + + return Glm4vCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + rope_deltas=outputs.rope_deltas, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + attention_mask=None, + inputs_embeds=None, + cache_position=None, + position_ids=None, + use_cache=True, + pixel_values=None, + pixel_values_videos=None, + image_grid_thw=None, + video_grid_thw=None, + **kwargs, + ): + # Overwritten -- in specific circumstances we don't want to forward image inputs to the model + + model_inputs = super().prepare_inputs_for_generation( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + cache_position=cache_position, + position_ids=position_ids, + pixel_values=pixel_values, + pixel_values_videos=pixel_values_videos, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + use_cache=use_cache, + **kwargs, + ) + + # GLM-4.1V position_ids are prepareed with rope_deltas in forward + model_inputs["position_ids"] = None + + if cache_position[0] != 0: + model_inputs["pixel_values"] = None + model_inputs["pixel_values_videos"] = None + + return model_inputs + + def _get_image_nums_and_video_nums( + self, + input_ids: Optional[torch.LongTensor], + inputs_embeds: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + """ + Get the number of images and videos for each sample to calculate the separation length of the sample tensor. + These parameters are not passed through the processor to avoid unpredictable impacts from interface modifications. + + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. 
+ + Returns: + image_nums (`torch.LongTensor` of shape `(batch_size, num_images_sample)`) + video_nums (`torch.LongTensor` of shape `(batch_size, num_videos_sample)`) + """ + + if inputs_embeds is not None: + is_image = ( + inputs_embeds + == self.get_input_embeddings()( + torch.tensor(self.config.image_start_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + )[..., 0] + is_video_start = ( + inputs_embeds + == self.get_input_embeddings()( + torch.tensor(self.config.video_start_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + )[..., 0] + is_video_end = ( + inputs_embeds + == self.get_input_embeddings()( + torch.tensor(self.config.video_end_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + )[..., 0] + else: + is_image = input_ids == self.config.image_start_token_id + is_video_start = input_ids == self.config.video_start_token_id + is_video_end = input_ids == self.config.video_end_token_id + + # Cumulative sum to track if we're inside a video span + # We'll assume well-formed video tags (i.e. matching starts and ends) + video_level = torch.cumsum(is_video_start.int() - is_video_end.int(), dim=1) + inside_video = video_level > 0 # shape (batch_size, seq_length) + + # Mask out image tokens that are inside video spans + standalone_images = is_image & (~inside_video) + + # Count per batch + image_counts = standalone_images.sum(dim=1) + video_counts = is_video_start.sum(dim=1) + + return image_counts, video_counts + + +class Glm4vVideosProcessorKwargs(Qwen2_5_VLVideosProcessorKwargs): + pass + + +class Glm4vImagesKwargs(ImagesKwargs): + patch_size: Optional[int] + temporal_patch_size: Optional[int] + merge_size: Optional[int] + + +class Glm4vProcessorKwargs(Qwen2_VLProcessorKwargs): + images_kwargs: Glm4vImagesKwargs + videos_kwargs: Glm4vVideosProcessorKwargs + _defaults = { + "text_kwargs": { + "padding": False, + "return_token_type_ids": False, + "return_mm_token_type_ids": False, + }, + "videos_kwargs": {"return_metadata": True}, + } + + +class Glm4vProcessor(Qwen2_VLProcessor): + r""" + Constructs a GLM-4V processor which wraps a GLM-4V image processor and a GLM-4 tokenizer into a single processor. + [`~Glm4vProcessor.__call__`] and [`~Glm4vProcessor.decode`] for more information. + Args: + image_processor ([`Glm4vProcessor`], *optional*): + The image processor is a required input. + tokenizer ([`PreTrainedTokenizerFast`], *optional*): + The tokenizer is a required input. + video_processor ([`Glm4vVideoProcessor`], *optional*): + The video processor is a required input. + chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages + in a chat into a tokenizable string. + """ + + tokenizer_class = ("PreTrainedTokenizer", "PreTrainedTokenizerFast") + + def __init__(self, image_processor=None, tokenizer=None, video_processor=None, chat_template=None, **kwargs): + super().__init__(image_processor, tokenizer, video_processor, chat_template=chat_template) + self.image_token = "<|image|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token + self.video_token = "<|video|>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token + + def __call__( + self, + images: Optional[ImageInput] = None, + text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, + videos: Optional[VideoInput] = None, + **kwargs: Unpack[Glm4vProcessorKwargs], + ) -> BatchFeature: + """ + Main method to prepare for the model one or several sequences(s) and image(s). 
This method forwards the `text` + and `kwargs` arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] if `text` is not `None` to encode + the text. + + Args: + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. Both channels-first and channels-last formats are supported. + text (`str`, `List[str]`, `List[List[str]]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + videos (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`): + The image or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch + tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported. + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + - **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`. + - **image_grid_thw** -- List of image 3D grid in LLM. Returned when `images` is not `None`. + - **video_grid_thw** -- List of video 3D grid in LLM. Returned when `videos` is not `None`. 
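+
+         Example (a usage sketch: the checkpoint name and image URL are the ones used in the model
+         example elsewhere in this file; the returned fields are those listed above):
+
+         ```python
+         >>> from PIL import Image
+         >>> import requests
+         >>> from transformers import AutoProcessor
+
+         >>> processor = AutoProcessor.from_pretrained("THUDM/GLM-4.1V-9B-Thinking")
+         >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
+         >>> image = Image.open(requests.get(url, stream=True).raw)
+
+         >>> messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "Describe this image."}]}]
+         >>> text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+         >>> inputs = processor(text=[text], images=[image], return_tensors="pt")
+         >>> sorted(inputs.keys())
+         ['attention_mask', 'image_grid_thw', 'input_ids', 'pixel_values']
+         ```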
+ """ + output_kwargs = self._merge_kwargs( + Glm4vProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + if images is not None: + image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"]) + image_grid_thw = image_inputs["image_grid_thw"] + else: + image_inputs = {} + image_grid_thw = None + + if videos is not None: + videos_inputs = self.video_processor(videos=videos, **output_kwargs["videos_kwargs"]) + # If user has not requested video metadata, pop it + if "return_metadata" not in kwargs: + video_metadata = videos_inputs.pop("video_metadata") + else: + video_metadata = videos_inputs["video_metadata"] + video_grid_thw = videos_inputs["video_grid_thw"] + else: + videos_inputs = {} + video_grid_thw = None + + if not isinstance(text, list): + text = [text] + + text = text.copy() # below lines change text in-place + if image_grid_thw is not None: + merge_length = self.image_processor.merge_size**2 + index = 0 + for i in range(len(text)): + while self.image_token in text[i]: + num_image_tokens = image_grid_thw[index].prod() // merge_length + text[i] = text[i].replace(self.image_token, "<|placeholder|>" * num_image_tokens, 1) + index += 1 + text[i] = text[i].replace("<|placeholder|>", self.image_token) + + if video_grid_thw is not None: + merge_length = self.video_processor.merge_size**2 + video_index = 0 + for i in range(len(text)): + while self.video_token in text[i]: + num_frames = video_grid_thw[video_index][0] + video_structure = "" + + metadata = video_metadata[video_index] + if metadata.fps is None: + logger.warning_once( + "SmolVLM requires frame timestamps to construct prompts, but the `fps` of the input video could not be inferred. " + "Probably `video_metadata` was missing from inputs and you passed pre-sampled frames. " + "Defaulting to `fps=24`. Please provide `video_metadata` for more accurate results." 
+ ) + metadata.fps = 24 if metadata.fps is None else metadata.fps + timestamps = metadata.timestamps[::2] # mrope + + unique_timestamps = [] + for idx in range(0, len(timestamps)): + unique_timestamps.append(timestamps[idx]) + + selected_timestamps = unique_timestamps[:num_frames] + while len(selected_timestamps) < num_frames: + selected_timestamps.append(selected_timestamps[-1] if selected_timestamps else 0) + + for frame_idx in range(num_frames): + timestamp_sec = selected_timestamps[frame_idx] + frame_structure = f"<|begin_of_image|>{self.image_token}<|end_of_image|>{int(timestamp_sec)}" + video_structure += frame_structure + + text[i] = text[i].replace(self.video_token, video_structure, 1) + num_image_tokens = ( + video_grid_thw[video_index].prod() // merge_length // video_grid_thw[video_index][0] + ) + for frame_idx in range(num_frames): + if self.image_token in text[i]: + text[i] = text[i].replace(self.image_token, "<|placeholder|>" * num_image_tokens, 1) + + video_index += 1 + + text[i] = text[i].replace("<|placeholder|>", self.image_token) + return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) + return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", False) + text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"]) + self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video"]) + + if return_mm_token_type_ids: + array_ids = np.array(text_inputs["input_ids"]) + mm_token_type_ids = np.zeros_like(text_inputs["input_ids"]) + mm_token_type_ids[array_ids == self.image_token_id] = 1 + text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist() + return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs}, tensor_type=return_tensors) + + +__all__ = [ + "Glm4vConfig", + "Glm4vTextConfig", + "Glm4vForConditionalGeneration", + "Glm4vModel", + "Glm4vPreTrainedModel", + "Glm4vProcessor", + "Glm4vTextModel", +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glm4v/processing_glm4v.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glm4v/processing_glm4v.py new file mode 100644 index 0000000000000000000000000000000000000000..a8ebb4d41b497c47a05864a1fe3f53e1273b5dd2 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glm4v/processing_glm4v.py @@ -0,0 +1,295 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/glm4v/modular_glm4v.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_glm4v.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2025 The ZhipuAI Inc. team and HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import Optional, Union + +import numpy as np + +from ...feature_extraction_utils import BatchFeature +from ...image_utils import ImageInput +from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack, VideosKwargs +from ...tokenization_utils_base import PreTokenizedInput, TextInput +from ...utils import logging +from ...video_utils import VideoInput + + +logger = logging.get_logger(__name__) + + +class Glm4vVideosProcessorKwargs(VideosKwargs, total=False): + fps: Union[list[float], float] + + +class Glm4vImagesKwargs(ImagesKwargs): + patch_size: Optional[int] + temporal_patch_size: Optional[int] + merge_size: Optional[int] + + +class Glm4vProcessorKwargs(ProcessingKwargs, total=False): + images_kwargs: Glm4vImagesKwargs + _defaults = { + "text_kwargs": { + "padding": False, + "return_token_type_ids": False, + "return_mm_token_type_ids": False, + }, + "videos_kwargs": {"return_metadata": True}, + } + videos_kwargs: Glm4vVideosProcessorKwargs + + +class Glm4vProcessor(ProcessorMixin): + r""" + Constructs a GLM-4V processor which wraps a GLM-4V image processor and a GLM-4 tokenizer into a single processor. + [`~Glm4vProcessor.__call__`] and [`~Glm4vProcessor.decode`] for more information. + Args: + image_processor ([`Glm4vProcessor`], *optional*): + The image processor is a required input. + tokenizer ([`PreTrainedTokenizerFast`], *optional*): + The tokenizer is a required input. + video_processor ([`Glm4vVideoProcessor`], *optional*): + The video processor is a required input. + chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages + in a chat into a tokenizable string. + """ + + attributes = ["image_processor", "tokenizer", "video_processor"] + image_processor_class = "AutoImageProcessor" + video_processor_class = "AutoVideoProcessor" + + tokenizer_class = ("PreTrainedTokenizer", "PreTrainedTokenizerFast") + + def __init__(self, image_processor=None, tokenizer=None, video_processor=None, chat_template=None, **kwargs): + super().__init__(image_processor, tokenizer, video_processor, chat_template=chat_template) + self.image_token = "<|image|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token + self.video_token = "<|video|>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token + self.image_token_id = ( + tokenizer.image_token_id + if getattr(tokenizer, "image_token_id", None) + else tokenizer.convert_tokens_to_ids(self.image_token) + ) + self.video_token_id = ( + tokenizer.video_token_id + if getattr(tokenizer, "video_token_id", None) + else tokenizer.convert_tokens_to_ids(self.video_token) + ) + + def __call__( + self, + images: Optional[ImageInput] = None, + text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, + videos: Optional[VideoInput] = None, + **kwargs: Unpack[Glm4vProcessorKwargs], + ) -> BatchFeature: + """ + Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` + and `kwargs` arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] if `text` is not `None` to encode + the text. + + Args: + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. Both channels-first and channels-last formats are supported. 
+ text (`str`, `List[str]`, `List[List[str]]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + videos (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`): + The image or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch + tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported. + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + - **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`. + - **image_grid_thw** -- List of image 3D grid in LLM. Returned when `images` is not `None`. + - **video_grid_thw** -- List of video 3D grid in LLM. Returned when `videos` is not `None`. + """ + output_kwargs = self._merge_kwargs( + Glm4vProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + if images is not None: + image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"]) + image_grid_thw = image_inputs["image_grid_thw"] + else: + image_inputs = {} + image_grid_thw = None + + if videos is not None: + videos_inputs = self.video_processor(videos=videos, **output_kwargs["videos_kwargs"]) + # If user has not requested video metadata, pop it + if "return_metadata" not in kwargs: + video_metadata = videos_inputs.pop("video_metadata") + else: + video_metadata = videos_inputs["video_metadata"] + video_grid_thw = videos_inputs["video_grid_thw"] + else: + videos_inputs = {} + video_grid_thw = None + + if not isinstance(text, list): + text = [text] + + text = text.copy() # below lines change text in-place + if image_grid_thw is not None: + merge_length = self.image_processor.merge_size**2 + index = 0 + for i in range(len(text)): + while self.image_token in text[i]: + num_image_tokens = image_grid_thw[index].prod() // merge_length + text[i] = text[i].replace(self.image_token, "<|placeholder|>" * num_image_tokens, 1) + index += 1 + text[i] = text[i].replace("<|placeholder|>", self.image_token) + + if video_grid_thw is not None: + merge_length = self.video_processor.merge_size**2 + video_index = 0 + for i in range(len(text)): + while self.video_token in text[i]: + num_frames = video_grid_thw[video_index][0] + video_structure = "" + + metadata = video_metadata[video_index] + if metadata.fps is None: + logger.warning_once( + "SmolVLM requires frame timestamps to construct prompts, but the `fps` of the input video could not be inferred. 
" + "Probably `video_metadata` was missing from inputs and you passed pre-sampled frames. " + "Defaulting to `fps=24`. Please provide `video_metadata` for more accurate results." + ) + metadata.fps = 24 if metadata.fps is None else metadata.fps + timestamps = metadata.timestamps[::2] # mrope + + unique_timestamps = [] + for idx in range(0, len(timestamps)): + unique_timestamps.append(timestamps[idx]) + + selected_timestamps = unique_timestamps[:num_frames] + while len(selected_timestamps) < num_frames: + selected_timestamps.append(selected_timestamps[-1] if selected_timestamps else 0) + + for frame_idx in range(num_frames): + timestamp_sec = selected_timestamps[frame_idx] + frame_structure = f"<|begin_of_image|>{self.image_token}<|end_of_image|>{int(timestamp_sec)}" + video_structure += frame_structure + + text[i] = text[i].replace(self.video_token, video_structure, 1) + num_image_tokens = ( + video_grid_thw[video_index].prod() // merge_length // video_grid_thw[video_index][0] + ) + for frame_idx in range(num_frames): + if self.image_token in text[i]: + text[i] = text[i].replace(self.image_token, "<|placeholder|>" * num_image_tokens, 1) + + video_index += 1 + + text[i] = text[i].replace("<|placeholder|>", self.image_token) + return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) + return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", False) + text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"]) + self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video"]) + + if return_mm_token_type_ids: + array_ids = np.array(text_inputs["input_ids"]) + mm_token_type_ids = np.zeros_like(text_inputs["input_ids"]) + mm_token_type_ids[array_ids == self.image_token_id] = 1 + text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist() + return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs}, tensor_type=return_tensors) + + def _get_num_multimodal_tokens(self, image_sizes=None, video_sizes=None, **kwargs): + """ + Computes the number of placeholder tokens needed for multimodal inputs with the given sizes. + Args: + image_sizes (`list[list[int]]`, *optional*): + The input sizes formatted as (height, width) per each image. + video_sizes (`list[list[int]]`, *optional*): + The input sizes formatted as (num_frames, height, width) per each video. + Returns: + `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided + input modalities, along with other useful data. 
+ """ + + vision_data = {} + if image_sizes is not None: + images_kwargs = Glm4vProcessorKwargs._defaults.get("images_kwargs", {}) + images_kwargs.update(kwargs) + merge_size = images_kwargs.get("merge_size", None) or self.image_processor.merge_size + + num_image_patches = [ + self.image_processor.get_number_of_image_patches(*image_size, images_kwargs) + for image_size in image_sizes + ] + num_image_tokens = [(num_patches // merge_size**2) for num_patches in num_image_patches] + vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches}) + + if video_sizes is not None: + videos_kwargs = Glm4vProcessorKwargs._defaults.get("videos_kwargs", {}) + videos_kwargs.update(kwargs) + num_video_patches = [ + self.video_processor.get_number_of_video_patches(*video_size, videos_kwargs) + for video_size in video_sizes + ] + num_video_tokens = [(num_patches // merge_size**2) for num_patches in num_video_patches] + vision_data["num_video_tokens"] = num_video_tokens + + return MultiModalData(**vision_data) + + def post_process_image_text_to_text( + self, generated_outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False, **kwargs + ): + """ + Post-process the output of the model to decode the text. + + Args: + generated_outputs (`torch.Tensor` or `np.ndarray`): + The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)` + or `(sequence_length,)`. + skip_special_tokens (`bool`, *optional*, defaults to `True`): + Whether or not to remove special tokens in the output. Argument passed to the tokenizer's `batch_decode` method. + clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`): + Whether or not to clean up the tokenization spaces. Argument passed to the tokenizer's `batch_decode` method. + **kwargs: + Additional arguments to be passed to the tokenizer's `batch_decode method`. + + Returns: + `list[str]`: The decoded text. + """ + return self.tokenizer.batch_decode( + generated_outputs, + skip_special_tokens=skip_special_tokens, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + **kwargs, + ) + + +__all__ = ["Glm4vProcessor"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glm4v/video_processing_glm4v.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glm4v/video_processing_glm4v.py new file mode 100644 index 0000000000000000000000000000000000000000..0986c414f1d374589c2f14cdcc514917c09344a2 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glm4v/video_processing_glm4v.py @@ -0,0 +1,252 @@ +# coding=utf-8 +# Copyright 2025 The ZhipuAI Inc. team and HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
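As a quick illustration of the token accounting done by Glm4vProcessor.__call__ above: every image or video contributes grid_t * grid_h * grid_w vision patches, and merge_size**2 patches collapse into a single placeholder token in the prompt. The sketch below uses made-up grid sizes (real values come from the image_grid_thw / video_grid_thw tensors returned by the image and video processors); it is illustrative only, not part of the library code.

import math

image_grid_thw = (1, 22, 22)   # assumed (t, h, w) grid for one image
video_grid_thw = (4, 22, 22)   # assumed (t, h, w) grid for one video
merge_size = 2                 # default merge_size of the GLM-4V vision processors

# Images: all patches of the grid collapse into prod // merge_size**2 placeholder tokens.
num_image_tokens = math.prod(image_grid_thw) // merge_size**2                        # 121

# Videos: the per-frame count (prod // merge_size**2 // grid_t) is inserted once per frame.
tokens_per_frame = math.prod(video_grid_thw) // merge_size**2 // video_grid_thw[0]   # 121
num_video_tokens = tokens_per_frame * video_grid_thw[0]                               # 484

print(num_image_tokens, tokens_per_frame, num_video_tokens)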
+"""video processor class for GLM-4.1V.""" + +import math +from typing import Optional, Union + +import numpy as np +import torch + +from ...image_processing_utils import BatchFeature +from ...image_utils import ( + OPENAI_CLIP_MEAN, + OPENAI_CLIP_STD, + ChannelDimension, + PILImageResampling, + SizeDict, + get_image_size, +) +from ...processing_utils import Unpack, VideosKwargs +from ...utils import TensorType, add_start_docstrings +from ...video_processing_utils import BASE_VIDEO_PROCESSOR_DOCSTRING, BaseVideoProcessor +from ...video_utils import VideoMetadata, group_videos_by_shape, reorder_videos +from .image_processing_glm4v import smart_resize + + +class Glm4vVideoProcessorInitKwargs(VideosKwargs): + max_image_size: dict[str, int] = None + patch_size: Optional[int] = None + temporal_patch_size: Optional[int] = None + merge_size: Optional[int] = None + image_mean: Optional[list[float]] = None + image_std: Optional[list[float]] = None + + +@add_start_docstrings( + "Constructs a fast GLM-4V image processor that dynamically resizes videos based on the original videos.", + BASE_VIDEO_PROCESSOR_DOCSTRING, + """ + patch_size (`int`, *optional*, defaults to 14): + The spacial patch size of the vision encoder. + temporal_patch_size (`int`, *optional*, defaults to 2): + The temporal patch size of the vision encoder. + merge_size (`int`, *optional*, defaults to 2): + The merge size of the vision encoder to llm encoder. + """, +) +class Glm4vVideoProcessor(BaseVideoProcessor): + resample = PILImageResampling.BICUBIC + size = {"shortest_edge": 112 * 112, "longest_edge": 28 * 28 * 2 * 30000} + max_image_size = {"longest_edge": 28 * 28 * 2 * 30000} + image_mean = OPENAI_CLIP_MEAN + image_std = OPENAI_CLIP_STD + do_resize = True + do_rescale = True + do_normalize = True + do_convert_rgb = True + do_sample_frames = True + patch_size = 14 + temporal_patch_size = 2 + max_duration = 300 + merge_size = 2 + valid_kwargs = Glm4vVideoProcessorInitKwargs + num_frames = 16 + fps = 2 + + model_input_names = ["pixel_values_videos", "video_grid_thw"] + + def __init__(self, **kwargs: Unpack[Glm4vVideoProcessorInitKwargs]): + super().__init__(**kwargs) + if self.size is not None and ( + self.size.get("shortest_edge", None) is None or self.size.get("longest_edge", None) is None + ): + raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.") + + def _further_process_kwargs( + self, + size: Optional[SizeDict] = None, + **kwargs, + ) -> dict: + """ + Update kwargs that need further processing before being validated + Can be overridden by subclasses to customize the processing of kwargs. + """ + if size is not None and ("shortest_edge" not in size or "longest_edge" not in size): + raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.") + + return super()._further_process_kwargs(size=size, **kwargs) + + def sample_frames( + self, + metadata: VideoMetadata, + fps: Optional[Union[int, float]] = None, + **kwargs, + ): + """ + Args: + metadata (`VideoMetadata`): + Metadata of the video containing information about total duration, fps and total number of frames. + fps (`int` or `float`, *optional*): + Target frames to sample per second. Defaults to `self.fps`. + Returns: + np.ndarray: + Indices to sample video frames. + """ + if metadata is None or getattr(metadata, "fps", None) is None: + raise ValueError( + "Asked to sample frames per second but no video metadata was provided which is required when sampling in GLM4V. 
" + "Please pass in `VideoMetadata` object or set `do_sample_frames=False`" + ) + + total_frames = metadata.total_num_frames + requested_fps = fps if fps is not None else self.fps + + max_frame_idx = total_frames - 1 + duration = metadata.duration or round(max_frame_idx / metadata.fps) + 1 + + if duration <= self.max_duration: + n = int(math.floor(duration * requested_fps)) + frame_indices = [min(max_frame_idx, int(math.ceil(i * metadata.fps / requested_fps))) for i in range(n)] + else: + num_samples = int(self.max_duration * requested_fps) + if num_samples >= total_frames: + frame_indices = list(range(total_frames)) + else: + target_seconds = np.linspace(0, duration, num_samples, endpoint=True) + frame_indices = [min(max_frame_idx, int(math.ceil(t * metadata.fps))) for t in target_seconds] + + seen, uniq = set(), [] + for idx in frame_indices: + if idx not in seen: + seen.add(idx) + uniq.append(idx) + + if len(uniq) & 1: + uniq.append(uniq[-1]) + + return np.array(uniq) + + def _preprocess( + self, + videos: list[torch.Tensor], + do_convert_rgb: bool = True, + do_resize: bool = True, + size: Optional[SizeDict] = None, + interpolation: PILImageResampling = PILImageResampling.BICUBIC, + do_rescale: bool = True, + rescale_factor: float = 1 / 255.0, + do_normalize: bool = True, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + patch_size: Optional[int] = None, + temporal_patch_size: Optional[int] = None, + merge_size: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + **kwargs, + ): + grouped_videos, grouped_videos_index = group_videos_by_shape(videos) + resized_videos_grouped = {} + + for shape, stacked_videos in grouped_videos.items(): + B, T, C, H, W = stacked_videos.shape + num_frames, height, width = T, H, W + if do_resize: + resized_height, resized_width = smart_resize( + num_frames=num_frames, + height=height, + width=width, + temporal_factor=temporal_patch_size, + factor=patch_size * merge_size, + min_pixels=size.shortest_edge, + max_pixels=size.longest_edge, + ) + stacked_videos = stacked_videos.view(B * T, C, H, W) + stacked_videos = self.resize( + stacked_videos, + size=SizeDict(height=resized_height, width=resized_width), + interpolation=interpolation, + ) + stacked_videos = stacked_videos.view(B, T, C, resized_height, resized_width) + resized_videos_grouped[shape] = stacked_videos + resized_videos = reorder_videos(resized_videos_grouped, grouped_videos_index) + + # Group videos by size for further processing + # Needed in case do_resize is False, or resize returns videos with different sizes + grouped_videos, grouped_videos_index = group_videos_by_shape(resized_videos) + processed_videos_grouped = {} + processed_grids = {} + for shape, stacked_videos in grouped_videos.items(): + resized_height, resized_width = get_image_size(stacked_videos[0], channel_dim=ChannelDimension.FIRST) + + # Fused rescale and normalize + stacked_videos = self.rescale_and_normalize( + stacked_videos, do_rescale, rescale_factor, do_normalize, image_mean, image_std + ) + patches = stacked_videos + + # Check that videos have `num_frames` divisible by `temporal_patch_size` + if patches.shape[1] % temporal_patch_size != 0: + repeats = patches[:, -1:].repeat(1, temporal_patch_size - 1, 1, 1, 1) + patches = torch.cat([patches, repeats], dim=1) + batch_size, grid_t, channel = patches.shape[:3] + grid_t = grid_t // temporal_patch_size + grid_h, grid_w = resized_height // patch_size, resized_width // patch_size + + 
patches = patches.view( + batch_size, + grid_t, + temporal_patch_size, + channel, + grid_h // merge_size, + merge_size, + patch_size, + grid_w // merge_size, + merge_size, + patch_size, + ) + patches = patches.permute(0, 1, 4, 7, 5, 8, 3, 2, 6, 9) + flatten_patches = patches.reshape( + batch_size, + grid_t * grid_h * grid_w, + channel * temporal_patch_size * patch_size * patch_size, + ) + + processed_videos_grouped[shape] = flatten_patches + processed_grids[shape] = [[grid_t, grid_h, grid_w]] * batch_size + + processed_videos = reorder_videos(processed_videos_grouped, grouped_videos_index) + processed_grids = reorder_videos(processed_grids, grouped_videos_index) + pixel_values_videos = torch.cat(processed_videos, dim=0) + video_grid_thw = torch.tensor(processed_grids) + data = { + "pixel_values_videos": pixel_values_videos, + "video_grid_thw": video_grid_thw, + } + + return BatchFeature(data=data, tensor_type=return_tensors) + + +__all__ = ["Glm4vVideoProcessor"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glm4v_moe/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glm4v_moe/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ea56e2eeb5ed91c399314cd33aa03e3b2c5b9a35 Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glm4v_moe/__pycache__/__init__.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glm4v_moe/__pycache__/configuration_glm4v_moe.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glm4v_moe/__pycache__/configuration_glm4v_moe.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3dbfe85b9bd04c94da45cf1f674f114ee18ae6f3 Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glm4v_moe/__pycache__/configuration_glm4v_moe.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glm4v_moe/__pycache__/modeling_glm4v_moe.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glm4v_moe/__pycache__/modeling_glm4v_moe.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e6c8820312332103f48f227a1f2a339c0250fc2b Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glm4v_moe/__pycache__/modeling_glm4v_moe.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glm4v_moe/__pycache__/modular_glm4v_moe.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glm4v_moe/__pycache__/modular_glm4v_moe.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4cfc842b63b973bcf3ae11c066e4dbe0c5534f90 Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glm4v_moe/__pycache__/modular_glm4v_moe.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glpn/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glpn/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2a5b38675c34780fd7554db92ea870121f31fd76 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glpn/__init__.py @@ -0,0 +1,29 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_glpn import * + from .feature_extraction_glpn import * + from .image_processing_glpn import * + from .modeling_glpn import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glpn/configuration_glpn.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glpn/configuration_glpn.py new file mode 100644 index 0000000000000000000000000000000000000000..6fb35bb0b08cdedde15e3afc05ef2d8e3f2dbb5b --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glpn/configuration_glpn.py @@ -0,0 +1,135 @@ +# coding=utf-8 +# Copyright 2022 KAIST and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""GLPN model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class GLPNConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`GLPNModel`]. It is used to instantiate an GLPN + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the GLPN + [vinvino02/glpn-kitti](https://huggingface.co/vinvino02/glpn-kitti) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + num_encoder_blocks (`int`, *optional*, defaults to 4): + The number of encoder blocks (i.e. stages in the Mix Transformer encoder). + depths (`list[int]`, *optional*, defaults to `[2, 2, 2, 2]`): + The number of layers in each encoder block. + sr_ratios (`list[int]`, *optional*, defaults to `[8, 4, 2, 1]`): + Sequence reduction ratios in each encoder block. + hidden_sizes (`list[int]`, *optional*, defaults to `[32, 64, 160, 256]`): + Dimension of each of the encoder blocks. + patch_sizes (`list[int]`, *optional*, defaults to `[7, 3, 3, 3]`): + Patch size before each encoder block. 
+ strides (`list[int]`, *optional*, defaults to `[4, 2, 2, 2]`): + Stride before each encoder block. + num_attention_heads (`list[int]`, *optional*, defaults to `[1, 2, 5, 8]`): + Number of attention heads for each attention layer in each block of the Transformer encoder. + mlp_ratios (`list[int]`, *optional*, defaults to `[4, 4, 4, 4]`): + Ratio of the size of the hidden layer compared to the size of the input layer of the Mix FFNs in the + encoder blocks. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + drop_path_rate (`float`, *optional*, defaults to 0.1): + The dropout probability for stochastic depth, used in the blocks of the Transformer encoder. + layer_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the layer normalization layers. + decoder_hidden_size (`int`, *optional*, defaults to 64): + The dimension of the decoder. + max_depth (`int`, *optional*, defaults to 10): + The maximum depth of the decoder. + head_in_index (`int`, *optional*, defaults to -1): + The index of the features to use in the head. + + Example: + + ```python + >>> from transformers import GLPNModel, GLPNConfig + + >>> # Initializing a GLPN vinvino02/glpn-kitti style configuration + >>> configuration = GLPNConfig() + + >>> # Initializing a model from the vinvino02/glpn-kitti style configuration + >>> model = GLPNModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "glpn" + + def __init__( + self, + num_channels=3, + num_encoder_blocks=4, + depths=[2, 2, 2, 2], + sr_ratios=[8, 4, 2, 1], + hidden_sizes=[32, 64, 160, 256], + patch_sizes=[7, 3, 3, 3], + strides=[4, 2, 2, 2], + num_attention_heads=[1, 2, 5, 8], + mlp_ratios=[4, 4, 4, 4], + hidden_act="gelu", + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + initializer_range=0.02, + drop_path_rate=0.1, + layer_norm_eps=1e-6, + decoder_hidden_size=64, + max_depth=10, + head_in_index=-1, + **kwargs, + ): + super().__init__(**kwargs) + + self.num_channels = num_channels + self.num_encoder_blocks = num_encoder_blocks + self.depths = depths + self.sr_ratios = sr_ratios + self.hidden_sizes = hidden_sizes + self.patch_sizes = patch_sizes + self.strides = strides + self.mlp_ratios = mlp_ratios + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.drop_path_rate = drop_path_rate + self.layer_norm_eps = layer_norm_eps + self.decoder_hidden_size = decoder_hidden_size + self.max_depth = max_depth + self.head_in_index = head_in_index + + +__all__ = ["GLPNConfig"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glpn/feature_extraction_glpn.py 
b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glpn/feature_extraction_glpn.py new file mode 100644 index 0000000000000000000000000000000000000000..327fee4a11fd308f980845a971a9fc8335decaf1 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glpn/feature_extraction_glpn.py @@ -0,0 +1,38 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Feature extractor class for GLPN.""" + +import warnings + +from ...utils import logging +from ...utils.import_utils import requires +from .image_processing_glpn import GLPNImageProcessor + + +logger = logging.get_logger(__name__) + + +@requires(backends=("vision",)) +class GLPNFeatureExtractor(GLPNImageProcessor): + def __init__(self, *args, **kwargs) -> None: + warnings.warn( + "The class GLPNFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please" + " use GLPNImageProcessor instead.", + FutureWarning, + ) + super().__init__(*args, **kwargs) + + +__all__ = ["GLPNFeatureExtractor"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glpn/image_processing_glpn.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glpn/image_processing_glpn.py new file mode 100644 index 0000000000000000000000000000000000000000..e3e0255e2b47c3fcb1a617a5793502d6babb5537 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glpn/image_processing_glpn.py @@ -0,0 +1,276 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
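GLPNFeatureExtractor above is only a deprecation shim around GLPNImageProcessor, so new code can construct the image processor directly. The short sketch below is illustrative (the kwargs shown are simply the documented defaults) and demonstrates the size_divisor rounding described in the class that follows: a 260 x 170 image comes out as 256 x 160.

import numpy as np
from transformers import GLPNImageProcessor

image_processor = GLPNImageProcessor(do_resize=True, size_divisor=32, do_rescale=True)

image = np.random.randint(0, 256, size=(260, 170, 3), dtype=np.uint8)  # toy channels-last image
outputs = image_processor(image, return_tensors="np")
print(outputs["pixel_values"].shape)  # (1, 3, 256, 160): height/width rounded down to multiples of 32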
+"""Image processor class for GLPN.""" + +from typing import TYPE_CHECKING, Optional, Union + +from ...utils.import_utils import requires + + +if TYPE_CHECKING: + from ...modeling_outputs import DepthEstimatorOutput + +import numpy as np +import PIL.Image + +from ...image_processing_utils import BaseImageProcessor, BatchFeature +from ...image_transforms import resize, to_channel_dimension_format +from ...image_utils import ( + ChannelDimension, + PILImageResampling, + get_image_size, + infer_channel_dimension_format, + is_scaled_image, + is_torch_available, + make_flat_list_of_images, + to_numpy_array, + valid_images, + validate_preprocess_arguments, +) +from ...utils import TensorType, filter_out_non_signature_kwargs, logging, requires_backends + + +if is_torch_available(): + import torch + + +logger = logging.get_logger(__name__) + + +@requires(backends=("vision",)) +class GLPNImageProcessor(BaseImageProcessor): + r""" + Constructs a GLPN image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions, rounding them down to the closest multiple of + `size_divisor`. Can be overridden by `do_resize` in `preprocess`. + size_divisor (`int`, *optional*, defaults to 32): + When `do_resize` is `True`, images are resized so their height and width are rounded down to the closest + multiple of `size_divisor`. Can be overridden by `size_divisor` in `preprocess`. + resample (`PIL.Image` resampling filter, *optional*, defaults to `Resampling.BILINEAR`): + Resampling filter to use if resizing the image. Can be overridden by `resample` in `preprocess`. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether or not to apply the scaling factor (to make pixel values floats between 0. and 1.). Can be + overridden by `do_rescale` in `preprocess`. + """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size_divisor: int = 32, + resample=PILImageResampling.BILINEAR, + do_rescale: bool = True, + **kwargs, + ) -> None: + self.do_resize = do_resize + self.do_rescale = do_rescale + self.size_divisor = size_divisor + self.resample = resample + super().__init__(**kwargs) + + def resize( + self, + image: np.ndarray, + size_divisor: int, + resample: PILImageResampling = PILImageResampling.BILINEAR, + data_format: Optional[ChannelDimension] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize the image, rounding the (height, width) dimensions down to the closest multiple of size_divisor. + + If the image is of dimension (3, 260, 170) and size_divisor is 32, the image will be resized to (3, 256, 160). + + Args: + image (`np.ndarray`): + The image to resize. + size_divisor (`int`): + The image is resized so its height and width are rounded down to the closest multiple of + `size_divisor`. + resample: + `PIL.Image` resampling filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`. + data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the output image. If `None`, the channel dimension format of the input + image is used. Can be one of: + - `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `ChannelDimension.LAST`: image in (height, width, num_channels) format. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. 
If not set, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + + Returns: + `np.ndarray`: The resized image. + """ + height, width = get_image_size(image, channel_dim=input_data_format) + # Rounds the height and width down to the closest multiple of size_divisor + new_h = height // size_divisor * size_divisor + new_w = width // size_divisor * size_divisor + image = resize( + image, + (new_h, new_w), + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + **kwargs, + ) + return image + + @filter_out_non_signature_kwargs() + def preprocess( + self, + images: Union["PIL.Image.Image", TensorType, list["PIL.Image.Image"], list[TensorType]], + do_resize: Optional[bool] = None, + size_divisor: Optional[int] = None, + resample=None, + do_rescale: Optional[bool] = None, + return_tensors: Optional[Union[TensorType, str]] = None, + data_format: ChannelDimension = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> BatchFeature: + """ + Preprocess the given images. + + Args: + images (`PIL.Image.Image` or `TensorType` or `list[np.ndarray]` or `list[TensorType]`): + Images to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_normalize=False`. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the input such that the (height, width) dimensions are a multiple of `size_divisor`. + size_divisor (`int`, *optional*, defaults to `self.size_divisor`): + When `do_resize` is `True`, images are resized so their height and width are rounded down to the + closest multiple of `size_divisor`. + resample (`PIL.Image` resampling filter, *optional*, defaults to `self.resample`): + `PIL.Image` resampling filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has + an effect if `do_resize` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether or not to apply the scaling factor (to make pixel values floats between 0. and 1.). + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - `None`: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `ChannelDimension.LAST`: image in (height, width, num_channels) format. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. 
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + """ + do_resize = do_resize if do_resize is not None else self.do_resize + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + size_divisor = size_divisor if size_divisor is not None else self.size_divisor + resample = resample if resample is not None else self.resample + + images = make_flat_list_of_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + + # Here, the rescale() method uses a constant rescale_factor. It does not need to be validated + # with a rescale_factor. + validate_preprocess_arguments( + do_resize=do_resize, + size=size_divisor, # Here, size_divisor is used as a parameter for optimal resizing instead of size. + resample=resample, + ) + + # All transformations expect numpy arrays. + images = [to_numpy_array(img) for img in images] + + if do_rescale and is_scaled_image(images[0]): + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. + input_data_format = infer_channel_dimension_format(images[0]) + + if do_resize: + images = [ + self.resize(image, size_divisor=size_divisor, resample=resample, input_data_format=input_data_format) + for image in images + ] + + if do_rescale: + images = [self.rescale(image, scale=1 / 255, input_data_format=input_data_format) for image in images] + + images = [ + to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images + ] + + data = {"pixel_values": images} + return BatchFeature(data=data, tensor_type=return_tensors) + + def post_process_depth_estimation( + self, + outputs: "DepthEstimatorOutput", + target_sizes: Optional[Union[TensorType, list[tuple[int, int]], None]] = None, + ) -> list[dict[str, TensorType]]: + """ + Converts the raw output of [`DepthEstimatorOutput`] into final depth predictions and depth PIL images. + Only supports PyTorch. + + Args: + outputs ([`DepthEstimatorOutput`]): + Raw outputs of the model. + target_sizes (`TensorType` or `list[tuple[int, int]]`, *optional*): + Tensor of shape `(batch_size, 2)` or list of tuples (`tuple[int, int]`) containing the target size + (height, width) of each image in the batch. If left to None, predictions will not be resized. + + Returns: + `list[dict[str, TensorType]]`: A list of dictionaries of tensors representing the processed depth + predictions. + """ + requires_backends(self, "torch") + + predicted_depth = outputs.predicted_depth + + if (target_sizes is not None) and (len(predicted_depth) != len(target_sizes)): + raise ValueError( + "Make sure that you pass in as many target sizes as the batch dimension of the predicted depth" + ) + + results = [] + target_sizes = [None] * len(predicted_depth) if target_sizes is None else target_sizes + for depth, target_size in zip(predicted_depth, target_sizes): + if target_size is not None: + depth = depth[None, None, ...] 
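+                # depth[None, None, ...] above adds batch and channel axes, giving a (1, 1, height, width)
+                # tensor so that nn.functional.interpolate below receives the 4D input it expects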
+ depth = torch.nn.functional.interpolate(depth, size=target_size, mode="bicubic", align_corners=False) + depth = depth.squeeze() + + results.append({"predicted_depth": depth}) + + return results + + +__all__ = ["GLPNImageProcessor"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glpn/modeling_glpn.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glpn/modeling_glpn.py new file mode 100644 index 0000000000000000000000000000000000000000..e326750743a1aa4cb30791cfde1d46b7a3af16e4 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/glpn/modeling_glpn.py @@ -0,0 +1,724 @@ +# coding=utf-8 +# Copyright 2022 KAIST and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch GLPN model.""" + +import math +from typing import Optional, Union + +import torch +from torch import nn + +from ...activations import ACT2FN +from ...modeling_outputs import BaseModelOutput, DepthEstimatorOutput +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import auto_docstring, logging +from .configuration_glpn import GLPNConfig + + +logger = logging.get_logger(__name__) + + +# Copied from transformers.models.beit.modeling_beit.drop_path +def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor: + """ + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks, + however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the + layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the + argument. 
+ """ + if drop_prob == 0.0 or not training: + return input + keep_prob = 1 - drop_prob + shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device) + random_tensor.floor_() # binarize + output = input.div(keep_prob) * random_tensor + return output + + +# Copied from transformers.models.segformer.modeling_segformer.SegformerDropPath +class GLPNDropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob: Optional[float] = None) -> None: + super().__init__() + self.drop_prob = drop_prob + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + return drop_path(hidden_states, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return f"p={self.drop_prob}" + + +# Copied from transformers.models.segformer.modeling_segformer.SegformerOverlapPatchEmbeddings +class GLPNOverlapPatchEmbeddings(nn.Module): + """Construct the overlapping patch embeddings.""" + + def __init__(self, patch_size, stride, num_channels, hidden_size): + super().__init__() + self.proj = nn.Conv2d( + num_channels, + hidden_size, + kernel_size=patch_size, + stride=stride, + padding=patch_size // 2, + ) + + self.layer_norm = nn.LayerNorm(hidden_size) + + def forward(self, pixel_values): + embeddings = self.proj(pixel_values) + _, _, height, width = embeddings.shape + # (batch_size, num_channels, height, width) -> (batch_size, num_channels, height*width) -> (batch_size, height*width, num_channels) + # this can be fed to a Transformer layer + embeddings = embeddings.flatten(2).transpose(1, 2) + embeddings = self.layer_norm(embeddings) + return embeddings, height, width + + +# Copied from transformers.models.segformer.modeling_segformer.SegformerEfficientSelfAttention +class GLPNEfficientSelfAttention(nn.Module): + """SegFormer's efficient self-attention mechanism. 
Employs the sequence reduction process introduced in the [PvT + paper](https://huggingface.co/papers/2102.12122).""" + + def __init__(self, config, hidden_size, num_attention_heads, sequence_reduction_ratio): + super().__init__() + self.hidden_size = hidden_size + self.num_attention_heads = num_attention_heads + + if self.hidden_size % self.num_attention_heads != 0: + raise ValueError( + f"The hidden size ({self.hidden_size}) is not a multiple of the number of attention " + f"heads ({self.num_attention_heads})" + ) + + self.attention_head_size = int(self.hidden_size / self.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(self.hidden_size, self.all_head_size) + self.key = nn.Linear(self.hidden_size, self.all_head_size) + self.value = nn.Linear(self.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + self.sr_ratio = sequence_reduction_ratio + if sequence_reduction_ratio > 1: + self.sr = nn.Conv2d( + hidden_size, hidden_size, kernel_size=sequence_reduction_ratio, stride=sequence_reduction_ratio + ) + self.layer_norm = nn.LayerNorm(hidden_size) + + def forward( + self, + hidden_states, + height, + width, + output_attentions=False, + ): + batch_size, seq_length, _ = hidden_states.shape + query_layer = ( + self.query(hidden_states) + .view(batch_size, -1, self.num_attention_heads, self.attention_head_size) + .transpose(1, 2) + ) + + if self.sr_ratio > 1: + batch_size, seq_len, num_channels = hidden_states.shape + # Reshape to (batch_size, num_channels, height, width) + hidden_states = hidden_states.permute(0, 2, 1).reshape(batch_size, num_channels, height, width) + # Apply sequence reduction + hidden_states = self.sr(hidden_states) + # Reshape back to (batch_size, seq_len, num_channels) + hidden_states = hidden_states.reshape(batch_size, num_channels, -1).permute(0, 2, 1) + hidden_states = self.layer_norm(hidden_states) + + key_layer = ( + self.key(hidden_states) + .view(batch_size, -1, self.num_attention_heads, self.attention_head_size) + .transpose(1, 2) + ) + value_layer = ( + self.value(hidden_states) + .view(batch_size, -1, self.num_attention_heads, self.attention_head_size) + .transpose(1, 2) + ) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs = self.dropout(attention_probs) + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + +# Copied from transformers.models.segformer.modeling_segformer.SegformerSelfOutput +class GLPNSelfOutput(nn.Module): + def __init__(self, config, hidden_size): + super().__init__() + self.dense = nn.Linear(hidden_size, hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + return hidden_states + + +# Copied from transformers.models.segformer.modeling_segformer.SegformerAttention with Segformer->GLPN +class GLPNAttention(nn.Module): + def __init__(self, config, hidden_size, num_attention_heads, sequence_reduction_ratio): + super().__init__() + self.self = GLPNEfficientSelfAttention( + config=config, + hidden_size=hidden_size, + num_attention_heads=num_attention_heads, + sequence_reduction_ratio=sequence_reduction_ratio, + ) + self.output = GLPNSelfOutput(config, hidden_size=hidden_size) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward(self, hidden_states, height, width, output_attentions=False): + self_outputs = self.self(hidden_states, height, width, output_attentions) + + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.segformer.modeling_segformer.SegformerDWConv +class GLPNDWConv(nn.Module): + def __init__(self, dim=768): + super().__init__() + self.dwconv = nn.Conv2d(dim, dim, 3, 1, 1, bias=True, groups=dim) + + def forward(self, hidden_states, height, width): + batch_size, seq_len, num_channels = hidden_states.shape + hidden_states = hidden_states.transpose(1, 2).view(batch_size, num_channels, height, width) + hidden_states = self.dwconv(hidden_states) + hidden_states = hidden_states.flatten(2).transpose(1, 2) + + return hidden_states + + +# Copied from transformers.models.segformer.modeling_segformer.SegformerMixFFN with Segformer->GLPN +class GLPNMixFFN(nn.Module): + def __init__(self, config, in_features, hidden_features=None, out_features=None): + super().__init__() + out_features = out_features or in_features + self.dense1 = nn.Linear(in_features, hidden_features) + self.dwconv = GLPNDWConv(hidden_features) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + 
else: + self.intermediate_act_fn = config.hidden_act + self.dense2 = nn.Linear(hidden_features, out_features) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, height, width): + hidden_states = self.dense1(hidden_states) + hidden_states = self.dwconv(hidden_states, height, width) + hidden_states = self.intermediate_act_fn(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.dense2(hidden_states) + hidden_states = self.dropout(hidden_states) + return hidden_states + + +# Copied from transformers.models.segformer.modeling_segformer.SegformerLayer with Segformer->GLPN +class GLPNLayer(nn.Module): + """This corresponds to the Block class in the original implementation.""" + + def __init__(self, config, hidden_size, num_attention_heads, drop_path, sequence_reduction_ratio, mlp_ratio): + super().__init__() + self.layer_norm_1 = nn.LayerNorm(hidden_size) + self.attention = GLPNAttention( + config, + hidden_size=hidden_size, + num_attention_heads=num_attention_heads, + sequence_reduction_ratio=sequence_reduction_ratio, + ) + self.drop_path = GLPNDropPath(drop_path) if drop_path > 0.0 else nn.Identity() + self.layer_norm_2 = nn.LayerNorm(hidden_size) + mlp_hidden_size = int(hidden_size * mlp_ratio) + self.mlp = GLPNMixFFN(config, in_features=hidden_size, hidden_features=mlp_hidden_size) + + def forward(self, hidden_states, height, width, output_attentions=False): + self_attention_outputs = self.attention( + self.layer_norm_1(hidden_states), # in GLPN, layernorm is applied before self-attention + height, + width, + output_attentions=output_attentions, + ) + + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + # first residual connection (with stochastic depth) + attention_output = self.drop_path(attention_output) + hidden_states = attention_output + hidden_states + + mlp_output = self.mlp(self.layer_norm_2(hidden_states), height, width) + + # second residual connection (with stochastic depth) + mlp_output = self.drop_path(mlp_output) + layer_output = mlp_output + hidden_states + + outputs = (layer_output,) + outputs + + return outputs + + +class GLPNEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + + # stochastic depth decay rule + dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths), device="cpu")] + + # patch embeddings + embeddings = [] + for i in range(config.num_encoder_blocks): + embeddings.append( + GLPNOverlapPatchEmbeddings( + patch_size=config.patch_sizes[i], + stride=config.strides[i], + num_channels=config.num_channels if i == 0 else config.hidden_sizes[i - 1], + hidden_size=config.hidden_sizes[i], + ) + ) + self.patch_embeddings = nn.ModuleList(embeddings) + + # Transformer blocks + blocks = [] + cur = 0 + for i in range(config.num_encoder_blocks): + # each block consists of layers + layers = [] + if i != 0: + cur += config.depths[i - 1] + for j in range(config.depths[i]): + layers.append( + GLPNLayer( + config, + hidden_size=config.hidden_sizes[i], + num_attention_heads=config.num_attention_heads[i], + drop_path=dpr[cur + j], + sequence_reduction_ratio=config.sr_ratios[i], + mlp_ratio=config.mlp_ratios[i], + ) + ) + blocks.append(nn.ModuleList(layers)) + + self.block = nn.ModuleList(blocks) + + # Layer norms + self.layer_norm = nn.ModuleList( + [nn.LayerNorm(config.hidden_sizes[i]) for i in range(config.num_encoder_blocks)] + ) + + def forward( + 
self, + pixel_values, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + batch_size = pixel_values.shape[0] + + hidden_states = pixel_values + for idx, x in enumerate(zip(self.patch_embeddings, self.block, self.layer_norm)): + embedding_layer, block_layer, norm_layer = x + # first, obtain patch embeddings + hidden_states, height, width = embedding_layer(hidden_states) + # second, send embeddings through blocks + for i, blk in enumerate(block_layer): + layer_outputs = blk(hidden_states, height, width, output_attentions) + hidden_states = layer_outputs[0] + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + # third, apply layer norm + hidden_states = norm_layer(hidden_states) + # fourth, optionally reshape back to (batch_size, num_channels, height, width) + hidden_states = hidden_states.reshape(batch_size, height, width, -1).permute(0, 3, 1, 2).contiguous() + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +@auto_docstring +class GLPNPreTrainedModel(PreTrainedModel): + config: GLPNConfig + base_model_prefix = "glpn" + main_input_name = "pixel_values" + _no_split_modules = [] + + # Copied from transformers.models.segformer.modeling_segformer.SegformerPreTrainedModel._init_weights + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, (nn.LayerNorm, nn.BatchNorm2d)): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +@auto_docstring +class GLPNModel(GLPNPreTrainedModel): + # Copied from transformers.models.segformer.modeling_segformer.SegformerModel.__init__ with Segformer->GLPN + def __init__(self, config): + super().__init__(config) + self.config = config + + # hierarchical Transformer encoder + self.encoder = GLPNEncoder(config) + + # Initialize weights and apply final processing + self.post_init() + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @auto_docstring + # Copied from transformers.models.segformer.modeling_segformer.SegformerModel.forward + def forward( + self, + pixel_values: torch.FloatTensor, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, BaseModelOutput]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + encoder_outputs = self.encoder( + pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + + if not return_dict: + return (sequence_output,) + encoder_outputs[1:] + + return BaseModelOutput( + last_hidden_state=sequence_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +class GLPNSelectiveFeatureFusion(nn.Module): + """ + Selective Feature Fusion module, as explained in the [paper](https://huggingface.co/papers/2201.07436) (section 3.4). This + module adaptively selects and integrates local and global features by attaining an attention map for each feature. + """ + + def __init__(self, in_channel=64): + super().__init__() + + self.convolutional_layer1 = nn.Sequential( + nn.Conv2d(in_channels=int(in_channel * 2), out_channels=in_channel, kernel_size=3, stride=1, padding=1), + nn.BatchNorm2d(in_channel), + nn.ReLU(), + ) + + self.convolutional_layer2 = nn.Sequential( + nn.Conv2d(in_channels=in_channel, out_channels=int(in_channel / 2), kernel_size=3, stride=1, padding=1), + nn.BatchNorm2d(int(in_channel / 2)), + nn.ReLU(), + ) + + self.convolutional_layer3 = nn.Conv2d( + in_channels=int(in_channel / 2), out_channels=2, kernel_size=3, stride=1, padding=1 + ) + + self.sigmoid = nn.Sigmoid() + + def forward(self, local_features, global_features): + # concatenate features along the channel dimension + features = torch.cat((local_features, global_features), dim=1) + # pass through convolutional layers + features = self.convolutional_layer1(features) + features = self.convolutional_layer2(features) + features = self.convolutional_layer3(features) + # apply sigmoid to get two-channel attention map + attn = self.sigmoid(features) + # construct hybrid features by adding element-wise + hybrid_features = local_features * attn[:, 0, :, :].unsqueeze(1) + global_features * attn[ + :, 1, :, : + ].unsqueeze(1) + + return hybrid_features + + +class GLPNDecoderStage(nn.Module): + def __init__(self, in_channels, out_channels): + super().__init__() + should_skip = in_channels == out_channels + self.convolution = nn.Conv2d(in_channels, out_channels, kernel_size=1) if not should_skip else nn.Identity() + self.fusion = GLPNSelectiveFeatureFusion(out_channels) + self.upsample = nn.Upsample(scale_factor=2, mode="bilinear", align_corners=False) + + def forward(self, hidden_state, residual=None): + hidden_state = self.convolution(hidden_state) + if residual is not None: + hidden_state = self.fusion(hidden_state, residual) + hidden_state = self.upsample(hidden_state) + + return hidden_state + + 
hidden_state = self.upsample(hidden_state) + return hidden_state + + +class GLPNDecoder(nn.Module): + def __init__(self, config): + super().__init__() + # we use features from end -> start + reserved_hidden_sizes = config.hidden_sizes[::-1] + out_channels = config.decoder_hidden_size + + self.stages = nn.ModuleList( + [GLPNDecoderStage(hidden_size, out_channels) for hidden_size in reserved_hidden_sizes] + ) + # don't fuse in first stage + self.stages[0].fusion = None + + self.final_upsample = nn.Upsample(scale_factor=2, mode="bilinear", align_corners=False) + + def forward(self, hidden_states: list[torch.Tensor]) -> list[torch.Tensor]: + stage_hidden_states = [] + stage_hidden_state = None + for hidden_state, stage in zip(hidden_states[::-1], self.stages): + stage_hidden_state = stage(hidden_state, stage_hidden_state) + stage_hidden_states.append(stage_hidden_state) + + stage_hidden_states[-1] = self.final_upsample(stage_hidden_state) + + return stage_hidden_states + + +class SiLogLoss(nn.Module): + r""" + Implements the Scale-invariant log scale loss [Eigen et al., 2014](https://huggingface.co/papers/1406.2283). + + $$L=\frac{1}{n} \sum_{i} d_{i}^{2}-\frac{1}{2 n^{2}}\left(\sum_{i} d_{i}^{2}\right)$$ where $d_{i}=\log y_{i}-\log + y_{i}^{*}$. + + """ + + def __init__(self, lambd=0.5): + super().__init__() + self.lambd = lambd + + def forward(self, pred, target): + valid_mask = (target > 0).detach() + diff_log = torch.log(target[valid_mask]) - torch.log(pred[valid_mask]) + loss = torch.sqrt(torch.pow(diff_log, 2).mean() - self.lambd * torch.pow(diff_log.mean(), 2)) + + return loss + + +class GLPNDepthEstimationHead(nn.Module): + def __init__(self, config): + super().__init__() + + self.config = config + + channels = config.decoder_hidden_size + self.head = nn.Sequential( + nn.Conv2d(channels, channels, kernel_size=3, stride=1, padding=1), + nn.ReLU(inplace=False), + nn.Conv2d(channels, 1, kernel_size=3, stride=1, padding=1), + ) + + def forward(self, hidden_states: list[torch.Tensor]) -> torch.Tensor: + # use last features of the decoder + hidden_states = hidden_states[self.config.head_in_index] + + hidden_states = self.head(hidden_states) + + predicted_depth = torch.sigmoid(hidden_states) * self.config.max_depth + predicted_depth = predicted_depth.squeeze(dim=1) + + return predicted_depth + + +@auto_docstring( + custom_intro=""" + GLPN Model transformer with a lightweight depth estimation head on top e.g. for KITTI, NYUv2. + """ +) +class GLPNForDepthEstimation(GLPNPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.glpn = GLPNModel(config) + self.decoder = GLPNDecoder(config) + self.head = GLPNDepthEstimationHead(config) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + pixel_values: torch.FloatTensor, + labels: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple[torch.Tensor], DepthEstimatorOutput]: + r""" + labels (`torch.FloatTensor` of shape `(batch_size, height, width)`, *optional*): + Ground truth depth estimation maps for computing the loss. 
+ + Examples: + + ```python + >>> from transformers import AutoImageProcessor, GLPNForDepthEstimation + >>> import torch + >>> import numpy as np + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("vinvino02/glpn-kitti") + >>> model = GLPNForDepthEstimation.from_pretrained("vinvino02/glpn-kitti") + + >>> # prepare image for the model + >>> inputs = image_processor(images=image, return_tensors="pt") + + >>> with torch.no_grad(): + ... outputs = model(**inputs) + + >>> # interpolate to original size + >>> post_processed_output = image_processor.post_process_depth_estimation( + ... outputs, + ... target_sizes=[(image.height, image.width)], + ... ) + + >>> # visualize the prediction + >>> predicted_depth = post_processed_output[0]["predicted_depth"] + >>> depth = predicted_depth * 255 / predicted_depth.max() + >>> depth = depth.detach().cpu().numpy() + >>> depth = Image.fromarray(depth.astype("uint8")) + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + outputs = self.glpn( + pixel_values, + output_attentions=output_attentions, + output_hidden_states=True, # we need the intermediate hidden states + return_dict=return_dict, + ) + + hidden_states = outputs.hidden_states if return_dict else outputs[1] + + out = self.decoder(hidden_states) + predicted_depth = self.head(out) + + loss = None + if labels is not None: + loss_fct = SiLogLoss() + loss = loss_fct(predicted_depth, labels) + + if not return_dict: + if output_hidden_states: + output = (predicted_depth,) + outputs[1:] + else: + output = (predicted_depth,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return DepthEstimatorOutput( + loss=loss, + predicted_depth=predicted_depth, + hidden_states=outputs.hidden_states if output_hidden_states else None, + attentions=outputs.attentions, + ) + + +__all__ = ["GLPNForDepthEstimation", "GLPNLayer", "GLPNModel", "GLPNPreTrainedModel"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/got_ocr2/__pycache__/image_processing_got_ocr2_fast.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/got_ocr2/__pycache__/image_processing_got_ocr2_fast.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..002683cf71f0f796812670a5e0c1407c55de691c Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/got_ocr2/__pycache__/image_processing_got_ocr2_fast.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/got_ocr2/__pycache__/modular_got_ocr2.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/got_ocr2/__pycache__/modular_got_ocr2.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..55a5af43df7fd5f2920d3b2ba62e299ed7e42c26 Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/got_ocr2/__pycache__/modular_got_ocr2.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/gpt_bigcode/__pycache__/__init__.cpython-312.pyc 
b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/gpt_bigcode/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2886cd73a8e8e9652bf80af6c1d2fb39090abd53 Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/gpt_bigcode/__pycache__/__init__.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/gpt_bigcode/__pycache__/configuration_gpt_bigcode.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/gpt_bigcode/__pycache__/configuration_gpt_bigcode.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2a802f9f28b3a6f9b73604cc0eed6e5b92121f5c Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/gpt_bigcode/__pycache__/configuration_gpt_bigcode.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/gpt_bigcode/__pycache__/modeling_gpt_bigcode.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/gpt_bigcode/__pycache__/modeling_gpt_bigcode.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ea78d30c12d1b02e209d69e8bd556fe9b4162393 Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/gpt_bigcode/__pycache__/modeling_gpt_bigcode.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/granite/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/granite/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bea7fb7dd186d14271b5dee64217510d4ef5fb27 Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/granite/__pycache__/__init__.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/granite/__pycache__/configuration_granite.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/granite/__pycache__/configuration_granite.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b5d703ccb660b9398dc140c39e9ba3301f9f46d1 Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/granite/__pycache__/configuration_granite.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/granite/__pycache__/modeling_granite.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/granite/__pycache__/modeling_granite.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6cf5be07abf6b88d1118d99530f1858ba792c028 Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/granite/__pycache__/modeling_granite.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/granite/__pycache__/modular_granite.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/granite/__pycache__/modular_granite.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9ca96781248f6aedc080cca9f7920dff45d95af8 Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/granite/__pycache__/modular_granite.cpython-312.pyc differ diff --git 
a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/granite_speech/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/granite_speech/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d6122581855250911b0bf951a42e22d9c354240a --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/granite_speech/__init__.py @@ -0,0 +1,29 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_granite_speech import * + from .feature_extraction_granite_speech import * + from .modeling_granite_speech import * + from .processing_granite_speech import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/granite_speech/configuration_granite_speech.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/granite_speech/configuration_granite_speech.py new file mode 100644 index 0000000000000000000000000000000000000000..fede07b7b7e820e78f44538313a85d39afc811d7 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/granite_speech/configuration_granite_speech.py @@ -0,0 +1,198 @@ +# coding=utf-8 +# Copyright 2025 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Config class for Granite Speech.""" + +from ...configuration_utils import PretrainedConfig +from ..auto import CONFIG_MAPPING, AutoConfig + + +class GraniteSpeechEncoderConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`GraniteSpeechCTCEncoder`]. It is used to instantiate + a Granite Speech audio encoder according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the audio encoder of the Granite Speech + architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + input_dim (`int`, *optional*, defaults to 160): + Dimension of the first hidden layer of the encoder.
+ num_layers (`int`, *optional*, defaults to 10): + Number of encoder blocks. + hidden_dim (`int`, *optional*, defaults to 1024): + The size of the intermediate layers in the conformer encoder. + feedforward_mult (`int`, *optional*, defaults to 4): + Multiplier for the up/down projections in the encoder's feedforward layers; + The projections will have intermediate dim of size `hidden_dim * feedforward_mult`. + num_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer encoder. + dim_head (`int`, *optional*, defaults to 128): + Dimension of attention heads for each attention layer in the Transformer encoder. + output_dim (`int`, *optional*, defaults to 42): + Intermediate dimension of the feedforward projections in the conformer + to be added to every other encoder block's output. + context_size (`int`, *optional*, defaults to 200): + Context size to be used in conformer attention. + max_pos_emb (`int`, *optional*, defaults to 512): + Max pos embeds to be used in attention (shaw's relative positional encoding). + dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for fully connected layers in the encoder. + conv_kernel_size (`int`, *optional*, defaults to 15): + Kernel size to be used for 1D convolution in each conformer block. + conv_expansion_factor (`int`, *optional*, defaults to 2): + Intermediate dimension to be used in conformer convolutions. + + Example: + + ```python + >>> from transformers import GraniteSpeechEncoderConfig, GraniteSpeechCTCEncoder + + >>> # Initializing a GraniteSpeechEncoderConfig + >>> configuration = GraniteSpeechEncoderConfig() + + >>> # Initializing a GraniteSpeechCTCEncoder (with random weights) + >>> model = GraniteSpeechCTCEncoder(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "granite_speech_encoder" + + def __init__( + self, + input_dim=160, + num_layers=10, + hidden_dim=1024, + feedforward_mult=4, + num_heads=8, + dim_head=128, + output_dim=42, + context_size=200, + max_pos_emb=512, + dropout=0.1, + conv_kernel_size=15, + conv_expansion_factor=2, + **kwargs, + ): + super().__init__(**kwargs) + self.input_dim = input_dim + self.num_layers = num_layers + self.hidden_dim = hidden_dim + self.feedforward_mult = feedforward_mult + self.num_heads = num_heads + self.dim_head = dim_head + self.output_dim = output_dim + self.context_size = context_size + self.dropout = dropout + self.conv_kernel_size = conv_kernel_size + self.conv_expansion_factor = conv_expansion_factor + self.max_pos_emb = max_pos_emb + + +class GraniteSpeechConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`GraniteSpeechForConditionalGeneration`]. It is used to instantiate an + Granite Speech model according to the specified arguments, defining the model architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `GraniteConfig`): + The config object or dictionary of the text backbone. + encoder_config (`GraniteSpeechEncoderConfig`, *optional*): + The config object or dictionary of the Granite Speech CTC Encoder. + projector_config (`Union[AutoConfig, dict]`, *optional*, defaults to `Blip2QFormerConfig`): + The config object or dictionary of the audio projector. 
+ audio_token_index (`int`, *optional*, defaults to 49155): + The audio token index to encode the audio prompt. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + has_lora_adapter (`bool`, *optional*, defaults to `True`): + Indicates whether or not the model has a lora adapter that should only + be activate when processing audio inputs. + downsample_rate (`int`, *optional*, defaults to 5): + Downsample rate for the audio feature extractor. + window_size (`int`, *optional*, defaults to 15): + Window size for the audio feature projector. + + Example: + + ```python + >>> from transformers import GraniteSpeechConfig, GraniteSpeechForConditionalGeneration + + >>> # Initializing a GraniteSpeechConfig + >>> configuration = GraniteSpeechConfig() + + >>> # Initializing a GraniteSpeechForConditionalGeneration (with random weights) + >>> model = GraniteSpeechForConditionalGeneration(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "granite_speech" + attribute_map = { + "audio_token_id": "audio_token_index", + } + sub_configs = { + "text_config": AutoConfig, + "encoder_config": GraniteSpeechEncoderConfig, + "projector_config": AutoConfig, + } + + def __init__( + self, + text_config=None, + encoder_config=None, + projector_config=None, + audio_token_index=49155, + initializer_range=0.02, + has_lora_adapter=True, + downsample_rate=5, + window_size=15, + **kwargs, + ): + if isinstance(text_config, dict): + text_config["model_type"] = text_config.get("model_type", "granite") + text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) + elif text_config is None: + text_config = CONFIG_MAPPING["granite"]() + + if isinstance(projector_config, dict): + projector_config["model_type"] = projector_config.get("model_type", "blip_2_qformer") + projector_config = CONFIG_MAPPING[projector_config["model_type"]](**projector_config) + elif projector_config is None: + projector_config = CONFIG_MAPPING["blip_2_qformer"]() + + if not isinstance(encoder_config, GraniteSpeechEncoderConfig): + encoder_config = {} if encoder_config is None else encoder_config + encoder_config = GraniteSpeechEncoderConfig(**encoder_config) + + self.text_config = text_config + self.encoder_config = encoder_config + self.projector_config = projector_config + self.audio_token_index = audio_token_index + self.initializer_range = initializer_range + self.has_lora_adapter = has_lora_adapter + self.downsample_rate = downsample_rate + self.window_size = window_size + super().__init__(**kwargs) + + +__all__ = ["GraniteSpeechEncoderConfig", "GraniteSpeechConfig"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/granite_speech/feature_extraction_granite_speech.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/granite_speech/feature_extraction_granite_speech.py new file mode 100644 index 0000000000000000000000000000000000000000..7528fc7ea5bd9efa6ae322d7fd2e40b567855359 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/granite_speech/feature_extraction_granite_speech.py @@ -0,0 +1,186 @@ +# coding=utf-8 +# Copyright 2025 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Feature extractor class for Granite Speech.""" + +import math +from collections.abc import Sequence +from typing import Optional + +import numpy as np + +from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin +from ...tokenization_utils_base import AudioInput +from ...utils import is_torch_available, is_torchaudio_available, logging +from ...utils.import_utils import requires_backends + + +logger = logging.get_logger(__name__) + +if is_torch_available(): + import torch + +if is_torchaudio_available(): + import torchaudio + + +class GraniteSpeechFeatureExtractor(FeatureExtractionMixin): + model_input_names = ["input_features"] + + def __init__( + self, + sampling_rate: int = 16000, + n_fft: int = 512, + win_length: int = 400, + hop_length: int = 160, + n_mels: int = 80, + projector_window_size: int = 15, + projector_downsample_rate: int = 5, + **kwargs, + ): + super().__init__(**kwargs) + self.sampling_rate = sampling_rate + self.melspec_kwargs = { + "sample_rate": sampling_rate, + "n_fft": n_fft, + "win_length": win_length, + "hop_length": hop_length, + "n_mels": n_mels, + } + requires_backends(self, ["torchaudio"]) + self.mel_filters = torchaudio.transforms.MelSpectrogram(**self.melspec_kwargs) + self.projector_window_size = projector_window_size + self.projector_downsample_rate = projector_downsample_rate + + def __call__( + self, + audios: AudioInput, + device: Optional[str] = "cpu", + ) -> BatchFeature: + requires_backends(self, ["torchaudio"]) + + speech_inputs = {} + batched_audio, audio_lengths = self._get_audios_and_audio_lengths(audios) + speech_inputs["input_features"] = self._extract_mel_spectrograms( + batched_audio, + device=device, + ) + audio_embed_sizes = self._get_num_audio_features(audio_lengths) + speech_inputs["audio_embed_sizes"] = audio_embed_sizes + # TODO (@alex-jw-brooks): Currently input_features_mask is not + # a great name, because input_features and input_features_mask + # have different shapes (before/after the projector). + # + # We should align this with other multimodal models, e.g,. llava + # and qwen2audio and refactor this to ensure input_feature_mask + # has the same dimensionality as input_features, or compute it in + # the model based on the audio embedding sizes (since we do not + # have an attention mask for the audio features to infer padding from). + speech_inputs["input_features_mask"] = torch.arange(max(audio_embed_sizes)).view(1, -1) < torch.tensor( + audio_embed_sizes + ).view(-1, 1) + return BatchFeature(data=speech_inputs) + + def _extract_mel_spectrograms(self, audio: "torch.Tensor", device="cpu"): + """ + Compute the Mel features to be passed to the conformer encoder. 
+ """ + requires_backends(self, ["torchaudio"]) + if device is not None: + melspec = self.mel_filters.to(device) + audio = audio.to(device) + else: + melspec = self.mel_filters + + bsz = audio.shape[0] + with torch.no_grad(): + # Compute mel features + mel = melspec(audio.float()) + logmel = mel.transpose(-1, -2).clip_(min=1e-10).log10_() + mx = logmel.amax(dim=(-2, -1), keepdim=True) + logmel = torch.maximum(logmel, mx - 8.0).div_(4).add_(1) + # remove last frame if odd + if logmel.shape[1] % 2 == 1: + logmel = logmel[:, :-1] + + # stacking and skipping by 2 + audio = logmel.reshape(bsz, -1, 2 * logmel.shape[-1]) + + return audio + + def _get_num_audio_features(self, audio_lengths: Sequence[int]) -> Sequence[int]: + """ + Gets the (variable length) number of features (i.e., projector output) for the sequences + being considered. + + Args: + audio_lengths (`Sequence[int]`): + Sequence of one or more raw audio lengths. + """ + hop_length = self.melspec_kwargs["hop_length"] + effective_window_size = self.projector_window_size // self.projector_downsample_rate + + projector_lengths = [] + for raw_length in audio_lengths: + # mel sequence length computation + mel_length = raw_length // hop_length + 1 + # encoder frame takes two mel features + encoder_length = mel_length // 2 + nblocks = math.ceil(encoder_length / self.projector_window_size) + # projector output length + projector_length = nblocks * effective_window_size + projector_lengths.append(projector_length) + + return projector_lengths + + def _get_audios_and_audio_lengths(self, audios: AudioInput) -> Sequence["torch.Tensor", Sequence[int]]: + """ + Coerces audio inputs to torch tensors and extracts audio lengths prior to stacking. + + Args: + audios (`AudioInput`): + Audio sequence, numpy array, or torch tensor. + """ + requires_backends(self, ["torch"]) + + # Coerce to PyTorch tensors if we have numpy arrays, since + # currently we have a dependency on torch/torchaudio anyway + if isinstance(audios, np.ndarray): + audios = torch.from_numpy(audios) + elif isinstance(audios, Sequence) and isinstance(audios[0], np.ndarray): + audios = [torch.from_numpy(arr) for arr in audios] + + if isinstance(audios, torch.Tensor): + if audios.ndim == 1: + audios = audios.unsqueeze(0) + if not torch.is_floating_point(audios): + raise ValueError("Invalid audio provided. Audio should be a floating point between 0 and 1") + + if audios.shape[0] > 1: + logger.warning("Audio samples are already collated; assuming they all have the same length") + lengths = [audios.shape[-1]] * audios.shape[0] + return audios, lengths + + elif isinstance(audios, Sequence) and isinstance(audios[0], torch.Tensor): + if not torch.is_floating_point(audios[0]): + raise ValueError("Invalid audio provided. Audio should be a floating point between 0 and 1") + lengths = [audio.shape[-1] for audio in audios] + audios = [audio.squeeze(0) for audio in audios] + audios = torch.nn.utils.rnn.pad_sequence(audios, batch_first=True, padding_value=0.0) + return audios, lengths + + raise TypeError("Invalid audio provided. 
Audio should be one or more torch tensors or numpy arrays") + + +__all__ = ["GraniteSpeechFeatureExtractor"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/granite_speech/modeling_granite_speech.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/granite_speech/modeling_granite_speech.py new file mode 100644 index 0000000000000000000000000000000000000000..1e44c9781dec683cf4b12bcb9d55d32b3635fb53 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/granite_speech/modeling_granite_speech.py @@ -0,0 +1,577 @@ +# coding=utf-8 +# Copyright 2025 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from dataclasses import dataclass +from typing import Optional, Union + +import torch +import torch.nn.functional as F +from torch import nn + +from ...cache_utils import Cache +from ...generation import GenerationMixin +from ...modeling_outputs import ModelOutput +from ...modeling_utils import PreTrainedModel +from ...utils import auto_docstring, is_peft_available, logging +from ..auto import AutoModel, AutoModelForCausalLM +from .configuration_granite_speech import GraniteSpeechConfig, GraniteSpeechEncoderConfig + + +logger = logging.get_logger(__name__) + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for Granite Speech causal language model (or autoregressive) outputs. + """ +) +class GraniteSpeechCausalLMOutputWithPast(ModelOutput): + r""" + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache). + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding.
+ """ + + loss: Optional[torch.FloatTensor] = None + logits: Optional[torch.FloatTensor] = None + past_key_values: Optional[Cache] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + + +### Projector +class GraniteSpeechEncoderProjector(nn.Module): + def __init__(self, config: GraniteSpeechConfig): + super().__init__() + self.hidden_size = config.projector_config.hidden_size + self.downsample_rate = config.downsample_rate + self.window_size = config.window_size + self.num_queries = config.window_size // config.downsample_rate + + self.query = nn.Parameter(torch.zeros(1, self.num_queries, config.projector_config.hidden_size)) + self.query.data.normal_(mean=0.0, std=1.0) + + # By default, this will be a blip_2_qformer config + self.qformer = AutoModel.from_config(config.projector_config) + self.linear = nn.Linear(config.projector_config.hidden_size, config.text_config.hidden_size) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + batch_size, seq_len, dim = hidden_states.size() + nblocks = math.ceil(seq_len / self.window_size) + pad = nblocks * self.window_size - seq_len + hidden_states = nn.functional.pad(hidden_states, (0, 0, 0, pad), "constant", 0) + hidden_states = hidden_states.view(batch_size * nblocks, self.window_size, dim) + + query_output = self.qformer( + query_embeds=self.query, + encoder_hidden_states=hidden_states, + encoder_attention_mask=None, + return_dict=True, + ) + query_proj = self.linear( + query_output.last_hidden_state.view(batch_size, nblocks * self.window_size // self.downsample_rate, -1) + ) + return query_proj + + +### Encoder - conformer is adapted from: https://github.com/lucidrains/conformer.git +class GraniteSpeechConformerFeedForward(nn.Module): + """Feedforward module for conformer encoder blocks.""" + + def __init__(self, config: GraniteSpeechEncoderConfig): + super().__init__() + self.pre_norm = nn.LayerNorm(config.hidden_dim) + self.up_proj = nn.Linear(config.hidden_dim, config.hidden_dim * config.feedforward_mult) + self.silu = nn.SiLU() + self.dropout = nn.Dropout(config.dropout) + self.down_proj = nn.Linear(config.hidden_dim * config.feedforward_mult, config.hidden_dim) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.pre_norm(hidden_states) + hidden_states = self.up_proj(hidden_states) + hidden_states = self.dropout(self.silu(hidden_states)) + hidden_states = self.down_proj(hidden_states) + hidden_states = self.dropout(hidden_states) + return hidden_states + + +class GraniteSpeechConformerAttention(nn.Module): + """Attention for conformer blocks using Shaw's relative positional embeddings. + See the following [paper](https://huggingface.co/papers/1803.02155) for more details. 
+ """ + + def __init__(self, config: GraniteSpeechEncoderConfig): + super().__init__() + + inner_dim = config.dim_head * config.num_heads + self.max_pos_emb = config.max_pos_emb + self.context_size = config.context_size + self.num_heads = config.num_heads + self.dim_head = config.dim_head + self.scale = self.dim_head**-0.5 + self.pre_norm = nn.LayerNorm(config.hidden_dim) + self.to_q = nn.Linear(config.hidden_dim, inner_dim, bias=False) + self.to_kv = nn.Linear(config.hidden_dim, inner_dim * 2, bias=False) + self.to_out = nn.Linear(inner_dim, config.hidden_dim) + self.rel_pos_emb = nn.Embedding(2 * self.max_pos_emb + 1, self.dim_head) + self.dropout = nn.Dropout(config.dropout) + + if self.context_size <= 0 or self.context_size > self.max_pos_emb: + raise ValueError("Context size is either less than 0 or exceeds the max_pos_emb") + + def forward(self, hidden_states: torch.Tensor, attention_dists: torch.Tensor) -> torch.Tensor: + hidden_states = self.pre_norm(hidden_states) + bsz, num_features, _ = hidden_states.shape + + num_blocks = math.ceil(num_features / self.context_size) + remainder = num_features % self.context_size + if remainder > 0: + # right padding to reach block size + hidden_states = torch.nn.functional.pad(hidden_states, (0, 0, 0, self.context_size - remainder)) + + query_states = self.to_q(hidden_states) + key_states, value_states = self.to_kv(hidden_states).chunk(2, dim=-1) + + query_states = query_states.reshape(bsz, num_blocks, self.context_size, self.num_heads, -1).transpose(2, 3) + key_states = key_states.reshape(bsz, num_blocks, self.context_size, self.num_heads, -1).transpose(2, 3) + value_states = value_states.reshape(bsz, num_blocks, self.context_size, self.num_heads, -1).transpose(2, 3) + + # shaw's relative positional embedding + rel_pos_emb = self.rel_pos_emb(attention_dists) + # alternative computation of `pos_attn` - for readability + # rel_pos_emb_expanded = rel_pos_emb.view([1, 1, 1] + list(rel_pos_emb.shape)) + # pos_attn = torch.sum(query_states.unsqueeze(-2) * rel_pos_emb_expanded, dim=-1) * self.scale + # einsum implementation of pos_attn - gives x30 speedup over the alternative + # TODO (@avihu111) find a fast alternative to einsum + pos_attn = torch.einsum("b m h c d, c r d -> b m h c r", query_states, rel_pos_emb) * self.scale + + if remainder > 0: + # masked attention in the extended block + mask = torch.ones(self.context_size, self.context_size, dtype=bool, device=hidden_states.device) + mask[:remainder, :remainder] = 0 + mask_value = -torch.finfo(pos_attn.dtype).max + pos_attn[:, -1, :].masked_fill_(mask, mask_value) + + with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.MATH): + out = F.scaled_dot_product_attention( + query_states, key_states, value_states, attn_mask=pos_attn, scale=self.scale + ) + out = out.transpose(2, 3).reshape(bsz, hidden_states.shape[1], -1) + out = self.to_out(out[:, :num_features, :]) + return self.dropout(out) + + +class GraniteSpeechConformerDepthWiseConv1d(nn.Module): + """Wrapper for padded 1D pointwise convolution.""" + + def __init__(self, chan_in: int, chan_out: int, kernel_size: int): + super().__init__() + # Padding for the 1D conv is symmetric or close (i.e., offset by one). 
+ pad = kernel_size // 2 + pad_offset = (kernel_size + 1) % 2 + self.padding = (pad, pad - pad_offset) + + self.conv = nn.Conv1d(chan_in, chan_out, kernel_size, groups=chan_in, bias=False) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = F.pad(hidden_states, self.padding) + return self.conv(hidden_states) + + +class GraniteSpeechConformerConvModule(nn.Module): + """Conformer conv module consisting of several 1D/depthwise 1D convolutional layers.""" + + def __init__(self, config: GraniteSpeechEncoderConfig): + super().__init__() + inner_dim = config.hidden_dim * config.conv_expansion_factor + + self.norm = nn.LayerNorm(config.hidden_dim) + self.up_conv = nn.Conv1d(config.hidden_dim, inner_dim * 2, 1) + self.glu = nn.GLU(dim=1) + self.depth_conv = GraniteSpeechConformerDepthWiseConv1d( + inner_dim, + inner_dim, + kernel_size=config.conv_kernel_size, + ) + self.silu = nn.SiLU() + self.batch_norm = nn.BatchNorm1d(inner_dim) + self.down_conv = nn.Conv1d(inner_dim, config.hidden_dim, 1) + self.dropout = nn.Dropout(config.dropout) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.norm(hidden_states) + hidden_states = self.up_conv(hidden_states.permute(0, 2, 1)) + hidden_states = self.glu(hidden_states) + hidden_states = self.depth_conv(hidden_states) + hidden_states = self.silu(self.batch_norm(hidden_states)) + hidden_states = self.down_conv(hidden_states).permute(0, 2, 1) + hidden_states = self.dropout(hidden_states) + return hidden_states + + +class GraniteSpeechConformerBlock(nn.Module): + """Conformer block, consisting largely of linear layers, attention, and convolutional layers.""" + + def __init__(self, config: GraniteSpeechEncoderConfig): + super().__init__() + self.ff1 = GraniteSpeechConformerFeedForward(config) + self.attn = GraniteSpeechConformerAttention(config) + self.conv = GraniteSpeechConformerConvModule(config) + self.ff2 = GraniteSpeechConformerFeedForward(config) + self.post_norm = nn.LayerNorm(config.hidden_dim) + + def forward(self, hidden_states: torch.Tensor, attention_dists: torch.Tensor) -> torch.Tensor: + hidden_states = 0.5 * self.ff1(hidden_states) + hidden_states + hidden_states = self.attn(hidden_states, attention_dists=attention_dists) + hidden_states + hidden_states = self.conv(hidden_states) + hidden_states + hidden_states = 0.5 * self.ff2(hidden_states) + hidden_states + hidden_states = self.post_norm(hidden_states) + return hidden_states + + +class GraniteSpeechCTCEncoder(nn.Module): + def __init__(self, config: GraniteSpeechEncoderConfig): + super().__init__() + self.config = config + + # Precompute clamped relative positional encoding distances + seq = torch.arange(config.context_size) + relpos_dist = seq.view(-1, 1) - seq.view(1, -1) + attention_dists = torch.clamp(relpos_dist, -config.context_size, config.context_size) + config.max_pos_emb + self.register_buffer("attention_dists", attention_dists, persistent=False) + self.input_linear = nn.Linear(config.input_dim, config.hidden_dim, bias=True) + self.layers = nn.ModuleList([GraniteSpeechConformerBlock(config) for _ in range(config.num_layers)]) + + self.out = nn.Linear(config.hidden_dim, config.output_dim, bias=True) + self.out_mid = nn.Linear(config.output_dim, config.hidden_dim, bias=True) + self.num_layers = config.num_layers + + def forward(self, hidden_states: torch.Tensor): + hidden_states = self.input_linear(hidden_states) + for idx, layer in enumerate(self.layers, start=1): + hidden_states = layer(hidden_states, 
attention_dists=self.attention_dists) + + if idx == self.num_layers // 2: + hidden_states_mid = hidden_states.clone() + hidden_states_mid = self.out(hidden_states_mid) + hidden_states += self.out_mid(nn.Softmax(dim=-1)(hidden_states_mid)) + return hidden_states + + +@auto_docstring +class GraniteSpeechPreTrainedModel(PreTrainedModel): + config: GraniteSpeechConfig + + _supports_flash_attn = False # `blip_2_qformer` dependency does not allow for this + _supports_sdpa = True + + def _init_weights(self, module: nn.Module): + """Initialize the weights.""" + std = self.config.initializer_range + + if isinstance(module, (nn.Linear, nn.Conv1d)): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, (nn.LayerNorm, nn.BatchNorm1d)): + module.weight.data.fill_(1.0) + module.bias.data.zero_() + elif isinstance(module, GraniteSpeechEncoderProjector): + module.query.data.normal_() + + +@auto_docstring( + custom_intro=""" + The Granite Speech model, which consists of an audio encoder, projector, and language model. + """ +) +class GraniteSpeechForConditionalGeneration(GraniteSpeechPreTrainedModel, GenerationMixin): + def __init__(self, config: GraniteSpeechConfig): + super().__init__(config) + # NOTE: It doesn't matter when we initialize from config, but we should be careful + # to make sure this does not pick up the adapter_config if in the future we use + # from_pretrained or something similar, since that should be set by the composite + # model; don't need to consider it twice + self.language_model = AutoModelForCausalLM.from_config(config.text_config) + + if self.language_model._tied_weights_keys is not None: + self._tied_weights_keys = [f"language_model.{k}" for k in self.language_model._tied_weights_keys] + + self.encoder = GraniteSpeechCTCEncoder(config.encoder_config) + self.projector = GraniteSpeechEncoderProjector(config) + + if config.has_lora_adapter and not is_peft_available(): + logger.warning( + "Config indicates that a lora adapter should be present, but " + "peft is not installed; this will cause the model to perform " + "incorrectly when audio inputs are provided. Please install " + "peft and reload the model!" 
+ ) + + self.post_init() + + def set_input_embeddings(self, value): + self.language_model.set_input_embeddings(value) + + def set_output_embeddings(self, new_embeddings): + self.language_model.set_output_embeddings(new_embeddings) + + def get_input_embeddings(self): + return self.language_model.get_input_embeddings() + + def get_output_embeddings(self): + return self.language_model.get_output_embeddings() + + def get_audio_features(self, input_features: torch.Tensor) -> torch.Tensor: + """Get the audio features to merged into the multimodal embeddings.""" + encoder_embeds = self.encoder(input_features) + projected_embeds = self.projector(encoder_embeds) + return projected_embeds + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + input_features: Optional[torch.FloatTensor] = None, + input_features_mask: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **lm_kwargs, + ) -> Union[tuple[torch.Tensor], GraniteSpeechCausalLMOutputWithPast]: + r""" + input_features_mask (`torch.Tensor`, *optional*): + Mask to be applied to audio features prior to scattering into the language embeddings. + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + """ + # TODO (@alex-jw-brooks) add an example to this docstring once models are released + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if input_features is not None and inputs_embeds is not None: + raise ValueError( + "You cannot specify both input_features and inputs_embeds at the same time, and must specify either one" + ) + + if inputs_embeds is None: + # Get the base embeddings; set all audio tokens to 0 index + # to avoid out of vocabulary issues with the LLM embedding. + # Audio features will be masked into is_audio_idx indices later. 
+ is_audio_idx = input_ids == self.config.audio_token_id + llm_input_ids = input_ids.clone() + llm_input_ids[is_audio_idx] = 0 + inputs_embeds = self.get_input_embeddings()(llm_input_ids) + + if input_features is not None: + if input_features.dtype != self.dtype: + input_features = input_features.to(self.dtype) + # Get the audio features from the encoder / projector + audio_embeds = self.get_audio_features(input_features) + + # Merge the audio features into the LLM embeddings + inputs_embeds = self.get_merged_audio_embeddings( + input_ids=input_ids, + audio_features=audio_embeds, + input_features_mask=input_features_mask, + ) + + outputs = self.language_model( + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + logits_to_keep=logits_to_keep, + **lm_kwargs, + ) + logits = outputs[0] + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + if attention_mask is not None: + # we use the input attention mask to shift the logits and labels, because it is 2D. + # we also crop attn mask in case it is longer, which happens in PrefixTuning with peft + shift_attention_mask = attention_mask[:, -(logits.shape[1] - 1) :].to(logits.device) + shift_logits = logits[..., :-1, :][shift_attention_mask.to(logits.device) != 0].contiguous() + shift_labels = labels[..., 1:][shift_attention_mask.to(labels.device) != 0].contiguous() + else: + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct( + shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1).to(shift_logits.device) + ) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return GraniteSpeechCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + inputs_embeds=None, + input_features=None, + attention_mask=None, + cache_position=None, + logits_to_keep=None, + **kwargs, + ): + # Overwritten -- in specific circumstances we don't want to forward audio inputs to the model + + model_inputs = self.language_model.prepare_inputs_for_generation( + input_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + logits_to_keep=logits_to_keep, + **kwargs, + ) + + # If we're in cached decoding stage, input_features should be None because + # input ids do not contain special audio token anymore Otherwise we need + # input feature values to be passed to the model + if cache_position[0] == 0: + model_inputs["input_features"] = input_features + return model_inputs + + def get_merged_audio_embeddings( + self, input_ids: torch.Tensor, audio_features: torch.Tensor, input_features_mask: Optional[torch.Tensor] = None + ) -> torch.Tensor: + """ + Adds the audio token to the model's LLM vocabulary so that we can pass it + through the tokenizer; it's assumed that the embeddings corresponding to the + <|audio|> token will be clobbered with speech features. + + Args: + input_ids (`torch.Tensor`): + Input IDs containing one or more audio tokens. 
audio_features (`torch.Tensor`): + Audio features to be masked into the language embeddings to form multimodal embeddings. + input_features_mask (`torch.Tensor`, *optional*, defaults to `None`): + Mask to be applied to audio features prior to scattering into the language embeddings. + """ + is_audio_index = input_ids == self.config.audio_token_id + llm_input_ids = torch.where(is_audio_index, 0, input_ids) + inputs_embeds = self.language_model.get_input_embeddings()(llm_input_ids) # [bsz, # features, hidden size] + + # Mask the audio features into the text embeddings + special_audio_mask = is_audio_index.unsqueeze(-1) + audio_features = audio_features.to(inputs_embeds.device, inputs_embeds.dtype) + if input_features_mask is not None: + if torch.all(is_audio_index.int().sum(dim=1) != input_features_mask.int().sum(dim=1)).item(): + raise ValueError("Number of audio tokens does not match number of audio features") + + audio_features = audio_features[input_features_mask] + + inputs_embeds = inputs_embeds.masked_scatter( + special_audio_mask, + audio_features, + ) + return inputs_embeds + + def generate(self, *args, **kwargs) -> torch.LongTensor: + # This model is expected to have a lora adapter, which is only + # enabled when considering audio inputs. As such, we override generate + # to conditionally enable / disable the lora adapter based on whether + # or not any input features were provided. + + input_features = kwargs.pop("input_features", None) + if is_peft_available() and self._hf_peft_config_loaded: + if input_features is not None: + self.enable_adapters() + else: + self.disable_adapters() + return super().generate(*args, input_features=input_features, **kwargs) + + def save_pretrained(self, save_directory, *args, **kwargs): + # overwrite save_pretrained to first save the adapter if we have one + if is_peft_available() and self._hf_peft_config_loaded: + adapter_name = self._get_adapter_name() + self.peft_config[adapter_name].base_model_name_or_path = save_directory + super().save_pretrained(save_directory, *args, **kwargs) + # Then save the base model afterwards + prev_val = self._hf_peft_config_loaded + self._hf_peft_config_loaded = False + super().save_pretrained(save_directory, *args, **kwargs) + self._hf_peft_config_loaded = prev_val + + @staticmethod + def _fix_state_dict_key_on_save(key) -> tuple[str, bool]: + # save the model with the original weights format + return key.replace(".base_layer", ""), False + + def _fix_state_dict_keys_on_save(self, state_dict): + if is_peft_available() and self._hf_peft_config_loaded: + # state dict is only adapter, should keep the same + return state_dict + # rename back the base model state dict + return { + self._fix_state_dict_key_on_save(key)[0]: value for key, value in state_dict.items() if ".lora_" not in key + } + + def _get_adapter_name(self): + return list(self.peft_config.keys())[0] + + +__all__ = [ + "GraniteSpeechCTCEncoder", + "GraniteSpeechForConditionalGeneration", + "GraniteSpeechPreTrainedModel", +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/granite_speech/processing_granite_speech.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/granite_speech/processing_granite_speech.py new file mode 100644 index 0000000000000000000000000000000000000000..84515d173c471198b987081198aeeed9415252c9 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/granite_speech/processing_granite_speech.py @@ -0,0 +1,104 @@ +# coding=utf-8 +# Copyright 2025 The HuggingFace Inc.
team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Processor class for Granite Speech.""" + +from typing import Union + +from ...feature_extraction_utils import BatchFeature +from ...processing_utils import ProcessorMixin +from ...tokenization_utils import PreTokenizedInput, TextInput +from ...utils import is_torch_available, logging +from ...utils.import_utils import requires_backends + + +if is_torch_available(): + import torch + +logger = logging.get_logger(__name__) + + +class GraniteSpeechProcessor(ProcessorMixin): + attributes = ["audio_processor", "tokenizer"] + audio_processor_class = "GraniteSpeechFeatureExtractor" + tokenizer_class = "AutoTokenizer" + + def __init__( + self, + audio_processor, + tokenizer, + audio_token="<|audio|>", + chat_template=None, + ): + self.audio_token = tokenizer.audio_token if hasattr(tokenizer, "audio_token") else audio_token + super().__init__(audio_processor, tokenizer, chat_template=chat_template) + + def __call__( + self, + text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]], + audio: Union["torch.Tensor", list["torch.Tensor"]] = None, + device: str = "cpu", + images=None, + videos=None, + **kwargs, + ) -> BatchFeature: + requires_backends(self, ["torch"]) + + text = self._get_validated_text(text) + prompt_strings = text + + if audio is not None: + # NOTE - we intentionally avoid throwing for potentially misaligned + # text / audio inputs here because some inference engines will + # trigger the conditions due to the way they call multimodal + # processors, e.g., vLLM. + audio_inputs = self.audio_processor(audio, device=device) + + # TODO (@alex-jw-brooks); we should add a util to get_num_audio_tokens + # from feature lengths and call it here, rather than returning it + # from the feature extractor. + audio_embed_sizes = audio_inputs.pop("audio_embed_sizes") + + # Expand the audio placeholders to match the feature dims; this + # is similar to how many VLMs handle image tokens, e.g., llava next + prompt_strings = [] + num_replaced = 0 + for sample in text: + while self.audio_token in sample: + sample = sample.replace( + self.audio_token, + "<placeholder>" * audio_embed_sizes[num_replaced], + 1, + ) + num_replaced += 1 + prompt_strings.append(sample) + + prompt_strings = [sample.replace("<placeholder>", self.audio_token) for sample in prompt_strings] + else: + audio_inputs = {} + + if "padding" not in kwargs: + kwargs["padding"] = True + text_inputs = self.tokenizer(prompt_strings, **kwargs) + return BatchFeature(data={**text_inputs, **audio_inputs}) + + def _get_validated_text(self, text: Union[str, list]) -> list[str]: + if isinstance(text, str): + return [text] + elif isinstance(text, list) and isinstance(text[0], str): + return text + raise TypeError("Invalid text provided!
Text should be a string or list of strings.") + + +__all__ = ["GraniteSpeechProcessor"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/grounding_dino/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/grounding_dino/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..68f678c9cf9c9b239a79e1ded06958c43ecbaebe --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/grounding_dino/__init__.py @@ -0,0 +1,30 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_grounding_dino import * + from .image_processing_grounding_dino import * + from .image_processing_grounding_dino_fast import * + from .modeling_grounding_dino import * + from .processing_grounding_dino import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/grounding_dino/configuration_grounding_dino.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/grounding_dino/configuration_grounding_dino.py new file mode 100644 index 0000000000000000000000000000000000000000..838a897f70afcc099df80de3d9058173a36a0b3e --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -0,0 +1,309 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Grounding DINO model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging +from ...utils.backbone_utils import verify_backbone_config_arguments +from ..auto import CONFIG_MAPPING + + +logger = logging.get_logger(__name__) + + +class GroundingDinoConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`GroundingDinoModel`]. It is used to instantiate a + Grounding DINO model according to the specified arguments, defining the model architecture. 
Instantiating a + configuration with the defaults will yield a similar configuration to that of the Grounding DINO + [IDEA-Research/grounding-dino-tiny](https://huggingface.co/IDEA-Research/grounding-dino-tiny) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + backbone_config (`PretrainedConfig` or `dict`, *optional*, defaults to `ResNetConfig()`): + The configuration of the backbone model. + backbone (`str`, *optional*): + Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this + will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone` + is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. + use_pretrained_backbone (`bool`, *optional*, defaults to `False`): + Whether to use pretrained weights for the backbone. + use_timm_backbone (`bool`, *optional*, defaults to `False`): + Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers + library. + backbone_kwargs (`dict`, *optional*): + Keyword arguments to be passed to AutoBackbone when loading from a checkpoint + e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set. + text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `BertConfig`): + The config object or dictionary of the text backbone. + num_queries (`int`, *optional*, defaults to 900): + Number of object queries, i.e. detection slots. This is the maximal number of objects + [`GroundingDinoModel`] can detect in a single image. + encoder_layers (`int`, *optional*, defaults to 6): + Number of encoder layers. + encoder_ffn_dim (`int`, *optional*, defaults to 2048): + Dimension of the "intermediate" (often named feed-forward) layer in decoder. + encoder_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer encoder. + decoder_layers (`int`, *optional*, defaults to 6): + Number of decoder layers. + decoder_ffn_dim (`int`, *optional*, defaults to 2048): + Dimension of the "intermediate" (often named feed-forward) layer in decoder. + decoder_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer decoder. + is_encoder_decoder (`bool`, *optional*, defaults to `True`): + Whether the model is used as an encoder/decoder or not. + activation_function (`str` or `function`, *optional*, defaults to `"relu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + d_model (`int`, *optional*, defaults to 256): + Dimension of the layers. + dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + activation_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for activations inside the fully connected layer. + auxiliary_loss (`bool`, *optional*, defaults to `False`): + Whether auxiliary decoding losses (loss at each decoder layer) are to be used. 
+ position_embedding_type (`str`, *optional*, defaults to `"sine"`): + Type of position embeddings to be used on top of the image features. One of `"sine"` or `"learned"`. + num_feature_levels (`int`, *optional*, defaults to 4): + The number of input feature levels. + encoder_n_points (`int`, *optional*, defaults to 4): + The number of sampled keys in each feature level for each attention head in the encoder. + decoder_n_points (`int`, *optional*, defaults to 4): + The number of sampled keys in each feature level for each attention head in the decoder. + two_stage (`bool`, *optional*, defaults to `True`): + Whether to apply a two-stage deformable DETR, where the region proposals are also generated by a variant of + Grounding DINO, which are further fed into the decoder for iterative bounding box refinement. + class_cost (`float`, *optional*, defaults to 1.0): + Relative weight of the classification error in the Hungarian matching cost. + bbox_cost (`float`, *optional*, defaults to 5.0): + Relative weight of the L1 error of the bounding box coordinates in the Hungarian matching cost. + giou_cost (`float`, *optional*, defaults to 2.0): + Relative weight of the generalized IoU loss of the bounding box in the Hungarian matching cost. + bbox_loss_coefficient (`float`, *optional*, defaults to 5.0): + Relative weight of the L1 bounding box loss in the object detection loss. + giou_loss_coefficient (`float`, *optional*, defaults to 2.0): + Relative weight of the generalized IoU loss in the object detection loss. + focal_alpha (`float`, *optional*, defaults to 0.25): + Alpha parameter in the focal loss. + disable_custom_kernels (`bool`, *optional*, defaults to `False`): + Disable the use of custom CUDA and CPU kernels. This option is necessary for the ONNX export, as custom + kernels are not supported by PyTorch ONNX export. + max_text_len (`int`, *optional*, defaults to 256): + The maximum length of the text input. + text_enhancer_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the text enhancer. + fusion_droppath (`float`, *optional*, defaults to 0.1): + The droppath ratio for the fusion module. + fusion_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the fusion module. + embedding_init_target (`bool`, *optional*, defaults to `True`): + Whether to initialize the target with Embedding weights. + query_dim (`int`, *optional*, defaults to 4): + The dimension of the query vector. + decoder_bbox_embed_share (`bool`, *optional*, defaults to `True`): + Whether to share the bbox regression head for all decoder layers. + two_stage_bbox_embed_share (`bool`, *optional*, defaults to `False`): + Whether to share the bbox embedding between the two-stage bbox generator and the region proposal + generation. + positional_embedding_temperature (`float`, *optional*, defaults to 20): + The temperature for Sine Positional Embedding that is used together with vision backbone. + init_std (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the layer normalization layers. 
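        Both `backbone_config` and `text_config` can also be passed as plain dictionaries, which `__init__`
        converts through `CONFIG_MAPPING`; the values below are arbitrary illustrations, not defaults:

        ```python
        >>> from transformers import GroundingDinoConfig

        >>> config = GroundingDinoConfig(
        ...     backbone_config={"model_type": "swin", "out_indices": [2, 3, 4]},
        ...     text_config={"model_type": "bert"},
        ...     num_queries=300,
        ... )
        ```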
+ + Examples: + + ```python + >>> from transformers import GroundingDinoConfig, GroundingDinoModel + + >>> # Initializing a Grounding DINO IDEA-Research/grounding-dino-tiny style configuration + >>> configuration = GroundingDinoConfig() + + >>> # Initializing a model (with random weights) from the IDEA-Research/grounding-dino-tiny style configuration + >>> model = GroundingDinoModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "grounding-dino" + attribute_map = { + "hidden_size": "d_model", + "num_attention_heads": "encoder_attention_heads", + } + + def __init__( + self, + backbone_config=None, + backbone=None, + use_pretrained_backbone=False, + use_timm_backbone=False, + backbone_kwargs=None, + text_config=None, + num_queries=900, + encoder_layers=6, + encoder_ffn_dim=2048, + encoder_attention_heads=8, + decoder_layers=6, + decoder_ffn_dim=2048, + decoder_attention_heads=8, + is_encoder_decoder=True, + activation_function="relu", + d_model=256, + dropout=0.1, + attention_dropout=0.0, + activation_dropout=0.0, + auxiliary_loss=False, + position_embedding_type="sine", + num_feature_levels=4, + encoder_n_points=4, + decoder_n_points=4, + two_stage=True, + class_cost=1.0, + bbox_cost=5.0, + giou_cost=2.0, + bbox_loss_coefficient=5.0, + giou_loss_coefficient=2.0, + focal_alpha=0.25, + disable_custom_kernels=False, + # other parameters + max_text_len=256, + text_enhancer_dropout=0.0, + fusion_droppath=0.1, + fusion_dropout=0.0, + embedding_init_target=True, + query_dim=4, + decoder_bbox_embed_share=True, + two_stage_bbox_embed_share=False, + positional_embedding_temperature=20, + init_std=0.02, + layer_norm_eps=1e-5, + **kwargs, + ): + if backbone_config is None and backbone is None: + logger.info("`backbone_config` is `None`. Initializing the config with the default `Swin` backbone.") + backbone_config = CONFIG_MAPPING["swin"]( + window_size=7, + image_size=224, + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + out_indices=[2, 3, 4], + ) + elif isinstance(backbone_config, dict): + backbone_model_type = backbone_config.pop("model_type") + config_class = CONFIG_MAPPING[backbone_model_type] + backbone_config = config_class.from_dict(backbone_config) + + verify_backbone_config_arguments( + use_timm_backbone=use_timm_backbone, + use_pretrained_backbone=use_pretrained_backbone, + backbone=backbone, + backbone_config=backbone_config, + backbone_kwargs=backbone_kwargs, + ) + + if text_config is None: + text_config = {} + logger.info("text_config is None. 
Initializing the text config with default values (`BertConfig`).") + + self.backbone_config = backbone_config + self.backbone = backbone + self.use_pretrained_backbone = use_pretrained_backbone + self.use_timm_backbone = use_timm_backbone + self.backbone_kwargs = backbone_kwargs + self.num_queries = num_queries + self.d_model = d_model + self.encoder_ffn_dim = encoder_ffn_dim + self.encoder_layers = encoder_layers + self.encoder_attention_heads = encoder_attention_heads + self.decoder_ffn_dim = decoder_ffn_dim + self.decoder_layers = decoder_layers + self.decoder_attention_heads = decoder_attention_heads + self.dropout = dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.activation_function = activation_function + self.auxiliary_loss = auxiliary_loss + self.position_embedding_type = position_embedding_type + # deformable attributes + self.num_feature_levels = num_feature_levels + self.encoder_n_points = encoder_n_points + self.decoder_n_points = decoder_n_points + self.two_stage = two_stage + # Hungarian matcher + self.class_cost = class_cost + self.bbox_cost = bbox_cost + self.giou_cost = giou_cost + # Loss coefficients + self.bbox_loss_coefficient = bbox_loss_coefficient + self.giou_loss_coefficient = giou_loss_coefficient + self.focal_alpha = focal_alpha + self.disable_custom_kernels = disable_custom_kernels + # Text backbone + if isinstance(text_config, dict): + text_config["model_type"] = text_config.get("model_type", "bert") + text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) + elif text_config is None: + text_config = CONFIG_MAPPING["bert"]() + + self.text_config = text_config + self.max_text_len = max_text_len + + # Text Enhancer + self.text_enhancer_dropout = text_enhancer_dropout + # Fusion + self.fusion_droppath = fusion_droppath + self.fusion_dropout = fusion_dropout + # Others + self.embedding_init_target = embedding_init_target + self.query_dim = query_dim + self.decoder_bbox_embed_share = decoder_bbox_embed_share + self.two_stage_bbox_embed_share = two_stage_bbox_embed_share + if two_stage_bbox_embed_share and not decoder_bbox_embed_share: + raise ValueError("If two_stage_bbox_embed_share is True, decoder_bbox_embed_share must be True.") + self.positional_embedding_temperature = positional_embedding_temperature + self.init_std = init_std + self.layer_norm_eps = layer_norm_eps + super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + + @property + def num_attention_heads(self) -> int: + return self.encoder_attention_heads + + @property + def hidden_size(self) -> int: + return self.d_model + + @property + def sub_configs(self): + sub_configs = {} + backbone_config = getattr(self, "backbone_config", None) + text_config = getattr(self, "text_config", None) + if isinstance(backbone_config, PretrainedConfig): + sub_configs["backbone_config"] = type(backbone_config) + if isinstance(text_config, PretrainedConfig): + sub_configs["text_config"] = type(self.text_config) + return sub_configs + + +__all__ = ["GroundingDinoConfig"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/grounding_dino/image_processing_grounding_dino.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/grounding_dino/image_processing_grounding_dino.py new file mode 100644 index 0000000000000000000000000000000000000000..24e3c8b3f9873d59a596433492a6f8fef8b9054f --- /dev/null +++ 
b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/grounding_dino/image_processing_grounding_dino.py @@ -0,0 +1,1621 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Image processor class for Deformable DETR.""" + +import io +import pathlib +from collections import defaultdict +from collections.abc import Iterable +from typing import TYPE_CHECKING, Any, Callable, Optional, Union + +import numpy as np + +from ...feature_extraction_utils import BatchFeature +from ...image_processing_utils import BaseImageProcessor, get_size_dict +from ...image_transforms import ( + PaddingMode, + center_to_corners_format, + corners_to_center_format, + id_to_rgb, + pad, + rescale, + resize, + rgb_to_id, + to_channel_dimension_format, +) +from ...image_utils import ( + IMAGENET_DEFAULT_MEAN, + IMAGENET_DEFAULT_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + get_image_size, + infer_channel_dimension_format, + is_scaled_image, + make_flat_list_of_images, + to_numpy_array, + valid_images, + validate_annotations, + validate_kwargs, + validate_preprocess_arguments, +) +from ...utils import ( + ExplicitEnum, + TensorType, + is_flax_available, + is_jax_tensor, + is_scipy_available, + is_tf_available, + is_tf_tensor, + is_torch_available, + is_torch_tensor, + is_vision_available, + logging, +) + + +if is_torch_available(): + import torch + from torch import nn + + +if is_vision_available(): + import PIL + +if is_scipy_available(): + import scipy.special + import scipy.stats + +if TYPE_CHECKING: + from .modeling_grounding_dino import GroundingDinoObjectDetectionOutput + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +AnnotationType = dict[str, Union[int, str, list[dict]]] + + +class AnnotationFormat(ExplicitEnum): + COCO_DETECTION = "coco_detection" + COCO_PANOPTIC = "coco_panoptic" + + +SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) + + +# Copied from transformers.models.detr.image_processing_detr.get_size_with_aspect_ratio +def get_size_with_aspect_ratio(image_size, size, max_size=None) -> tuple[int, int]: + """ + Computes the output image size given the input image size and the desired output size. + + Args: + image_size (`tuple[int, int]`): + The input image size. + size (`int`): + The desired output size. + max_size (`int`, *optional*): + The maximum allowed output size. 
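    A couple of worked examples (illustrative values only, using the default `shortest_edge=800` /
    `longest_edge=1333` setting of the image processor defined later in this file):

    >>> get_size_with_aspect_ratio((480, 640), size=800, max_size=1333)
    (800, 1066)
    >>> # the scaled longest edge would exceed max_size here, so the shortest edge is capped instead
    >>> get_size_with_aspect_ratio((480, 1280), size=800, max_size=1333)
    (500, 1333)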
+ """ + height, width = image_size + raw_size = None + if max_size is not None: + min_original_size = float(min((height, width))) + max_original_size = float(max((height, width))) + if max_original_size / min_original_size * size > max_size: + raw_size = max_size * min_original_size / max_original_size + size = int(round(raw_size)) + + if (height <= width and height == size) or (width <= height and width == size): + oh, ow = height, width + elif width < height: + ow = size + if max_size is not None and raw_size is not None: + oh = int(raw_size * height / width) + else: + oh = int(size * height / width) + else: + oh = size + if max_size is not None and raw_size is not None: + ow = int(raw_size * width / height) + else: + ow = int(size * width / height) + + return (oh, ow) + + +# Copied from transformers.models.detr.image_processing_detr.get_resize_output_image_size +def get_resize_output_image_size( + input_image: np.ndarray, + size: Union[int, tuple[int, int], list[int]], + max_size: Optional[int] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, +) -> tuple[int, int]: + """ + Computes the output image size given the input image size and the desired output size. If the desired output size + is a tuple or list, the output image size is returned as is. If the desired output size is an integer, the output + image size is computed by keeping the aspect ratio of the input image size. + + Args: + input_image (`np.ndarray`): + The image to resize. + size (`int` or `tuple[int, int]` or `list[int]`): + The desired output size. + max_size (`int`, *optional*): + The maximum allowed output size. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred from the input image. + """ + image_size = get_image_size(input_image, input_data_format) + if isinstance(size, (list, tuple)): + return size + + return get_size_with_aspect_ratio(image_size, size, max_size) + + +# Copied from transformers.models.detr.image_processing_detr.get_image_size_for_max_height_width +def get_image_size_for_max_height_width( + input_image: np.ndarray, + max_height: int, + max_width: int, + input_data_format: Optional[Union[str, ChannelDimension]] = None, +) -> tuple[int, int]: + """ + Computes the output image size given the input image and the maximum allowed height and width. Keep aspect ratio. + Important, even if image_height < max_height and image_width < max_width, the image will be resized + to at least one of the edges be equal to max_height or max_width. + + For example: + - input_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50) + - input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400) + + Args: + input_image (`np.ndarray`): + The image to resize. + max_height (`int`): + The maximum allowed height. + max_width (`int`): + The maximum allowed width. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred from the input image. 
+ """ + image_size = get_image_size(input_image, input_data_format) + height, width = image_size + height_scale = max_height / height + width_scale = max_width / width + min_scale = min(height_scale, width_scale) + new_height = int(height * min_scale) + new_width = int(width * min_scale) + return new_height, new_width + + +# Copied from transformers.models.detr.image_processing_detr.get_numpy_to_framework_fn +def get_numpy_to_framework_fn(arr) -> Callable: + """ + Returns a function that converts a numpy array to the framework of the input array. + + Args: + arr (`np.ndarray`): The array to convert. + """ + if isinstance(arr, np.ndarray): + return np.array + if is_tf_available() and is_tf_tensor(arr): + import tensorflow as tf + + return tf.convert_to_tensor + if is_torch_available() and is_torch_tensor(arr): + import torch + + return torch.tensor + if is_flax_available() and is_jax_tensor(arr): + import jax.numpy as jnp + + return jnp.array + raise ValueError(f"Cannot convert arrays of type {type(arr)}") + + +# Copied from transformers.models.detr.image_processing_detr.safe_squeeze +def safe_squeeze(arr: np.ndarray, axis: Optional[int] = None) -> np.ndarray: + """ + Squeezes an array, but only if the axis specified has dim 1. + """ + if axis is None: + return arr.squeeze() + + try: + return arr.squeeze(axis=axis) + except ValueError: + return arr + + +# Copied from transformers.models.detr.image_processing_detr.normalize_annotation +def normalize_annotation(annotation: dict, image_size: tuple[int, int]) -> dict: + image_height, image_width = image_size + norm_annotation = {} + for key, value in annotation.items(): + if key == "boxes": + boxes = value + boxes = corners_to_center_format(boxes) + boxes /= np.asarray([image_width, image_height, image_width, image_height], dtype=np.float32) + norm_annotation[key] = boxes + else: + norm_annotation[key] = value + return norm_annotation + + +# Copied from transformers.models.detr.image_processing_detr.max_across_indices +def max_across_indices(values: Iterable[Any]) -> list[Any]: + """ + Return the maximum value across all indices of an iterable of values. + """ + return [max(values_i) for values_i in zip(*values)] + + +# Copied from transformers.models.detr.image_processing_detr.get_max_height_width +def get_max_height_width( + images: list[np.ndarray], input_data_format: Optional[Union[str, ChannelDimension]] = None +) -> list[int]: + """ + Get the maximum height and width across all images in a batch. + """ + if input_data_format is None: + input_data_format = infer_channel_dimension_format(images[0]) + + if input_data_format == ChannelDimension.FIRST: + _, max_height, max_width = max_across_indices([img.shape for img in images]) + elif input_data_format == ChannelDimension.LAST: + max_height, max_width, _ = max_across_indices([img.shape for img in images]) + else: + raise ValueError(f"Invalid channel dimension format: {input_data_format}") + return (max_height, max_width) + + +# Copied from transformers.models.detr.image_processing_detr.make_pixel_mask +def make_pixel_mask( + image: np.ndarray, output_size: tuple[int, int], input_data_format: Optional[Union[str, ChannelDimension]] = None +) -> np.ndarray: + """ + Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding. + + Args: + image (`np.ndarray`): + Image to make the pixel mask for. + output_size (`tuple[int, int]`): + Output size of the mask. 
+ """ + input_height, input_width = get_image_size(image, channel_dim=input_data_format) + mask = np.zeros(output_size, dtype=np.int64) + mask[:input_height, :input_width] = 1 + return mask + + +# Copied from transformers.models.detr.image_processing_detr.convert_coco_poly_to_mask +def convert_coco_poly_to_mask(segmentations, height: int, width: int) -> np.ndarray: + """ + Convert a COCO polygon annotation to a mask. + + Args: + segmentations (`list[list[float]]`): + List of polygons, each polygon represented by a list of x-y coordinates. + height (`int`): + Height of the mask. + width (`int`): + Width of the mask. + """ + try: + from pycocotools import mask as coco_mask + except ImportError: + raise ImportError("Pycocotools is not installed in your environment.") + + masks = [] + for polygons in segmentations: + rles = coco_mask.frPyObjects(polygons, height, width) + mask = coco_mask.decode(rles) + if len(mask.shape) < 3: + mask = mask[..., None] + mask = np.asarray(mask, dtype=np.uint8) + mask = np.any(mask, axis=2) + masks.append(mask) + if masks: + masks = np.stack(masks, axis=0) + else: + masks = np.zeros((0, height, width), dtype=np.uint8) + + return masks + + +# Copied from transformers.models.detr.image_processing_detr.prepare_coco_detection_annotation with DETR->GroundingDino +def prepare_coco_detection_annotation( + image, + target, + return_segmentation_masks: bool = False, + input_data_format: Optional[Union[ChannelDimension, str]] = None, +): + """ + Convert the target in COCO format into the format expected by GroundingDino. + """ + image_height, image_width = get_image_size(image, channel_dim=input_data_format) + + image_id = target["image_id"] + image_id = np.asarray([image_id], dtype=np.int64) + + # Get all COCO annotations for the given image. 
+ annotations = target["annotations"] + annotations = [obj for obj in annotations if "iscrowd" not in obj or obj["iscrowd"] == 0] + + classes = [obj["category_id"] for obj in annotations] + classes = np.asarray(classes, dtype=np.int64) + + # for conversion to coco api + area = np.asarray([obj["area"] for obj in annotations], dtype=np.float32) + iscrowd = np.asarray([obj.get("iscrowd", 0) for obj in annotations], dtype=np.int64) + + boxes = [obj["bbox"] for obj in annotations] + # guard against no boxes via resizing + boxes = np.asarray(boxes, dtype=np.float32).reshape(-1, 4) + boxes[:, 2:] += boxes[:, :2] + boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=image_width) + boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=image_height) + + keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) + + new_target = {} + new_target["image_id"] = image_id + new_target["class_labels"] = classes[keep] + new_target["boxes"] = boxes[keep] + new_target["area"] = area[keep] + new_target["iscrowd"] = iscrowd[keep] + new_target["orig_size"] = np.asarray([int(image_height), int(image_width)], dtype=np.int64) + + if annotations and "keypoints" in annotations[0]: + keypoints = [obj["keypoints"] for obj in annotations] + # Converting the filtered keypoints list to a numpy array + keypoints = np.asarray(keypoints, dtype=np.float32) + # Apply the keep mask here to filter the relevant annotations + keypoints = keypoints[keep] + num_keypoints = keypoints.shape[0] + keypoints = keypoints.reshape((-1, 3)) if num_keypoints else keypoints + new_target["keypoints"] = keypoints + + if return_segmentation_masks: + segmentation_masks = [obj["segmentation"] for obj in annotations] + masks = convert_coco_poly_to_mask(segmentation_masks, image_height, image_width) + new_target["masks"] = masks[keep] + + return new_target + + +# Copied from transformers.models.detr.image_processing_detr.masks_to_boxes +def masks_to_boxes(masks: np.ndarray) -> np.ndarray: + """ + Compute the bounding boxes around the provided panoptic segmentation masks. + + Args: + masks: masks in format `[number_masks, height, width]` where N is the number of masks + + Returns: + boxes: bounding boxes in format `[number_masks, 4]` in xyxy format + """ + if masks.size == 0: + return np.zeros((0, 4)) + + h, w = masks.shape[-2:] + y = np.arange(0, h, dtype=np.float32) + x = np.arange(0, w, dtype=np.float32) + # see https://github.com/pytorch/pytorch/issues/50276 + y, x = np.meshgrid(y, x, indexing="ij") + + x_mask = masks * np.expand_dims(x, axis=0) + x_max = x_mask.reshape(x_mask.shape[0], -1).max(-1) + x = np.ma.array(x_mask, mask=~(np.array(masks, dtype=bool))) + x_min = x.filled(fill_value=1e8) + x_min = x_min.reshape(x_min.shape[0], -1).min(-1) + + y_mask = masks * np.expand_dims(y, axis=0) + y_max = y_mask.reshape(x_mask.shape[0], -1).max(-1) + y = np.ma.array(y_mask, mask=~(np.array(masks, dtype=bool))) + y_min = y.filled(fill_value=1e8) + y_min = y_min.reshape(y_min.shape[0], -1).min(-1) + + return np.stack([x_min, y_min, x_max, y_max], 1) + + +# Copied from transformers.models.detr.image_processing_detr.prepare_coco_panoptic_annotation with DETR->GroundingDino +def prepare_coco_panoptic_annotation( + image: np.ndarray, + target: dict, + masks_path: Union[str, pathlib.Path], + return_masks: bool = True, + input_data_format: Union[ChannelDimension, str] = None, +) -> dict: + """ + Prepare a coco panoptic annotation for GroundingDino. 
+ """ + image_height, image_width = get_image_size(image, channel_dim=input_data_format) + annotation_path = pathlib.Path(masks_path) / target["file_name"] + + new_target = {} + new_target["image_id"] = np.asarray([target["image_id"] if "image_id" in target else target["id"]], dtype=np.int64) + new_target["size"] = np.asarray([image_height, image_width], dtype=np.int64) + new_target["orig_size"] = np.asarray([image_height, image_width], dtype=np.int64) + + if "segments_info" in target: + masks = np.asarray(PIL.Image.open(annotation_path), dtype=np.uint32) + masks = rgb_to_id(masks) + + ids = np.array([segment_info["id"] for segment_info in target["segments_info"]]) + masks = masks == ids[:, None, None] + masks = masks.astype(np.uint8) + if return_masks: + new_target["masks"] = masks + new_target["boxes"] = masks_to_boxes(masks) + new_target["class_labels"] = np.array( + [segment_info["category_id"] for segment_info in target["segments_info"]], dtype=np.int64 + ) + new_target["iscrowd"] = np.asarray( + [segment_info["iscrowd"] for segment_info in target["segments_info"]], dtype=np.int64 + ) + new_target["area"] = np.asarray( + [segment_info["area"] for segment_info in target["segments_info"]], dtype=np.float32 + ) + + return new_target + + +# Copied from transformers.models.detr.image_processing_detr.get_segmentation_image +def get_segmentation_image( + masks: np.ndarray, input_size: tuple, target_size: tuple, stuff_equiv_classes, deduplicate=False +): + h, w = input_size + final_h, final_w = target_size + + m_id = scipy.special.softmax(masks.transpose(0, 1), -1) + + if m_id.shape[-1] == 0: + # We didn't detect any mask :( + m_id = np.zeros((h, w), dtype=np.int64) + else: + m_id = m_id.argmax(-1).reshape(h, w) + + if deduplicate: + # Merge the masks corresponding to the same stuff class + for equiv in stuff_equiv_classes.values(): + for eq_id in equiv: + m_id[m_id == eq_id] = equiv[0] + + seg_img = id_to_rgb(m_id) + seg_img = resize(seg_img, (final_w, final_h), resample=PILImageResampling.NEAREST) + return seg_img + + +# Copied from transformers.models.detr.image_processing_detr.get_mask_area +def get_mask_area(seg_img: np.ndarray, target_size: tuple[int, int], n_classes: int) -> np.ndarray: + final_h, final_w = target_size + np_seg_img = seg_img.astype(np.uint8) + np_seg_img = np_seg_img.reshape(final_h, final_w, 3) + m_id = rgb_to_id(np_seg_img) + area = [(m_id == i).sum() for i in range(n_classes)] + return area + + +# Copied from transformers.models.detr.image_processing_detr.score_labels_from_class_probabilities +def score_labels_from_class_probabilities(logits: np.ndarray) -> tuple[np.ndarray, np.ndarray]: + probs = scipy.special.softmax(logits, axis=-1) + labels = probs.argmax(-1, keepdims=True) + scores = np.take_along_axis(probs, labels, axis=-1) + scores, labels = scores.squeeze(-1), labels.squeeze(-1) + return scores, labels + + +# Copied from transformers.models.detr.image_processing_detr.post_process_panoptic_sample +def post_process_panoptic_sample( + out_logits: np.ndarray, + masks: np.ndarray, + boxes: np.ndarray, + processed_size: tuple[int, int], + target_size: tuple[int, int], + is_thing_map: dict, + threshold=0.85, +) -> dict: + """ + Converts the output of [`DetrForSegmentation`] into panoptic segmentation predictions for a single sample. + + Args: + out_logits (`torch.Tensor`): + The logits for this sample. + masks (`torch.Tensor`): + The predicted segmentation masks for this sample. + boxes (`torch.Tensor`): + The predicted bounding boxes for this sample. 
The boxes are in the normalized format `(center_x, center_y, + width, height)` and values between `[0, 1]`, relative to the size of the image (disregarding padding). + processed_size (`tuple[int, int]`): + The processed size of the image `(height, width)`, as returned by the preprocessing step i.e. the size + after data augmentation but before batching. + target_size (`tuple[int, int]`): + The target size of the image, `(height, width)` corresponding to the requested final size of the + prediction. + is_thing_map (`Dict`): + A dictionary mapping class indices to a boolean value indicating whether the class is a thing or not. + threshold (`float`, *optional*, defaults to 0.85): + The threshold used to binarize the segmentation masks. + """ + # we filter empty queries and detections below threshold + scores, labels = score_labels_from_class_probabilities(out_logits) + keep = (labels != out_logits.shape[-1] - 1) & (scores > threshold) + + cur_scores = scores[keep] + cur_classes = labels[keep] + cur_boxes = center_to_corners_format(boxes[keep]) + + if len(cur_boxes) != len(cur_classes): + raise ValueError("Not as many boxes as there are classes") + + cur_masks = masks[keep] + cur_masks = resize(cur_masks[:, None], processed_size, resample=PILImageResampling.BILINEAR) + cur_masks = safe_squeeze(cur_masks, 1) + b, h, w = cur_masks.shape + + # It may be that we have several predicted masks for the same stuff class. + # In the following, we track the list of masks ids for each stuff class (they are merged later on) + cur_masks = cur_masks.reshape(b, -1) + stuff_equiv_classes = defaultdict(list) + for k, label in enumerate(cur_classes): + if not is_thing_map[label]: + stuff_equiv_classes[label].append(k) + + seg_img = get_segmentation_image(cur_masks, processed_size, target_size, stuff_equiv_classes, deduplicate=True) + area = get_mask_area(cur_masks, processed_size, n_classes=len(cur_scores)) + + # We filter out any mask that is too small + if cur_classes.size > 0: + # We now filter empty masks as long as we find some + filtered_small = np.array([a <= 4 for a in area], dtype=bool) + while filtered_small.any(): + cur_masks = cur_masks[~filtered_small] + cur_scores = cur_scores[~filtered_small] + cur_classes = cur_classes[~filtered_small] + seg_img = get_segmentation_image(cur_masks, (h, w), target_size, stuff_equiv_classes, deduplicate=True) + area = get_mask_area(seg_img, target_size, n_classes=len(cur_scores)) + filtered_small = np.array([a <= 4 for a in area], dtype=bool) + else: + cur_classes = np.ones((1, 1), dtype=np.int64) + + segments_info = [ + {"id": i, "isthing": is_thing_map[cat], "category_id": int(cat), "area": a} + for i, (cat, a) in enumerate(zip(cur_classes, area)) + ] + del cur_classes + + with io.BytesIO() as out: + PIL.Image.fromarray(seg_img).save(out, format="PNG") + predictions = {"png_string": out.getvalue(), "segments_info": segments_info} + + return predictions + + +# Copied from transformers.models.detr.image_processing_detr.resize_annotation +def resize_annotation( + annotation: dict[str, Any], + orig_size: tuple[int, int], + target_size: tuple[int, int], + threshold: float = 0.5, + resample: PILImageResampling = PILImageResampling.NEAREST, +): + """ + Resizes an annotation to a target size. + + Args: + annotation (`dict[str, Any]`): + The annotation dictionary. + orig_size (`tuple[int, int]`): + The original size of the input image. + target_size (`tuple[int, int]`): + The target size of the image, as returned by the preprocessing `resize` step.
+ threshold (`float`, *optional*, defaults to 0.5): + The threshold used to binarize the segmentation masks. + resample (`PILImageResampling`, defaults to `PILImageResampling.NEAREST`): + The resampling filter to use when resizing the masks. + """ + ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(target_size, orig_size)) + ratio_height, ratio_width = ratios + + new_annotation = {} + new_annotation["size"] = target_size + + for key, value in annotation.items(): + if key == "boxes": + boxes = value + scaled_boxes = boxes * np.asarray([ratio_width, ratio_height, ratio_width, ratio_height], dtype=np.float32) + new_annotation["boxes"] = scaled_boxes + elif key == "area": + area = value + scaled_area = area * (ratio_width * ratio_height) + new_annotation["area"] = scaled_area + elif key == "masks": + masks = value[:, None] + masks = np.array([resize(mask, target_size, resample=resample) for mask in masks]) + masks = masks.astype(np.float32) + masks = masks[:, 0] > threshold + new_annotation["masks"] = masks + elif key == "size": + new_annotation["size"] = target_size + else: + new_annotation[key] = value + + return new_annotation + + +# Copied from transformers.models.detr.image_processing_detr.binary_mask_to_rle +def binary_mask_to_rle(mask): + """ + Converts given binary mask of shape `(height, width)` to the run-length encoding (RLE) format. + + Args: + mask (`torch.Tensor` or `numpy.array`): + A binary mask tensor of shape `(height, width)` where 0 denotes background and 1 denotes the target + segment_id or class_id. + Returns: + `List`: Run-length encoded list of the binary mask. Refer to COCO API for more information about the RLE + format. + """ + if is_torch_tensor(mask): + mask = mask.numpy() + + pixels = mask.flatten() + pixels = np.concatenate([[0], pixels, [0]]) + runs = np.where(pixels[1:] != pixels[:-1])[0] + 1 + runs[1::2] -= runs[::2] + return list(runs) + + +# Copied from transformers.models.detr.image_processing_detr.convert_segmentation_to_rle +def convert_segmentation_to_rle(segmentation): + """ + Converts given segmentation map of shape `(height, width)` to the run-length encoding (RLE) format. + + Args: + segmentation (`torch.Tensor` or `numpy.array`): + A segmentation map of shape `(height, width)` where each value denotes a segment or class id. + Returns: + `list[List]`: A list of lists, where each list is the run-length encoding of a segment / class id. + """ + segment_ids = torch.unique(segmentation) + + run_length_encodings = [] + for idx in segment_ids: + mask = torch.where(segmentation == idx, 1, 0) + rle = binary_mask_to_rle(mask) + run_length_encodings.append(rle) + + return run_length_encodings + + +# Copied from transformers.models.detr.image_processing_detr.remove_low_and_no_objects +def remove_low_and_no_objects(masks, scores, labels, object_mask_threshold, num_labels): + """ + Binarize the given masks using `object_mask_threshold`, it returns the associated values of `masks`, `scores` and + `labels`. + + Args: + masks (`torch.Tensor`): + A tensor of shape `(num_queries, height, width)`. + scores (`torch.Tensor`): + A tensor of shape `(num_queries)`. + labels (`torch.Tensor`): + A tensor of shape `(num_queries)`. + object_mask_threshold (`float`): + A number between 0 and 1 used to binarize the masks. + Raises: + `ValueError`: Raised when the first dimension doesn't match in all input tensors. + Returns: + `tuple[`torch.Tensor`, `torch.Tensor`, `torch.Tensor`]`: The `masks`, `scores` and `labels` without the region + < `object_mask_threshold`. 
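    Illustrative call with made-up tensors (`num_labels` marks the "no object" class, so only the first query
    survives both filters here):

    >>> import torch
    >>> masks = torch.rand(3, 4, 4)
    >>> scores = torch.tensor([0.9, 0.2, 0.7])
    >>> labels = torch.tensor([1, 5, 5])
    >>> _, kept_scores, kept_labels = remove_low_and_no_objects(masks, scores, labels, object_mask_threshold=0.5, num_labels=5)
    >>> kept_labels
    tensor([1])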
+ """ + if not (masks.shape[0] == scores.shape[0] == labels.shape[0]): + raise ValueError("mask, scores and labels must have the same shape!") + + to_keep = labels.ne(num_labels) & (scores > object_mask_threshold) + + return masks[to_keep], scores[to_keep], labels[to_keep] + + +# Copied from transformers.models.detr.image_processing_detr.check_segment_validity +def check_segment_validity(mask_labels, mask_probs, k, mask_threshold=0.5, overlap_mask_area_threshold=0.8): + # Get the mask associated with the k class + mask_k = mask_labels == k + mask_k_area = mask_k.sum() + + # Compute the area of all the stuff in query k + original_area = (mask_probs[k] >= mask_threshold).sum() + mask_exists = mask_k_area > 0 and original_area > 0 + + # Eliminate disconnected tiny segments + if mask_exists: + area_ratio = mask_k_area / original_area + if not area_ratio.item() > overlap_mask_area_threshold: + mask_exists = False + + return mask_exists, mask_k + + +# Copied from transformers.models.detr.image_processing_detr.compute_segments +def compute_segments( + mask_probs, + pred_scores, + pred_labels, + mask_threshold: float = 0.5, + overlap_mask_area_threshold: float = 0.8, + label_ids_to_fuse: Optional[set[int]] = None, + target_size: Optional[tuple[int, int]] = None, +): + height = mask_probs.shape[1] if target_size is None else target_size[0] + width = mask_probs.shape[2] if target_size is None else target_size[1] + + segmentation = torch.zeros((height, width), dtype=torch.int32, device=mask_probs.device) + segments: list[dict] = [] + + if target_size is not None: + mask_probs = nn.functional.interpolate( + mask_probs.unsqueeze(0), size=target_size, mode="bilinear", align_corners=False + )[0] + + current_segment_id = 0 + + # Weigh each mask by its prediction score + mask_probs *= pred_scores.view(-1, 1, 1) + mask_labels = mask_probs.argmax(0) # [height, width] + + # Keep track of instances of each class + stuff_memory_list: dict[str, int] = {} + for k in range(pred_labels.shape[0]): + pred_class = pred_labels[k].item() + should_fuse = pred_class in label_ids_to_fuse + + # Check if mask exists and large enough to be a segment + mask_exists, mask_k = check_segment_validity( + mask_labels, mask_probs, k, mask_threshold, overlap_mask_area_threshold + ) + + if mask_exists: + if pred_class in stuff_memory_list: + current_segment_id = stuff_memory_list[pred_class] + else: + current_segment_id += 1 + + # Add current object segment to final segmentation map + segmentation[mask_k] = current_segment_id + segment_score = round(pred_scores[k].item(), 6) + segments.append( + { + "id": current_segment_id, + "label_id": pred_class, + "was_fused": should_fuse, + "score": segment_score, + } + ) + if should_fuse: + stuff_memory_list[pred_class] = current_segment_id + + return segmentation, segments + + +# Copied from transformers.models.owlvit.image_processing_owlvit._scale_boxes +def _scale_boxes(boxes, target_sizes): + """ + Scale batch of bounding boxes to the target sizes. + + Args: + boxes (`torch.Tensor` of shape `(batch_size, num_boxes, 4)`): + Bounding boxes to scale. Each box is expected to be in (x1, y1, x2, y2) format. + target_sizes (`list[tuple[int, int]]` or `torch.Tensor` of shape `(batch_size, 2)`): + Target sizes to scale the boxes to. Each target size is expected to be in (height, width) format. + + Returns: + `torch.Tensor` of shape `(batch_size, num_boxes, 4)`: Scaled bounding boxes. 
+ """ + + if isinstance(target_sizes, (list, tuple)): + image_height = torch.tensor([i[0] for i in target_sizes]) + image_width = torch.tensor([i[1] for i in target_sizes]) + elif isinstance(target_sizes, torch.Tensor): + image_height, image_width = target_sizes.unbind(1) + else: + raise TypeError("`target_sizes` must be a list, tuple or torch.Tensor") + + scale_factor = torch.stack([image_width, image_height, image_width, image_height], dim=1) + scale_factor = scale_factor.unsqueeze(1).to(boxes.device) + boxes = boxes * scale_factor + return boxes + + +class GroundingDinoImageProcessor(BaseImageProcessor): + r""" + Constructs a Grounding DINO image processor. + + Args: + format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): + Data format of the annotations. One of "coco_detection" or "coco_panoptic". + do_resize (`bool`, *optional*, defaults to `True`): + Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be + overridden by the `do_resize` parameter in the `preprocess` method. + size (`dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`): + Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter + in the `preprocess` method. Available options are: + - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. + Do NOT keep the aspect ratio. + - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting + the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge + less or equal to `longest_edge`. + - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the + aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to + `max_width`. + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`): + Resampling filter to use if resizing the image. + do_rescale (`bool`, *optional*, defaults to `True`): + Controls whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the + `do_rescale` parameter in the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the + `preprocess` method. Controls whether to normalize the image. Can be overridden by the `do_normalize` + parameter in the `preprocess` method. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` + method. + image_mean (`float` or `list[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`): + Mean values to use when normalizing the image. Can be a single value or a list of values, one for each + channel. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `list[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`): + Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one + for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method. + do_convert_annotations (`bool`, *optional*, defaults to `True`): + Controls whether to convert the annotations to the format expected by the DETR model. 
Converts the + bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. + Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. + do_pad (`bool`, *optional*, defaults to `True`): + Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess` + method. If `True`, padding will be applied to the bottom and right of the image with zeros. + If `pad_size` is provided, the image will be padded to the specified dimensions. + Otherwise, the image will be padded to the maximum height and width of the batch. + pad_size (`dict[str, int]`, *optional*): + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size + provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest + height and width in the batch. + """ + + model_input_names = ["pixel_values", "pixel_mask"] + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.__init__ + def __init__( + self, + format: Union[str, AnnotationFormat] = AnnotationFormat.COCO_DETECTION, + do_resize: bool = True, + size: Optional[dict[str, int]] = None, + resample: PILImageResampling = PILImageResampling.BILINEAR, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + do_convert_annotations: Optional[bool] = None, + do_pad: bool = True, + pad_size: Optional[dict[str, int]] = None, + **kwargs, + ) -> None: + if "pad_and_return_pixel_mask" in kwargs: + do_pad = kwargs.pop("pad_and_return_pixel_mask") + + if "max_size" in kwargs: + logger.warning_once( + "The `max_size` parameter is deprecated and will be removed in v4.26. " + "Please specify in `size['longest_edge'] instead`.", + ) + max_size = kwargs.pop("max_size") + else: + max_size = None if size is None else 1333 + + size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333} + size = get_size_dict(size, max_size=max_size, default_to_square=False) + + # Backwards compatibility + if do_convert_annotations is None: + do_convert_annotations = do_normalize + + super().__init__(**kwargs) + self.format = format + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.do_convert_annotations = do_convert_annotations + self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN + self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD + self.do_pad = do_pad + self.pad_size = pad_size + self._valid_processor_keys = [ + "images", + "annotations", + "return_segmentation_masks", + "masks_path", + "do_resize", + "size", + "resample", + "do_rescale", + "rescale_factor", + "do_normalize", + "do_convert_annotations", + "image_mean", + "image_std", + "do_pad", + "pad_size", + "format", + "return_tensors", + "data_format", + "input_data_format", + ] + + @classmethod + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.from_dict with Detr->GroundingDino + def from_dict(cls, image_processor_dict: dict[str, Any], **kwargs): + """ + Overrides the `from_dict` method from the base class to make sure parameters are updated if image processor is + created using from_dict and kwargs e.g. 
`GroundingDinoImageProcessor.from_pretrained(checkpoint, size=600, + max_size=800)` + """ + image_processor_dict = image_processor_dict.copy() + if "max_size" in kwargs: + image_processor_dict["max_size"] = kwargs.pop("max_size") + if "pad_and_return_pixel_mask" in kwargs: + image_processor_dict["pad_and_return_pixel_mask"] = kwargs.pop("pad_and_return_pixel_mask") + return super().from_dict(image_processor_dict, **kwargs) + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare_annotation with DETR->GroundingDino + def prepare_annotation( + self, + image: np.ndarray, + target: dict, + format: Optional[AnnotationFormat] = None, + return_segmentation_masks: Optional[bool] = None, + masks_path: Optional[Union[str, pathlib.Path]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> dict: + """ + Prepare an annotation for feeding into GroundingDino model. + """ + format = format if format is not None else self.format + + if format == AnnotationFormat.COCO_DETECTION: + return_segmentation_masks = False if return_segmentation_masks is None else return_segmentation_masks + target = prepare_coco_detection_annotation( + image, target, return_segmentation_masks, input_data_format=input_data_format + ) + elif format == AnnotationFormat.COCO_PANOPTIC: + return_segmentation_masks = True if return_segmentation_masks is None else return_segmentation_masks + target = prepare_coco_panoptic_annotation( + image, + target, + masks_path=masks_path, + return_masks=return_segmentation_masks, + input_data_format=input_data_format, + ) + else: + raise ValueError(f"Format {format} is not supported.") + return target + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.resize + def resize( + self, + image: np.ndarray, + size: dict[str, int], + resample: PILImageResampling = PILImageResampling.BILINEAR, + data_format: Optional[ChannelDimension] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize the image to the given size. Size can be `min_size` (scalar) or `(height, width)` tuple. If size is an + int, smaller edge of the image will be matched to this number. + + Args: + image (`np.ndarray`): + Image to resize. + size (`dict[str, int]`): + Size of the image's `(height, width)` dimensions after resizing. Available options are: + - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. + Do NOT keep the aspect ratio. + - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting + the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge + less or equal to `longest_edge`. + - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the + aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to + `max_width`. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): + Resampling filter to use if resizing the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. 
+ """ + if "max_size" in kwargs: + logger.warning_once( + "The `max_size` parameter is deprecated and will be removed in v4.26. " + "Please specify in `size['longest_edge'] instead`.", + ) + max_size = kwargs.pop("max_size") + else: + max_size = None + size = get_size_dict(size, max_size=max_size, default_to_square=False) + if "shortest_edge" in size and "longest_edge" in size: + new_size = get_resize_output_image_size( + image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format + ) + elif "max_height" in size and "max_width" in size: + new_size = get_image_size_for_max_height_width( + image, size["max_height"], size["max_width"], input_data_format=input_data_format + ) + elif "height" in size and "width" in size: + new_size = (size["height"], size["width"]) + else: + raise ValueError( + "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got" + f" {size.keys()}." + ) + image = resize( + image, + size=new_size, + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + **kwargs, + ) + return image + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.resize_annotation + def resize_annotation( + self, + annotation, + orig_size, + size, + resample: PILImageResampling = PILImageResampling.NEAREST, + ) -> dict: + """ + Resize the annotation to match the resized image. If size is an int, smaller edge of the mask will be matched + to this number. + """ + return resize_annotation(annotation, orig_size=orig_size, target_size=size, resample=resample) + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.rescale + def rescale( + self, + image: np.ndarray, + rescale_factor: float, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> np.ndarray: + """ + Rescale the image by the given factor. image = image * rescale_factor. + + Args: + image (`np.ndarray`): + Image to rescale. + rescale_factor (`float`): + The value to use for rescaling. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + input_data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the input image. If unset, is inferred from the input image. Can be + one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + """ + return rescale(image, rescale_factor, data_format=data_format, input_data_format=input_data_format) + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.normalize_annotation + def normalize_annotation(self, annotation: dict, image_size: tuple[int, int]) -> dict: + """ + Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to + `[center_x, center_y, width, height]` format and from absolute to relative pixel values. 
+ """ + return normalize_annotation(annotation, image_size=image_size) + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._update_annotation_for_padded_image + def _update_annotation_for_padded_image( + self, + annotation: dict, + input_image_size: tuple[int, int], + output_image_size: tuple[int, int], + padding, + update_bboxes, + ) -> dict: + """ + Update the annotation for a padded image. + """ + new_annotation = {} + new_annotation["size"] = output_image_size + + for key, value in annotation.items(): + if key == "masks": + masks = value + masks = pad( + masks, + padding, + mode=PaddingMode.CONSTANT, + constant_values=0, + input_data_format=ChannelDimension.FIRST, + ) + masks = safe_squeeze(masks, 1) + new_annotation["masks"] = masks + elif key == "boxes" and update_bboxes: + boxes = value + boxes *= np.asarray( + [ + input_image_size[1] / output_image_size[1], + input_image_size[0] / output_image_size[0], + input_image_size[1] / output_image_size[1], + input_image_size[0] / output_image_size[0], + ] + ) + new_annotation["boxes"] = boxes + elif key == "size": + new_annotation["size"] = output_image_size + else: + new_annotation[key] = value + return new_annotation + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image + def _pad_image( + self, + image: np.ndarray, + output_size: tuple[int, int], + annotation: Optional[dict[str, Any]] = None, + constant_values: Union[float, Iterable[float]] = 0, + data_format: Optional[ChannelDimension] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + update_bboxes: bool = True, + ) -> np.ndarray: + """ + Pad an image with zeros to the given size. + """ + input_height, input_width = get_image_size(image, channel_dim=input_data_format) + output_height, output_width = output_size + + pad_bottom = output_height - input_height + pad_right = output_width - input_width + padding = ((0, pad_bottom), (0, pad_right)) + padded_image = pad( + image, + padding, + mode=PaddingMode.CONSTANT, + constant_values=constant_values, + data_format=data_format, + input_data_format=input_data_format, + ) + if annotation is not None: + annotation = self._update_annotation_for_padded_image( + annotation, (input_height, input_width), (output_height, output_width), padding, update_bboxes + ) + return padded_image, annotation + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad + def pad( + self, + images: list[np.ndarray], + annotations: Optional[Union[AnnotationType, list[AnnotationType]]] = None, + constant_values: Union[float, Iterable[float]] = 0, + return_pixel_mask: bool = True, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Optional[ChannelDimension] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + update_bboxes: bool = True, + pad_size: Optional[dict[str, int]] = None, + ) -> BatchFeature: + """ + Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width + in the batch and optionally returns their corresponding pixel mask. + + Args: + images (list[`np.ndarray`]): + Images to pad. + annotations (`AnnotationType` or `list[AnnotationType]`, *optional*): + Annotations to transform according to the padding that is applied to the images. + constant_values (`float` or `Iterable[float]`, *optional*): + The value to use for the padding if `mode` is `"constant"`. 
+ return_pixel_mask (`bool`, *optional*, defaults to `True`): + Whether to return a pixel mask. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. + update_bboxes (`bool`, *optional*, defaults to `True`): + Whether to update the bounding boxes in the annotations to match the padded images. If the + bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)` + format, the bounding boxes will not be updated. + pad_size (`dict[str, int]`, *optional*): + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size + provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest + height and width in the batch. + """ + pad_size = pad_size if pad_size is not None else self.pad_size + if pad_size is not None: + padded_size = (pad_size["height"], pad_size["width"]) + else: + padded_size = get_max_height_width(images, input_data_format=input_data_format) + + annotation_list = annotations if annotations is not None else [None] * len(images) + padded_images = [] + padded_annotations = [] + for image, annotation in zip(images, annotation_list): + padded_image, padded_annotation = self._pad_image( + image, + padded_size, + annotation, + constant_values=constant_values, + data_format=data_format, + input_data_format=input_data_format, + update_bboxes=update_bboxes, + ) + padded_images.append(padded_image) + padded_annotations.append(padded_annotation) + + data = {"pixel_values": padded_images} + + if return_pixel_mask: + masks = [ + make_pixel_mask(image=image, output_size=padded_size, input_data_format=input_data_format) + for image in images + ] + data["pixel_mask"] = masks + + encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors) + + if annotations is not None: + encoded_inputs["labels"] = [ + BatchFeature(annotation, tensor_type=return_tensors) for annotation in padded_annotations + ] + + return encoded_inputs + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.preprocess + def preprocess( + self, + images: ImageInput, + annotations: Optional[Union[AnnotationType, list[AnnotationType]]] = None, + return_segmentation_masks: Optional[bool] = None, + masks_path: Optional[Union[str, pathlib.Path]] = None, + do_resize: Optional[bool] = None, + size: Optional[dict[str, int]] = None, + resample=None, # PILImageResampling + do_rescale: Optional[bool] = None, + rescale_factor: Optional[Union[int, float]] = None, + do_normalize: Optional[bool] = None, + do_convert_annotations: Optional[bool] = None, + image_mean: Optional[Union[float, list[float]]] = None, + image_std: Optional[Union[float, list[float]]] = None, + do_pad: Optional[bool] = None, + format: Optional[Union[str, AnnotationFormat]] = None, + return_tensors: Optional[Union[TensorType, str]] = None, + 
data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + pad_size: Optional[dict[str, int]] = None, + **kwargs, + ) -> BatchFeature: + """ + Preprocess an image or a batch of images so that it can be used by the model. + + Args: + images (`ImageInput`): + Image or batch of images to preprocess. Expects a single or batch of images with pixel values ranging + from 0 to 255. If passing in images with pixel values between 0 and 1, set `do_rescale=False`. + annotations (`AnnotationType` or `list[AnnotationType]`, *optional*): + List of annotations associated with the image or batch of images. If annotation is for object + detection, the annotations should be a dictionary with the following keys: + - "image_id" (`int`): The image id. + - "annotations" (`list[Dict]`): List of annotations for an image. Each annotation should be a + dictionary. An image can have no annotations, in which case the list should be empty. + If annotation is for segmentation, the annotations should be a dictionary with the following keys: + - "image_id" (`int`): The image id. + - "segments_info" (`list[Dict]`): List of segments for an image. Each segment should be a dictionary. + An image can have no segments, in which case the list should be empty. + - "file_name" (`str`): The file name of the image. + return_segmentation_masks (`bool`, *optional*, defaults to self.return_segmentation_masks): + Whether to return segmentation masks. + masks_path (`str` or `pathlib.Path`, *optional*): + Path to the directory containing the segmentation masks. + do_resize (`bool`, *optional*, defaults to self.do_resize): + Whether to resize the image. + size (`dict[str, int]`, *optional*, defaults to self.size): + Size of the image's `(height, width)` dimensions after resizing. Available options are: + - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. + Do NOT keep the aspect ratio. + - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting + the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge + less or equal to `longest_edge`. + - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the + aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to + `max_width`. + resample (`PILImageResampling`, *optional*, defaults to self.resample): + Resampling filter to use when resizing the image. + do_rescale (`bool`, *optional*, defaults to self.do_rescale): + Whether to rescale the image. + rescale_factor (`float`, *optional*, defaults to self.rescale_factor): + Rescale factor to use when rescaling the image. + do_normalize (`bool`, *optional*, defaults to self.do_normalize): + Whether to normalize the image. + do_convert_annotations (`bool`, *optional*, defaults to self.do_convert_annotations): + Whether to convert the annotations to the format expected by the model. Converts the bounding + boxes from the format `(top_left_x, top_left_y, width, height)` to `(center_x, center_y, width, height)` + and in relative coordinates. + image_mean (`float` or `list[float]`, *optional*, defaults to self.image_mean): + Mean to use when normalizing the image. + image_std (`float` or `list[float]`, *optional*, defaults to self.image_std): + Standard deviation to use when normalizing the image. 
+ do_pad (`bool`, *optional*, defaults to self.do_pad): + Whether to pad the image. If `True`, padding will be applied to the bottom and right of + the image with zeros. If `pad_size` is provided, the image will be padded to the specified + dimensions. Otherwise, the image will be padded to the maximum height and width of the batch. + format (`str` or `AnnotationFormat`, *optional*, defaults to self.format): + Format of the annotations. + return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors): + Type of tensors to return. If `None`, will return the list of images. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + pad_size (`dict[str, int]`, *optional*): + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size + provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest + height and width in the batch. + """ + if "pad_and_return_pixel_mask" in kwargs: + logger.warning_once( + "The `pad_and_return_pixel_mask` argument is deprecated and will be removed in a future version, " + "use `do_pad` instead." + ) + do_pad = kwargs.pop("pad_and_return_pixel_mask") + + if "max_size" in kwargs: + logger.warning_once( + "The `max_size` argument is deprecated and will be removed in a future version, use" + " `size['longest_edge']` instead." + ) + size = kwargs.pop("max_size") + + do_resize = self.do_resize if do_resize is None else do_resize + size = self.size if size is None else size + size = get_size_dict(size=size, default_to_square=False) + resample = self.resample if resample is None else resample + do_rescale = self.do_rescale if do_rescale is None else do_rescale + rescale_factor = self.rescale_factor if rescale_factor is None else rescale_factor + do_normalize = self.do_normalize if do_normalize is None else do_normalize + image_mean = self.image_mean if image_mean is None else image_mean + image_std = self.image_std if image_std is None else image_std + do_convert_annotations = ( + self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations + ) + do_pad = self.do_pad if do_pad is None else do_pad + pad_size = self.pad_size if pad_size is None else pad_size + format = self.format if format is None else format + + images = make_flat_list_of_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_processor_keys) + + # Here, the pad() method pads to the maximum of (width, height). It does not need to be validated. 
+ validate_preprocess_arguments( + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_resize=do_resize, + size=size, + resample=resample, + ) + + if annotations is not None and isinstance(annotations, dict): + annotations = [annotations] + + if annotations is not None and len(images) != len(annotations): + raise ValueError( + f"The number of images ({len(images)}) and annotations ({len(annotations)}) do not match." + ) + + format = AnnotationFormat(format) + if annotations is not None: + validate_annotations(format, SUPPORTED_ANNOTATION_FORMATS, annotations) + + if ( + masks_path is not None + and format == AnnotationFormat.COCO_PANOPTIC + and not isinstance(masks_path, (pathlib.Path, str)) + ): + raise ValueError( + "The path to the directory containing the mask PNG files should be provided as a" + f" `pathlib.Path` or string object, but is {type(masks_path)} instead." + ) + + # All transformations expect numpy arrays + images = [to_numpy_array(image) for image in images] + + if do_rescale and is_scaled_image(images[0]): + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. + input_data_format = infer_channel_dimension_format(images[0]) + + # prepare (COCO annotations as a list of Dict -> DETR target as a single Dict per image) + if annotations is not None: + prepared_images = [] + prepared_annotations = [] + for image, target in zip(images, annotations): + target = self.prepare_annotation( + image, + target, + format, + return_segmentation_masks=return_segmentation_masks, + masks_path=masks_path, + input_data_format=input_data_format, + ) + prepared_images.append(image) + prepared_annotations.append(target) + images = prepared_images + annotations = prepared_annotations + del prepared_images, prepared_annotations + + # transformations + if do_resize: + if annotations is not None: + resized_images, resized_annotations = [], [] + for image, target in zip(images, annotations): + orig_size = get_image_size(image, input_data_format) + resized_image = self.resize( + image, size=size, resample=resample, input_data_format=input_data_format + ) + resized_annotation = self.resize_annotation( + target, orig_size, get_image_size(resized_image, input_data_format) + ) + resized_images.append(resized_image) + resized_annotations.append(resized_annotation) + images = resized_images + annotations = resized_annotations + del resized_images, resized_annotations + else: + images = [ + self.resize(image, size=size, resample=resample, input_data_format=input_data_format) + for image in images + ] + + if do_rescale: + images = [self.rescale(image, rescale_factor, input_data_format=input_data_format) for image in images] + + if do_normalize: + images = [ + self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images + ] + + if do_convert_annotations and annotations is not None: + annotations = [ + self.normalize_annotation(annotation, get_image_size(image, input_data_format)) + for annotation, image in zip(annotations, images) + ] + + if do_pad: + # Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...} + encoded_inputs = self.pad( + images, + annotations=annotations, + return_pixel_mask=True, + 
data_format=data_format, + input_data_format=input_data_format, + update_bboxes=do_convert_annotations, + return_tensors=return_tensors, + pad_size=pad_size, + ) + else: + images = [ + to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) + for image in images + ] + encoded_inputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors) + if annotations is not None: + encoded_inputs["labels"] = [ + BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations + ] + + return encoded_inputs + + # Copied from transformers.models.owlvit.image_processing_owlvit.OwlViTImageProcessor.post_process_object_detection with OwlViT->GroundingDino + def post_process_object_detection( + self, + outputs: "GroundingDinoObjectDetectionOutput", + threshold: float = 0.1, + target_sizes: Optional[Union[TensorType, list[tuple]]] = None, + ): + """ + Converts the raw output of [`GroundingDinoForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y, + bottom_right_x, bottom_right_y) format. + + Args: + outputs ([`GroundingDinoObjectDetectionOutput`]): + Raw outputs of the model. + threshold (`float`, *optional*, defaults to 0.1): + Score threshold to keep object detection predictions. + target_sizes (`torch.Tensor` or `list[tuple[int, int]]`, *optional*): + Tensor of shape `(batch_size, 2)` or list of tuples (`tuple[int, int]`) containing the target size + `(height, width)` of each image in the batch. If unset, predictions will not be resized. + + Returns: + `list[Dict]`: A list of dictionaries, each dictionary containing the following keys: + - "scores": The confidence scores for each predicted box on the image. + - "labels": Indexes of the classes predicted by the model on the image. + - "boxes": Image bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format. 
+ """ + batch_logits, batch_boxes = outputs.logits, outputs.pred_boxes + batch_size = len(batch_logits) + + if target_sizes is not None and len(target_sizes) != batch_size: + raise ValueError("Make sure that you pass in as many target sizes as images") + + # batch_logits of shape (batch_size, num_queries, num_classes) + batch_class_logits = torch.max(batch_logits, dim=-1) + batch_scores = torch.sigmoid(batch_class_logits.values) + batch_labels = batch_class_logits.indices + + # Convert to [x0, y0, x1, y1] format + batch_boxes = center_to_corners_format(batch_boxes) + + # Convert from relative [0, 1] to absolute [0, height] coordinates + if target_sizes is not None: + batch_boxes = _scale_boxes(batch_boxes, target_sizes) + + results = [] + for scores, labels, boxes in zip(batch_scores, batch_labels, batch_boxes): + keep = scores > threshold + scores = scores[keep] + labels = labels[keep] + boxes = boxes[keep] + results.append({"scores": scores, "labels": labels, "boxes": boxes}) + + return results + + +__all__ = ["GroundingDinoImageProcessor"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/grounding_dino/image_processing_grounding_dino_fast.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/grounding_dino/image_processing_grounding_dino_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..744cb5f92923068601add8ea3890ea55a1b09396 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/grounding_dino/image_processing_grounding_dino_fast.py @@ -0,0 +1,776 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/grounding_dino/modular_grounding_dino.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_grounding_dino.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +import pathlib +from typing import TYPE_CHECKING, Any, Optional, Union + +import torch +from torchvision.io import read_image +from torchvision.transforms.v2 import functional as F + +from ...image_processing_utils import BatchFeature, get_size_dict +from ...image_processing_utils_fast import ( + BaseImageProcessorFast, + DefaultFastImageProcessorKwargs, + SizeDict, + get_image_size_for_max_height_width, + get_max_height_width, + safe_squeeze, +) +from ...image_transforms import center_to_corners_format, corners_to_center_format +from ...image_utils import ( + IMAGENET_DEFAULT_MEAN, + IMAGENET_DEFAULT_STD, + AnnotationFormat, + AnnotationType, + ChannelDimension, + ImageInput, + PILImageResampling, + get_image_size, + validate_annotations, +) +from ...processing_utils import Unpack +from ...utils import TensorType, auto_docstring, logging +from ...utils.import_utils import requires +from .image_processing_grounding_dino import get_size_with_aspect_ratio + + +if TYPE_CHECKING: + from .modeling_grounding_dino import GroundingDinoObjectDetectionOutput + + +logger = logging.get_logger(__name__) + + +class GroundingDinoFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): + r""" + format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): + Data format of the annotations. One of "coco_detection" or "coco_panoptic". 
+ do_convert_annotations (`bool`, *optional*, defaults to `True`): + Controls whether to convert the annotations to the format expected by the GROUNDING_DINO model. Converts the + bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. + Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. + return_segmentation_masks (`bool`, *optional*, defaults to `False`): + Whether to return segmentation masks. + """ + + format: Optional[Union[str, AnnotationFormat]] + do_convert_annotations: Optional[bool] + return_segmentation_masks: Optional[bool] + + +SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) + + +# inspired by https://github.com/facebookresearch/grounding_dino/blob/master/datasets/coco.py#L33 +def convert_coco_poly_to_mask(segmentations, height: int, width: int, device: torch.device) -> torch.Tensor: + """ + Convert a COCO polygon annotation to a mask. + + Args: + segmentations (`list[list[float]]`): + List of polygons, each polygon represented by a list of x-y coordinates. + height (`int`): + Height of the mask. + width (`int`): + Width of the mask. + """ + try: + from pycocotools import mask as coco_mask + except ImportError: + raise ImportError("Pycocotools is not installed in your environment.") + + masks = [] + for polygons in segmentations: + rles = coco_mask.frPyObjects(polygons, height, width) + mask = coco_mask.decode(rles) + if len(mask.shape) < 3: + mask = mask[..., None] + mask = torch.as_tensor(mask, dtype=torch.uint8, device=device) + mask = torch.any(mask, axis=2) + masks.append(mask) + if masks: + masks = torch.stack(masks, axis=0) + else: + masks = torch.zeros((0, height, width), dtype=torch.uint8, device=device) + + return masks + + +# inspired by https://github.com/facebookresearch/grounding_dino/blob/master/datasets/coco.py#L50 +def prepare_coco_detection_annotation( + image, + target, + return_segmentation_masks: bool = False, + input_data_format: Optional[Union[ChannelDimension, str]] = None, +): + """ + Convert the target in COCO format into the format expected by GROUNDING_DINO. + """ + image_height, image_width = image.size()[-2:] + + image_id = target["image_id"] + image_id = torch.as_tensor([image_id], dtype=torch.int64, device=image.device) + + # Get all COCO annotations for the given image. 
+ annotations = target["annotations"] + classes = [] + area = [] + boxes = [] + keypoints = [] + for obj in annotations: + if "iscrowd" not in obj or obj["iscrowd"] == 0: + classes.append(obj["category_id"]) + area.append(obj["area"]) + boxes.append(obj["bbox"]) + if "keypoints" in obj: + keypoints.append(obj["keypoints"]) + + classes = torch.as_tensor(classes, dtype=torch.int64, device=image.device) + area = torch.as_tensor(area, dtype=torch.float32, device=image.device) + iscrowd = torch.zeros_like(classes, dtype=torch.int64, device=image.device) + # guard against no boxes via resizing + boxes = torch.as_tensor(boxes, dtype=torch.float32, device=image.device).reshape(-1, 4) + boxes[:, 2:] += boxes[:, :2] + boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=image_width) + boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=image_height) + + keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) + + new_target = { + "image_id": image_id, + "class_labels": classes[keep], + "boxes": boxes[keep], + "area": area[keep], + "iscrowd": iscrowd[keep], + "orig_size": torch.as_tensor([int(image_height), int(image_width)], dtype=torch.int64, device=image.device), + } + + if keypoints: + keypoints = torch.as_tensor(keypoints, dtype=torch.float32, device=image.device) + # Apply the keep mask here to filter the relevant annotations + keypoints = keypoints[keep] + num_keypoints = keypoints.shape[0] + keypoints = keypoints.reshape((-1, 3)) if num_keypoints else keypoints + new_target["keypoints"] = keypoints + + if return_segmentation_masks: + segmentation_masks = [obj["segmentation"] for obj in annotations] + masks = convert_coco_poly_to_mask(segmentation_masks, image_height, image_width, device=image.device) + new_target["masks"] = masks[keep] + + return new_target + + +def masks_to_boxes(masks: torch.Tensor) -> torch.Tensor: + """ + Compute the bounding boxes around the provided panoptic segmentation masks. + + Args: + masks: masks in format `[number_masks, height, width]` where N is the number of masks + + Returns: + boxes: bounding boxes in format `[number_masks, 4]` in xyxy format + """ + if masks.numel() == 0: + return torch.zeros((0, 4), device=masks.device) + + h, w = masks.shape[-2:] + y = torch.arange(0, h, dtype=torch.float32, device=masks.device) + x = torch.arange(0, w, dtype=torch.float32, device=masks.device) + # see https://github.com/pytorch/pytorch/issues/50276 + y, x = torch.meshgrid(y, x, indexing="ij") + + x_mask = masks * torch.unsqueeze(x, 0) + x_max = x_mask.view(x_mask.shape[0], -1).max(-1)[0] + x_min = ( + torch.where(masks, x.unsqueeze(0), torch.tensor(1e8, device=masks.device)).view(masks.shape[0], -1).min(-1)[0] + ) + + y_mask = masks * torch.unsqueeze(y, 0) + y_max = y_mask.view(y_mask.shape[0], -1).max(-1)[0] + y_min = ( + torch.where(masks, y.unsqueeze(0), torch.tensor(1e8, device=masks.device)).view(masks.shape[0], -1).min(-1)[0] + ) + + return torch.stack([x_min, y_min, x_max, y_max], 1) + + +# 2 functions below adapted from https://github.com/cocodataset/panopticapi/blob/master/panopticapi/utils.py +# Copyright (c) 2018, Alexander Kirillov +# All rights reserved. +def rgb_to_id(color): + """ + Converts RGB color to unique ID. 
+ """ + if isinstance(color, torch.Tensor) and len(color.shape) == 3: + if color.dtype == torch.uint8: + color = color.to(torch.int32) + return color[:, :, 0] + 256 * color[:, :, 1] + 256 * 256 * color[:, :, 2] + return int(color[0] + 256 * color[1] + 256 * 256 * color[2]) + + +def prepare_coco_panoptic_annotation( + image: torch.Tensor, + target: dict, + masks_path: Union[str, pathlib.Path], + return_masks: bool = True, + input_data_format: Union[ChannelDimension, str] = None, +) -> dict: + """ + Prepare a coco panoptic annotation for GROUNDING_DINO. + """ + image_height, image_width = get_image_size(image, channel_dim=input_data_format) + annotation_path = pathlib.Path(masks_path) / target["file_name"] + + new_target = {} + new_target["image_id"] = torch.as_tensor( + [target["image_id"] if "image_id" in target else target["id"]], dtype=torch.int64, device=image.device + ) + new_target["size"] = torch.as_tensor([image_height, image_width], dtype=torch.int64, device=image.device) + new_target["orig_size"] = torch.as_tensor([image_height, image_width], dtype=torch.int64, device=image.device) + + if "segments_info" in target: + masks = read_image(annotation_path).permute(1, 2, 0).to(dtype=torch.int32, device=image.device) + masks = rgb_to_id(masks) + + ids = torch.as_tensor([segment_info["id"] for segment_info in target["segments_info"]], device=image.device) + masks = masks == ids[:, None, None] + masks = masks.to(torch.bool) + if return_masks: + new_target["masks"] = masks + new_target["boxes"] = masks_to_boxes(masks) + new_target["class_labels"] = torch.as_tensor( + [segment_info["category_id"] for segment_info in target["segments_info"]], + dtype=torch.int64, + device=image.device, + ) + new_target["iscrowd"] = torch.as_tensor( + [segment_info["iscrowd"] for segment_info in target["segments_info"]], + dtype=torch.int64, + device=image.device, + ) + new_target["area"] = torch.as_tensor( + [segment_info["area"] for segment_info in target["segments_info"]], + dtype=torch.float32, + device=image.device, + ) + + return new_target + + +def _scale_boxes(boxes, target_sizes): + """ + Scale batch of bounding boxes to the target sizes. + + Args: + boxes (`torch.Tensor` of shape `(batch_size, num_boxes, 4)`): + Bounding boxes to scale. Each box is expected to be in (x1, y1, x2, y2) format. + target_sizes (`list[tuple[int, int]]` or `torch.Tensor` of shape `(batch_size, 2)`): + Target sizes to scale the boxes to. Each target size is expected to be in (height, width) format. + + Returns: + `torch.Tensor` of shape `(batch_size, num_boxes, 4)`: Scaled bounding boxes. 
+ """ + + if isinstance(target_sizes, (list, tuple)): + image_height = torch.tensor([i[0] for i in target_sizes]) + image_width = torch.tensor([i[1] for i in target_sizes]) + elif isinstance(target_sizes, torch.Tensor): + image_height, image_width = target_sizes.unbind(1) + else: + raise TypeError("`target_sizes` must be a list, tuple or torch.Tensor") + + scale_factor = torch.stack([image_width, image_height, image_width, image_height], dim=1) + scale_factor = scale_factor.unsqueeze(1).to(boxes.device) + boxes = boxes * scale_factor + return boxes + + +@auto_docstring +@requires(backends=("torchvision", "torch")) +class GroundingDinoImageProcessorFast(BaseImageProcessorFast): + resample = PILImageResampling.BILINEAR + image_mean = IMAGENET_DEFAULT_MEAN + image_std = IMAGENET_DEFAULT_STD + format = AnnotationFormat.COCO_DETECTION + do_resize = True + do_rescale = True + do_normalize = True + do_pad = True + size = {"shortest_edge": 800, "longest_edge": 1333} + default_to_square = False + model_input_names = ["pixel_values", "pixel_mask"] + valid_kwargs = GroundingDinoFastImageProcessorKwargs + + def __init__(self, **kwargs: Unpack[GroundingDinoFastImageProcessorKwargs]) -> None: + if "pad_and_return_pixel_mask" in kwargs: + kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask") + + size = kwargs.pop("size", None) + if "max_size" in kwargs: + logger.warning_once( + "The `max_size` parameter is deprecated and will be removed in v4.26. " + "Please specify in `size['longest_edge'] instead`.", + ) + max_size = kwargs.pop("max_size") + else: + max_size = None if size is None else 1333 + + size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333} + self.size = get_size_dict(size, max_size=max_size, default_to_square=False) + + # Backwards compatibility + do_convert_annotations = kwargs.get("do_convert_annotations") + do_normalize = kwargs.get("do_normalize") + if do_convert_annotations is None and getattr(self, "do_convert_annotations", None) is None: + self.do_convert_annotations = do_normalize if do_normalize is not None else self.do_normalize + + super().__init__(**kwargs) + + @classmethod + def from_dict(cls, image_processor_dict: dict[str, Any], **kwargs): + """ + Overrides the `from_dict` method from the base class to make sure parameters are updated if image processor is + created using from_dict and kwargs e.g. `GroundingDinoImageProcessorFast.from_pretrained(checkpoint, size=600, + max_size=800)` + """ + image_processor_dict = image_processor_dict.copy() + if "max_size" in kwargs: + image_processor_dict["max_size"] = kwargs.pop("max_size") + if "pad_and_return_pixel_mask" in kwargs: + image_processor_dict["pad_and_return_pixel_mask"] = kwargs.pop("pad_and_return_pixel_mask") + return super().from_dict(image_processor_dict, **kwargs) + + def prepare_annotation( + self, + image: torch.Tensor, + target: dict, + format: Optional[AnnotationFormat] = None, + return_segmentation_masks: Optional[bool] = None, + masks_path: Optional[Union[str, pathlib.Path]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> dict: + """ + Prepare an annotation for feeding into GROUNDING_DINO model. 
+ """ + format = format if format is not None else self.format + + if format == AnnotationFormat.COCO_DETECTION: + return_segmentation_masks = False if return_segmentation_masks is None else return_segmentation_masks + target = prepare_coco_detection_annotation( + image, target, return_segmentation_masks, input_data_format=input_data_format + ) + elif format == AnnotationFormat.COCO_PANOPTIC: + return_segmentation_masks = True if return_segmentation_masks is None else return_segmentation_masks + target = prepare_coco_panoptic_annotation( + image, + target, + masks_path=masks_path, + return_masks=return_segmentation_masks, + input_data_format=input_data_format, + ) + else: + raise ValueError(f"Format {format} is not supported.") + return target + + def resize( + self, + image: torch.Tensor, + size: SizeDict, + interpolation: Optional["F.InterpolationMode"] = None, + **kwargs, + ) -> torch.Tensor: + """ + Resize the image to the given size. Size can be `min_size` (scalar) or `(height, width)` tuple. If size is an + int, smaller edge of the image will be matched to this number. + + Args: + image (`torch.Tensor`): + Image to resize. + size (`SizeDict`): + Size of the image's `(height, width)` dimensions after resizing. Available options are: + - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. + Do NOT keep the aspect ratio. + - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting + the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge + less or equal to `longest_edge`. + - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the + aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to + `max_width`. + interpolation (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`): + Resampling filter to use if resizing the image. + """ + interpolation = interpolation if interpolation is not None else F.InterpolationMode.BILINEAR + if size.shortest_edge and size.longest_edge: + # Resize the image so that the shortest edge or the longest edge is of the given size + # while maintaining the aspect ratio of the original image. + new_size = get_size_with_aspect_ratio( + image.size()[-2:], + size["shortest_edge"], + size["longest_edge"], + ) + elif size.max_height and size.max_width: + new_size = get_image_size_for_max_height_width(image.size()[-2:], size["max_height"], size["max_width"]) + elif size.height and size.width: + new_size = (size["height"], size["width"]) + else: + raise ValueError( + "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got" + f" {size.keys()}." + ) + + image = F.resize( + image, + size=new_size, + interpolation=interpolation, + **kwargs, + ) + return image + + def resize_annotation( + self, + annotation: dict[str, Any], + orig_size: tuple[int, int], + target_size: tuple[int, int], + threshold: float = 0.5, + interpolation: Optional["F.InterpolationMode"] = None, + ): + """ + Resizes an annotation to a target size. + + Args: + annotation (`dict[str, Any]`): + The annotation dictionary. + orig_size (`tuple[int, int]`): + The original size of the input image. + target_size (`tuple[int, int]`): + The target size of the image, as returned by the preprocessing `resize` step. + threshold (`float`, *optional*, defaults to 0.5): + The threshold used to binarize the segmentation masks. 
+ resample (`InterpolationMode`, defaults to `F.InterpolationMode.NEAREST_EXACT`): + The resampling filter to use when resizing the masks. + """ + interpolation = interpolation if interpolation is not None else F.InterpolationMode.NEAREST_EXACT + ratio_height, ratio_width = [target / orig for target, orig in zip(target_size, orig_size)] + + new_annotation = {} + new_annotation["size"] = target_size + + for key, value in annotation.items(): + if key == "boxes": + boxes = value + scaled_boxes = boxes * torch.as_tensor( + [ratio_width, ratio_height, ratio_width, ratio_height], dtype=torch.float32, device=boxes.device + ) + new_annotation["boxes"] = scaled_boxes + elif key == "area": + area = value + scaled_area = area * (ratio_width * ratio_height) + new_annotation["area"] = scaled_area + elif key == "masks": + masks = value[:, None] + masks = [F.resize(mask, target_size, interpolation=interpolation) for mask in masks] + masks = torch.stack(masks).to(torch.float32) + masks = masks[:, 0] > threshold + new_annotation["masks"] = masks + elif key == "size": + new_annotation["size"] = target_size + else: + new_annotation[key] = value + + return new_annotation + + def normalize_annotation(self, annotation: dict, image_size: tuple[int, int]) -> dict: + image_height, image_width = image_size + norm_annotation = {} + for key, value in annotation.items(): + if key == "boxes": + boxes = value + boxes = corners_to_center_format(boxes) + boxes /= torch.as_tensor( + [image_width, image_height, image_width, image_height], dtype=torch.float32, device=boxes.device + ) + norm_annotation[key] = boxes + else: + norm_annotation[key] = value + return norm_annotation + + def _update_annotation_for_padded_image( + self, + annotation: dict, + input_image_size: tuple[int, int], + output_image_size: tuple[int, int], + padding, + update_bboxes, + ) -> dict: + """ + Update the annotation for a padded image. + """ + new_annotation = {} + new_annotation["size"] = output_image_size + ratio_height, ratio_width = (input / output for output, input in zip(output_image_size, input_image_size)) + + for key, value in annotation.items(): + if key == "masks": + masks = value + masks = F.pad( + masks, + padding, + fill=0, + ) + masks = safe_squeeze(masks, 1) + new_annotation["masks"] = masks + elif key == "boxes" and update_bboxes: + boxes = value + boxes *= torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height], device=boxes.device) + new_annotation["boxes"] = boxes + elif key == "size": + new_annotation["size"] = output_image_size + else: + new_annotation[key] = value + return new_annotation + + def pad( + self, + image: torch.Tensor, + padded_size: tuple[int, int], + annotation: Optional[dict[str, Any]] = None, + update_bboxes: bool = True, + fill: int = 0, + ): + original_size = image.size()[-2:] + padding_bottom = padded_size[0] - original_size[0] + padding_right = padded_size[1] - original_size[1] + if padding_bottom < 0 or padding_right < 0: + raise ValueError( + f"Padding dimensions are negative. Please make sure that the padded size is larger than the " + f"original size. Got padded size: {padded_size}, original size: {original_size}." 
+ ) + if original_size != padded_size: + padding = [0, 0, padding_right, padding_bottom] + image = F.pad(image, padding, fill=fill) + if annotation is not None: + annotation = self._update_annotation_for_padded_image( + annotation, original_size, padded_size, padding, update_bboxes + ) + + # Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding. + pixel_mask = torch.zeros(padded_size, dtype=torch.int64, device=image.device) + pixel_mask[: original_size[0], : original_size[1]] = 1 + + return image, pixel_mask, annotation + + @auto_docstring + def preprocess( + self, + images: ImageInput, + annotations: Optional[Union[AnnotationType, list[AnnotationType]]] = None, + masks_path: Optional[Union[str, pathlib.Path]] = None, + **kwargs: Unpack[GroundingDinoFastImageProcessorKwargs], + ) -> BatchFeature: + r""" + annotations (`AnnotationType` or `list[AnnotationType]`, *optional*): + List of annotations associated with the image or batch of images. If annotation is for object + detection, the annotations should be a dictionary with the following keys: + - "image_id" (`int`): The image id. + - "annotations" (`list[Dict]`): List of annotations for an image. Each annotation should be a + dictionary. An image can have no annotations, in which case the list should be empty. + If annotation is for segmentation, the annotations should be a dictionary with the following keys: + - "image_id" (`int`): The image id. + - "segments_info" (`list[Dict]`): List of segments for an image. Each segment should be a dictionary. + An image can have no segments, in which case the list should be empty. + - "file_name" (`str`): The file name of the image. + masks_path (`str` or `pathlib.Path`, *optional*): + Path to the directory containing the segmentation masks. + """ + if "pad_and_return_pixel_mask" in kwargs: + kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask") + logger.warning_once( + "The `pad_and_return_pixel_mask` argument is deprecated and will be removed in a future version, " + "use `do_pad` instead." + ) + + if "max_size" in kwargs: + logger.warning_once( + "The `max_size` argument is deprecated and will be removed in a future version, use" + " `size['longest_edge']` instead." + ) + kwargs["size"] = kwargs.pop("max_size") + + return super().preprocess(images, annotations, masks_path, **kwargs) + + def _preprocess( + self, + images: list["torch.Tensor"], + annotations: Optional[Union[AnnotationType, list[AnnotationType]]], + masks_path: Optional[Union[str, pathlib.Path]], + return_segmentation_masks: bool, + do_resize: bool, + size: SizeDict, + interpolation: Optional["F.InterpolationMode"], + do_rescale: bool, + rescale_factor: float, + do_normalize: bool, + do_convert_annotations: bool, + image_mean: Optional[Union[float, list[float]]], + image_std: Optional[Union[float, list[float]]], + do_pad: bool, + pad_size: Optional[SizeDict], + format: Optional[Union[str, AnnotationFormat]], + return_tensors: Optional[Union[str, TensorType]], + **kwargs, + ) -> BatchFeature: + """ + Preprocess an image or a batch of images so that it can be used by the model. + """ + if annotations is not None and isinstance(annotations, dict): + annotations = [annotations] + + if annotations is not None and len(images) != len(annotations): + raise ValueError( + f"The number of images ({len(images)}) and annotations ({len(annotations)}) do not match." 
+ ) + + format = AnnotationFormat(format) + if annotations is not None: + validate_annotations(format, SUPPORTED_ANNOTATION_FORMATS, annotations) + + if ( + masks_path is not None + and format == AnnotationFormat.COCO_PANOPTIC + and not isinstance(masks_path, (pathlib.Path, str)) + ): + raise ValueError( + "The path to the directory containing the mask PNG files should be provided as a" + f" `pathlib.Path` or string object, but is {type(masks_path)} instead." + ) + + data = {} + + processed_images = [] + processed_annotations = [] + pixel_masks = [] # Initialize pixel_masks here + for image, annotation in zip(images, annotations if annotations is not None else [None] * len(images)): + # prepare (COCO annotations as a list of Dict -> GROUNDING_DINO target as a single Dict per image) + if annotations is not None: + annotation = self.prepare_annotation( + image, + annotation, + format, + return_segmentation_masks=return_segmentation_masks, + masks_path=masks_path, + input_data_format=ChannelDimension.FIRST, + ) + + if do_resize: + resized_image = self.resize(image, size=size, interpolation=interpolation) + if annotations is not None: + annotation = self.resize_annotation( + annotation, + orig_size=image.size()[-2:], + target_size=resized_image.size()[-2:], + ) + image = resized_image + # Fused rescale and normalize + image = self.rescale_and_normalize(image, do_rescale, rescale_factor, do_normalize, image_mean, image_std) + if do_convert_annotations and annotations is not None: + annotation = self.normalize_annotation(annotation, get_image_size(image, ChannelDimension.FIRST)) + + processed_images.append(image) + processed_annotations.append(annotation) + images = processed_images + annotations = processed_annotations if annotations is not None else None + + if do_pad: + # depends on all resized image shapes so we need another loop + if pad_size is not None: + padded_size = (pad_size.height, pad_size.width) + else: + padded_size = get_max_height_width(images) + + padded_images = [] + padded_annotations = [] + for image, annotation in zip(images, annotations if annotations is not None else [None] * len(images)): + # Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...} + if padded_size == image.size()[-2:]: + padded_images.append(image) + pixel_masks.append(torch.ones(padded_size, dtype=torch.int64, device=image.device)) + padded_annotations.append(annotation) + continue + image, pixel_mask, annotation = self.pad( + image, padded_size, annotation=annotation, update_bboxes=do_convert_annotations + ) + padded_images.append(image) + padded_annotations.append(annotation) + pixel_masks.append(pixel_mask) + images = padded_images + annotations = padded_annotations if annotations is not None else None + data.update({"pixel_mask": torch.stack(pixel_masks, dim=0)}) + + data.update({"pixel_values": torch.stack(images, dim=0)}) + encoded_inputs = BatchFeature(data, tensor_type=return_tensors) + if annotations is not None: + encoded_inputs["labels"] = [ + BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations + ] + return encoded_inputs + + def post_process_object_detection( + self, + outputs: "GroundingDinoObjectDetectionOutput", + threshold: float = 0.1, + target_sizes: Optional[Union[TensorType, list[tuple]]] = None, + ): + """ + Converts the raw output of [`GroundingDinoForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y, + bottom_right_x, bottom_right_y) format. 
+ + Args: + outputs ([`GroundingDinoObjectDetectionOutput`]): + Raw outputs of the model. + threshold (`float`, *optional*, defaults to 0.1): + Score threshold to keep object detection predictions. + target_sizes (`torch.Tensor` or `list[tuple[int, int]]`, *optional*): + Tensor of shape `(batch_size, 2)` or list of tuples (`tuple[int, int]`) containing the target size + `(height, width)` of each image in the batch. If unset, predictions will not be resized. + + Returns: + `list[Dict]`: A list of dictionaries, each dictionary containing the following keys: + - "scores": The confidence scores for each predicted box on the image. + - "labels": Indexes of the classes predicted by the model on the image. + - "boxes": Image bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format. + """ + batch_logits, batch_boxes = outputs.logits, outputs.pred_boxes + batch_size = len(batch_logits) + + if target_sizes is not None and len(target_sizes) != batch_size: + raise ValueError("Make sure that you pass in as many target sizes as images") + + # batch_logits of shape (batch_size, num_queries, num_classes) + batch_class_logits = torch.max(batch_logits, dim=-1) + batch_scores = torch.sigmoid(batch_class_logits.values) + batch_labels = batch_class_logits.indices + + # Convert to [x0, y0, x1, y1] format + batch_boxes = center_to_corners_format(batch_boxes) + + # Convert from relative [0, 1] to absolute [0, height] coordinates + if target_sizes is not None: + batch_boxes = _scale_boxes(batch_boxes, target_sizes) + + results = [] + for scores, labels, boxes in zip(batch_scores, batch_labels, batch_boxes): + keep = scores > threshold + scores = scores[keep] + labels = labels[keep] + boxes = boxes[keep] + results.append({"scores": scores, "labels": labels, "boxes": boxes}) + + return results + + +__all__ = ["GroundingDinoImageProcessorFast"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/grounding_dino/modeling_grounding_dino.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/grounding_dino/modeling_grounding_dino.py new file mode 100644 index 0000000000000000000000000000000000000000..5d674caca6fa7dc52246addce3ff0f8d16a9edda --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -0,0 +1,2635 @@ +# coding=utf-8 +# Copyright 2024 IDEA Research and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""PyTorch Grounding DINO model.""" + +import math +import warnings +from dataclasses import dataclass +from typing import Optional, Union + +import torch +import torch.nn.functional as F +from torch import Tensor, nn + +from ...activations import ACT2FN +from ...file_utils import ModelOutput, is_timm_available, requires_backends +from ...integrations import use_kernel_forward_from_hub +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import meshgrid +from ...utils import auto_docstring, logging +from ...utils.backbone_utils import load_backbone +from ..auto import AutoModel +from .configuration_grounding_dino import GroundingDinoConfig + + +if is_timm_available(): + from timm import create_model + + +logger = logging.get_logger(__name__) + + +@use_kernel_forward_from_hub("MultiScaleDeformableAttention") +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.MultiScaleDeformableAttention +class MultiScaleDeformableAttention(nn.Module): + def forward( + self, + value: Tensor, + value_spatial_shapes: Tensor, + value_spatial_shapes_list: list[tuple], + level_start_index: Tensor, + sampling_locations: Tensor, + attention_weights: Tensor, + im2col_step: int, + ): + batch_size, _, num_heads, hidden_dim = value.shape + _, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape + value_list = value.split([height * width for height, width in value_spatial_shapes_list], dim=1) + sampling_grids = 2 * sampling_locations - 1 + sampling_value_list = [] + for level_id, (height, width) in enumerate(value_spatial_shapes_list): + # batch_size, height*width, num_heads, hidden_dim + # -> batch_size, height*width, num_heads*hidden_dim + # -> batch_size, num_heads*hidden_dim, height*width + # -> batch_size*num_heads, hidden_dim, height, width + value_l_ = ( + value_list[level_id] + .flatten(2) + .transpose(1, 2) + .reshape(batch_size * num_heads, hidden_dim, height, width) + ) + # batch_size, num_queries, num_heads, num_points, 2 + # -> batch_size, num_heads, num_queries, num_points, 2 + # -> batch_size*num_heads, num_queries, num_points, 2 + sampling_grid_l_ = sampling_grids[:, :, :, level_id].transpose(1, 2).flatten(0, 1) + # batch_size*num_heads, hidden_dim, num_queries, num_points + sampling_value_l_ = nn.functional.grid_sample( + value_l_, + sampling_grid_l_, + mode="bilinear", + padding_mode="zeros", + align_corners=False, + ) + sampling_value_list.append(sampling_value_l_) + # (batch_size, num_queries, num_heads, num_levels, num_points) + # -> (batch_size, num_heads, num_queries, num_levels, num_points) + # -> (batch_size, num_heads, 1, num_queries, num_levels*num_points) + attention_weights = attention_weights.transpose(1, 2).reshape( + batch_size * num_heads, 1, num_queries, num_levels * num_points + ) + output = ( + (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights) + .sum(-1) + .view(batch_size, num_heads * hidden_dim, num_queries) + ) + return output.transpose(1, 2).contiguous() + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for outputs of the GroundingDinoDecoder. This class adds two attributes to + BaseModelOutputWithCrossAttentions, namely: + - a stacked tensor of intermediate decoder hidden states (i.e. the output of each decoder layer) + - a stacked tensor of intermediate reference points. 
+ """ +) +class GroundingDinoDecoderOutput(ModelOutput): + r""" + intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`): + Stacked intermediate hidden states (output of each layer of the decoder). + intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, hidden_size)`): + Stacked intermediate reference points (reference points of each layer of the decoder). + """ + + last_hidden_state: Optional[torch.FloatTensor] = None + intermediate_hidden_states: Optional[torch.FloatTensor] = None + intermediate_reference_points: Optional[torch.FloatTensor] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[tuple[torch.FloatTensor]]] = None + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for outputs of the GroundingDinoEncoder. This class extends BaseModelOutput, due to: + - vision and text last hidden states + - vision and text intermediate hidden states + """ +) +class GroundingDinoEncoderOutput(ModelOutput): + r""" + last_hidden_state_vision (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the vision encoder. + last_hidden_state_text (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the text encoder. + vision_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the vision embeddings + one for the output of each + layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the vision encoder at the + output of each layer plus the initial embedding outputs. + text_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer) + of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of + each layer plus the initial embedding outputs. + """ + + last_hidden_state_vision: Optional[torch.FloatTensor] = None + last_hidden_state_text: Optional[torch.FloatTensor] = None + vision_hidden_states: Optional[tuple[torch.FloatTensor]] = None + text_hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[tuple[torch.FloatTensor]]] = None + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for outputs of the Grounding DINO encoder-decoder model. + """ +) +class GroundingDinoModelOutput(ModelOutput): + r""" + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the decoder of the model. + init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): + Initial reference points sent through the Transformer decoder. + intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`): + Stacked intermediate hidden states (output of each layer of the decoder). 
+    intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
+        Stacked intermediate reference points (reference points of each layer of the decoder).
+    encoder_last_hidden_state_vision (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+        Sequence of hidden-states at the output of the last layer of the encoder of the model.
+    encoder_last_hidden_state_text (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+        Sequence of hidden-states at the output of the last layer of the encoder of the model.
+    encoder_vision_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+        Tuple of `torch.FloatTensor` (one for the output of the vision embeddings + one for the output of each
+        layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the vision encoder at the
+        output of each layer plus the initial embedding outputs.
+    encoder_text_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+        Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer)
+        of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of
+        each layer plus the initial embedding outputs.
+    encoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+        Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads,
+        sequence_length, sequence_length)`. Attention weights after the attention softmax, used to compute the
+        weighted average in the text-vision attention, vision-text attention, text-enhancer (self-attention) and
+        multi-scale deformable attention heads.
+    enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.two_stage=True`):
+        Predicted bounding boxes scores where the top `config.num_queries` scoring bounding boxes are picked as
+        region proposals in the first stage. Output of bounding box binary classification (i.e. foreground and
+        background).
+    enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.two_stage=True`):
+        Logits of predicted bounding boxes coordinates in the first stage.
+    encoder_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.two_stage=True`):
+        Logits of top `config.num_queries` scoring bounding boxes in the first stage.
+    encoder_pred_boxes (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.two_stage=True`):
+        Coordinates of top `config.num_queries` scoring bounding boxes in the first stage.
+ """ + + last_hidden_state: Optional[torch.FloatTensor] = None + init_reference_points: Optional[torch.FloatTensor] = None + intermediate_hidden_states: Optional[torch.FloatTensor] = None + intermediate_reference_points: Optional[torch.FloatTensor] = None + decoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[tuple[tuple[torch.FloatTensor]]] = None + encoder_last_hidden_state_vision: Optional[torch.FloatTensor] = None + encoder_last_hidden_state_text: Optional[torch.FloatTensor] = None + encoder_vision_hidden_states: Optional[tuple[torch.FloatTensor]] = None + encoder_text_hidden_states: Optional[tuple[torch.FloatTensor]] = None + encoder_attentions: Optional[tuple[tuple[torch.FloatTensor]]] = None + enc_outputs_class: Optional[torch.FloatTensor] = None + enc_outputs_coord_logits: Optional[torch.FloatTensor] = None + encoder_logits: Optional[torch.FloatTensor] = None + encoder_pred_boxes: Optional[torch.FloatTensor] = None + + +@dataclass +@auto_docstring( + custom_intro=""" + Output type of [`GroundingDinoForObjectDetection`]. + """ +) +class GroundingDinoObjectDetectionOutput(ModelOutput): + r""" + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)): + Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a + bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized + scale-invariant IoU loss. + loss_dict (`Dict`, *optional*): + A dictionary containing the individual losses. Useful for logging. + logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`): + Classification logits (including no-object) for all queries. + pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): + Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These + values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding + possible padding). You can use [`~GroundingDinoProcessor.post_process_grounded_object_detection`] to retrieve the + unnormalized bounding boxes. + auxiliary_outputs (`list[Dict]`, *optional*): + Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`) + and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and + `pred_boxes`) for each decoder layer. + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the decoder of the model. + init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`): + Initial reference points sent through the Transformer decoder. + intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`): + Stacked intermediate hidden states (output of each layer of the decoder). + intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`): + Stacked intermediate reference points (reference points of each layer of the decoder). + encoder_last_hidden_state_vision (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. 
+ encoder_last_hidden_state_text (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_vision_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the vision embeddings + one for the output of each + layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the vision encoder at the + output of each layer plus the initial embedding outputs. + encoder_text_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer) + of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of + each layer plus the initial embedding outputs. + enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.two_stage=True`): + Predicted bounding boxes scores where the top `config.num_queries` scoring bounding boxes are picked as + region proposals in the first stage. Output of bounding box binary classification (i.e. foreground and + background). + enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.two_stage=True`): + Logits of predicted bounding boxes coordinates in the first stage. + encoder_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.two_stage=True`): + Logits of top `config.num_queries` scoring bounding boxes in the first stage. + encoder_pred_boxes (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.two_stage=True`): + Coordinates of top `config.num_queries` scoring bounding boxes in the first stage. + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Encoded candidate labels sequence. Used in processor to post process object detection result. 
+ """ + + loss: Optional[torch.FloatTensor] = None + loss_dict: Optional[dict] = None + logits: Optional[torch.FloatTensor] = None + pred_boxes: Optional[torch.FloatTensor] = None + auxiliary_outputs: Optional[list[dict]] = None + last_hidden_state: Optional[torch.FloatTensor] = None + init_reference_points: Optional[torch.FloatTensor] = None + intermediate_hidden_states: Optional[torch.FloatTensor] = None + intermediate_reference_points: Optional[torch.FloatTensor] = None + decoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[tuple[tuple[torch.FloatTensor]]] = None + encoder_last_hidden_state_vision: Optional[torch.FloatTensor] = None + encoder_last_hidden_state_text: Optional[torch.FloatTensor] = None + encoder_vision_hidden_states: Optional[tuple[torch.FloatTensor]] = None + encoder_text_hidden_states: Optional[tuple[torch.FloatTensor]] = None + encoder_attentions: Optional[tuple[tuple[torch.FloatTensor]]] = None + enc_outputs_class: Optional[torch.FloatTensor] = None + enc_outputs_coord_logits: Optional[torch.FloatTensor] = None + encoder_logits: Optional[torch.FloatTensor] = None + encoder_pred_boxes: Optional[torch.FloatTensor] = None + input_ids: Optional[torch.LongTensor] = None + + +# Copied from transformers.models.detr.modeling_detr.DetrFrozenBatchNorm2d with Detr->GroundingDino +class GroundingDinoFrozenBatchNorm2d(nn.Module): + """ + BatchNorm2d where the batch statistics and the affine parameters are fixed. + + Copy-paste from torchvision.misc.ops with added eps before rqsrt, without which any other models than + torchvision.models.resnet[18,34,50,101] produce nans. + """ + + def __init__(self, n): + super().__init__() + self.register_buffer("weight", torch.ones(n)) + self.register_buffer("bias", torch.zeros(n)) + self.register_buffer("running_mean", torch.zeros(n)) + self.register_buffer("running_var", torch.ones(n)) + + def _load_from_state_dict( + self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ): + num_batches_tracked_key = prefix + "num_batches_tracked" + if num_batches_tracked_key in state_dict: + del state_dict[num_batches_tracked_key] + + super()._load_from_state_dict( + state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ) + + def forward(self, x): + # move reshapes to the beginning + # to make it user-friendly + weight = self.weight.reshape(1, -1, 1, 1) + bias = self.bias.reshape(1, -1, 1, 1) + running_var = self.running_var.reshape(1, -1, 1, 1) + running_mean = self.running_mean.reshape(1, -1, 1, 1) + epsilon = 1e-5 + scale = weight * (running_var + epsilon).rsqrt() + bias = bias - running_mean * scale + return x * scale + bias + + +# Copied from transformers.models.detr.modeling_detr.replace_batch_norm with Detr->GroundingDino +def replace_batch_norm(model): + r""" + Recursively replace all `torch.nn.BatchNorm2d` with `GroundingDinoFrozenBatchNorm2d`. 
+ + Args: + model (torch.nn.Module): + input model + """ + for name, module in model.named_children(): + if isinstance(module, nn.BatchNorm2d): + new_module = GroundingDinoFrozenBatchNorm2d(module.num_features) + + if module.weight.device != torch.device("meta"): + new_module.weight.data.copy_(module.weight) + new_module.bias.data.copy_(module.bias) + new_module.running_mean.data.copy_(module.running_mean) + new_module.running_var.data.copy_(module.running_var) + + model._modules[name] = new_module + + if len(list(module.children())) > 0: + replace_batch_norm(module) + + +class GroundingDinoConvEncoder(nn.Module): + """ + Convolutional backbone, using either the AutoBackbone API or one from the timm library. + + nn.BatchNorm2d layers are replaced by GroundingDinoFrozenBatchNorm2d as defined above. + + """ + + def __init__(self, config): + super().__init__() + + self.config = config + + if config.use_timm_backbone: + requires_backends(self, ["timm"]) + backbone = create_model( + config.backbone, + pretrained=config.use_pretrained_backbone, + features_only=True, + **config.backbone_kwargs, + ) + else: + backbone = load_backbone(config) + + # replace batch norm by frozen batch norm + with torch.no_grad(): + replace_batch_norm(backbone) + self.model = backbone + self.intermediate_channel_sizes = ( + self.model.feature_info.channels() if config.use_timm_backbone else self.model.channels + ) + + backbone_model_type = None + if config.backbone is not None: + backbone_model_type = config.backbone + elif config.backbone_config is not None: + backbone_model_type = config.backbone_config.model_type + else: + raise ValueError("Either `backbone` or `backbone_config` should be provided in the config") + + if "resnet" in backbone_model_type: + for name, parameter in self.model.named_parameters(): + if config.use_timm_backbone: + if "layer2" not in name and "layer3" not in name and "layer4" not in name: + parameter.requires_grad_(False) + else: + if "stage.1" not in name and "stage.2" not in name and "stage.3" not in name: + parameter.requires_grad_(False) + + # Copied from transformers.models.detr.modeling_detr.DetrConvEncoder.forward with Detr->GroundingDino + def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor): + # send pixel_values through the model to get list of feature maps + features = self.model(pixel_values) if self.config.use_timm_backbone else self.model(pixel_values).feature_maps + + out = [] + for feature_map in features: + # downsample pixel_mask to match shape of corresponding feature_map + mask = nn.functional.interpolate(pixel_mask[None].float(), size=feature_map.shape[-2:]).to(torch.bool)[0] + out.append((feature_map, mask)) + return out + + +# Copied from transformers.models.detr.modeling_detr.DetrConvModel with Detr->GroundingDino +class GroundingDinoConvModel(nn.Module): + """ + This module adds 2D position embeddings to all intermediate feature maps of the convolutional encoder. 
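+
+    For each feature level, the forward pass returns the `(feature_map, downsampled_mask)` tuple produced by the
+    convolutional encoder together with a matching position embedding cast to the feature map's dtype.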
+ """ + + def __init__(self, conv_encoder, position_embedding): + super().__init__() + self.conv_encoder = conv_encoder + self.position_embedding = position_embedding + + def forward(self, pixel_values, pixel_mask): + # send pixel_values and pixel_mask through backbone to get list of (feature_map, pixel_mask) tuples + out = self.conv_encoder(pixel_values, pixel_mask) + pos = [] + for feature_map, mask in out: + # position encoding + pos.append(self.position_embedding(feature_map, mask).to(feature_map.dtype)) + + return out, pos + + +class GroundingDinoSinePositionEmbedding(nn.Module): + """ + This is a more standard version of the position embedding, very similar to the one used by the Attention is all you + need paper, generalized to work on images. + """ + + def __init__(self, config): + super().__init__() + self.embedding_dim = config.d_model // 2 + self.temperature = config.positional_embedding_temperature + self.scale = 2 * math.pi + + def forward(self, pixel_values, pixel_mask): + y_embed = pixel_mask.cumsum(1, dtype=torch.float32) + x_embed = pixel_mask.cumsum(2, dtype=torch.float32) + eps = 1e-6 + y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale + x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale + + dim_t = torch.arange(self.embedding_dim, dtype=torch.float32, device=pixel_values.device) + dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / self.embedding_dim) + + pos_x = x_embed[:, :, :, None] / dim_t + pos_y = y_embed[:, :, :, None] / dim_t + pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) + return pos + + +class GroundingDinoLearnedPositionEmbedding(nn.Module): + """ + This module learns positional embeddings up to a fixed maximum size. + """ + + def __init__(self, config): + super().__init__() + + embedding_dim = config.d_model // 2 + self.row_embeddings = nn.Embedding(50, embedding_dim) + self.column_embeddings = nn.Embedding(50, embedding_dim) + + def forward(self, pixel_values, pixel_mask=None): + height, width = pixel_values.shape[-2:] + width_values = torch.arange(width, device=pixel_values.device) + height_values = torch.arange(height, device=pixel_values.device) + x_emb = self.column_embeddings(width_values) + y_emb = self.row_embeddings(height_values) + pos = torch.cat([x_emb.unsqueeze(0).repeat(height, 1, 1), y_emb.unsqueeze(1).repeat(1, width, 1)], dim=-1) + pos = pos.permute(2, 0, 1) + pos = pos.unsqueeze(0) + pos = pos.repeat(pixel_values.shape[0], 1, 1, 1) + return pos + + +def build_position_encoding(config): + if config.position_embedding_type == "sine": + position_embedding = GroundingDinoSinePositionEmbedding(config) + elif config.position_embedding_type == "learned": + position_embedding = GroundingDinoLearnedPositionEmbedding(config) + else: + raise ValueError(f"Not supported {config.position_embedding_type}") + + return position_embedding + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrMultiscaleDeformableAttention with DeformableDetr->GroundingDino, Deformable DETR->Grounding DINO +class GroundingDinoMultiscaleDeformableAttention(nn.Module): + """ + Multiscale deformable attention as proposed in Deformable DETR. 
+ """ + + def __init__(self, config: GroundingDinoConfig, num_heads: int, n_points: int): + super().__init__() + + self.attn = MultiScaleDeformableAttention() + + if config.d_model % num_heads != 0: + raise ValueError( + f"embed_dim (d_model) must be divisible by num_heads, but got {config.d_model} and {num_heads}" + ) + dim_per_head = config.d_model // num_heads + # check if dim_per_head is power of 2 + if not ((dim_per_head & (dim_per_head - 1) == 0) and dim_per_head != 0): + warnings.warn( + "You'd better set embed_dim (d_model) in GroundingDinoMultiscaleDeformableAttention to make the" + " dimension of each attention head a power of 2 which is more efficient in the authors' CUDA" + " implementation." + ) + + self.im2col_step = 64 + + self.d_model = config.d_model + self.n_levels = config.num_feature_levels + self.n_heads = num_heads + self.n_points = n_points + + self.sampling_offsets = nn.Linear(config.d_model, num_heads * self.n_levels * n_points * 2) + self.attention_weights = nn.Linear(config.d_model, num_heads * self.n_levels * n_points) + self.value_proj = nn.Linear(config.d_model, config.d_model) + self.output_proj = nn.Linear(config.d_model, config.d_model) + + self.disable_custom_kernels = config.disable_custom_kernels + + def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]): + return tensor if position_embeddings is None else tensor + position_embeddings + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states=None, + encoder_attention_mask=None, + position_embeddings: Optional[torch.Tensor] = None, + reference_points=None, + spatial_shapes=None, + spatial_shapes_list=None, + level_start_index=None, + output_attentions: bool = False, + ): + # add position embeddings to the hidden states before projecting to queries and keys + if position_embeddings is not None: + hidden_states = self.with_pos_embed(hidden_states, position_embeddings) + + batch_size, num_queries, _ = hidden_states.shape + batch_size, sequence_length, _ = encoder_hidden_states.shape + # Ignore copy + if (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() != sequence_length: + raise ValueError( + "Make sure to align the spatial shapes with the sequence length of the encoder hidden states" + ) + + value = self.value_proj(encoder_hidden_states) + if attention_mask is not None: + # we invert the attention_mask + value = value.masked_fill(~attention_mask[..., None], float(0)) + value = value.view(batch_size, sequence_length, self.n_heads, self.d_model // self.n_heads) + sampling_offsets = self.sampling_offsets(hidden_states).view( + batch_size, num_queries, self.n_heads, self.n_levels, self.n_points, 2 + ) + attention_weights = self.attention_weights(hidden_states).view( + batch_size, num_queries, self.n_heads, self.n_levels * self.n_points + ) + attention_weights = F.softmax(attention_weights, -1).view( + batch_size, num_queries, self.n_heads, self.n_levels, self.n_points + ) + # batch_size, num_queries, n_heads, n_levels, n_points, 2 + num_coordinates = reference_points.shape[-1] + if num_coordinates == 2: + offset_normalizer = torch.stack([spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) + sampling_locations = ( + reference_points[:, :, None, :, None, :] + + sampling_offsets / offset_normalizer[None, None, None, :, None, :] + ) + elif num_coordinates == 4: + sampling_locations = ( + reference_points[:, :, None, :, None, :2] + + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 
0.5 + ) + else: + raise ValueError(f"Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}") + + output = self.attn( + value, + spatial_shapes, + spatial_shapes_list, + level_start_index, + sampling_locations, + attention_weights, + self.im2col_step, + ) + + output = self.output_proj(output) + + return output, attention_weights + + +class GroundingDinoTextEnhancerLayer(nn.Module): + """Vanilla Transformer with text embeddings as input""" + + def __init__(self, config): + super().__init__() + self.self_attn = GroundingDinoMultiheadAttention( + config, num_attention_heads=config.encoder_attention_heads // 2 + ) + + # Implementation of Feedforward model + self.fc1 = nn.Linear(config.d_model, config.encoder_ffn_dim // 2) + self.fc2 = nn.Linear(config.encoder_ffn_dim // 2, config.d_model) + + self.layer_norm_before = nn.LayerNorm(config.d_model, config.layer_norm_eps) + self.layer_norm_after = nn.LayerNorm(config.d_model, config.layer_norm_eps) + + self.activation = ACT2FN[config.activation_function] + self.num_heads = config.encoder_attention_heads // 2 + self.dropout = config.text_enhancer_dropout + + def with_pos_embed(self, hidden_state: Tensor, position_embeddings: Optional[Tensor]): + return hidden_state if position_embeddings is None else hidden_state + position_embeddings + + def forward( + self, + hidden_states: torch.FloatTensor, + attention_masks: Optional[torch.BoolTensor] = None, + position_embeddings: Optional[torch.FloatTensor] = None, + ) -> tuple[torch.FloatTensor, torch.FloatTensor]: + """Text self-attention to enhance projection of text features generated by + the text encoder (AutoModel based on text_config) within GroundingDinoEncoderLayer + + Args: + hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_dim)`): + Text features generated by the text encoder. + attention_masks (`torch.BoolTensor`, *optional*): + Attention mask for text self-attention. False for real tokens and True for padding tokens. + position_embeddings (`torch.FloatTensor`, *optional*): + Position embeddings to be added to the hidden states. + + Returns: + `tuple(torch.FloatTensor)` comprising two elements: + - **hidden_states** (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`) -- + Output of the text self-attention layer. + - **attention_weights** (`torch.FloatTensor` of shape `(batch_size, num_heads, sequence_length, + sequence_length)`) -- + Attention weights of the text self-attention layer. 
+ """ + + # repeat attn mask + if attention_masks.dim() == 3 and attention_masks.shape[0] == hidden_states.shape[0]: + # batch_size, num_queries, num_keys + attention_masks = attention_masks[:, None, :, :] + attention_masks = attention_masks.repeat(1, self.num_heads, 1, 1) + + dtype = hidden_states.dtype + attention_masks = attention_masks.to(dtype=dtype) # fp16 compatibility + attention_masks = (1.0 - attention_masks) * torch.finfo(dtype).min + + queries = keys = self.with_pos_embed(hidden_states, position_embeddings) + attention_output, attention_weights = self.self_attn( + queries=queries, + keys=keys, + values=hidden_states, + attention_mask=attention_masks, + output_attentions=True, + ) + attention_output = nn.functional.dropout(attention_output, p=self.dropout, training=self.training) + hidden_states = hidden_states + attention_output + hidden_states = self.layer_norm_before(hidden_states) + + residual = hidden_states + hidden_states = self.activation(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = hidden_states + residual + hidden_states = self.layer_norm_after(hidden_states) + + return hidden_states, attention_weights + + +class GroundingDinoBiMultiHeadAttention(nn.Module): + def __init__(self, config): + super().__init__() + + vision_dim = text_dim = config.d_model + embed_dim = config.encoder_ffn_dim // 2 + num_heads = config.encoder_attention_heads // 2 + dropout = config.fusion_dropout + + self.embed_dim = embed_dim + self.num_heads = num_heads + self.head_dim = embed_dim // num_heads + self.vision_dim = vision_dim + self.text_dim = text_dim + + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"`embed_dim` must be divisible by `num_heads` (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})." + ) + self.scale = self.head_dim ** (-0.5) + self.dropout = dropout + + self.vision_proj = nn.Linear(self.vision_dim, self.embed_dim) + self.text_proj = nn.Linear(self.text_dim, self.embed_dim) + self.values_vision_proj = nn.Linear(self.vision_dim, self.embed_dim) + self.values_text_proj = nn.Linear(self.text_dim, self.embed_dim) + + self.out_vision_proj = nn.Linear(self.embed_dim, self.vision_dim) + self.out_text_proj = nn.Linear(self.embed_dim, self.text_dim) + + def _reshape(self, tensor: torch.Tensor, seq_len: int, batch_size: int): + return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + vision_features: torch.FloatTensor, + text_features: torch.FloatTensor, + vision_attention_mask: Optional[torch.BoolTensor] = None, + text_attention_mask: Optional[torch.BoolTensor] = None, + ) -> tuple[tuple[torch.FloatTensor, torch.FloatTensor], tuple[torch.FloatTensor, torch.FloatTensor]]: + """Image-to-text and text-to-image cross-attention + + Args: + vision_features (`torch.FloatTensor` of shape `(batch_size, vision_sequence_length, hidden_dim)`): + Projected flattened image features generated by the vision backbone. + text_features (`torch.FloatTensor` of shape `(batch_size, text_sequence_length, hidden_dim)`): + Projected text features generated by the text encoder. + vision_attention_mask (`torch.BoolTensor`, **optional**): + Attention mask for image-to-text cross-attention. False for real tokens and True for padding tokens. 
+            text_attention_mask (`torch.BoolTensor`, **optional**):
+                Attention mask for text-to-image cross-attention. False for real tokens and True for padding tokens.
+
+        Returns:
+            `tuple(tuple(torch.FloatTensor), tuple(torch.FloatTensor))` where each inner tuple comprises an attention
+            output and weights:
+            - **vision_attn_output** (`torch.FloatTensor` of shape `(batch_size, vision_sequence_length, hidden_dim)`) --
+                Output of the image-to-text cross-attention layer.
+            - **vision_attn_weights** (`torch.FloatTensor` of shape `(batch_size * num_heads, vision_sequence_length,
+              text_sequence_length)`) --
+                Attention weights of the image-to-text cross-attention layer.
+            - **text_attn_output** (`torch.FloatTensor` of shape `(batch_size, text_sequence_length, hidden_dim)`) --
+                Output of the text-to-image cross-attention layer.
+            - **text_attn_weights** (`torch.FloatTensor` of shape `(batch_size * num_heads, text_sequence_length,
+              vision_sequence_length)`) --
+                Attention weights of the text-to-image cross-attention layer.
+        """
+        batch_size, tgt_len, _ = vision_features.size()
+
+        vision_query_states = self.vision_proj(vision_features) * self.scale
+        vision_query_states = self._reshape(vision_query_states, tgt_len, batch_size)
+
+        text_key_states = self.text_proj(text_features)
+        text_key_states = self._reshape(text_key_states, -1, batch_size)
+
+        vision_value_states = self.values_vision_proj(vision_features)
+        vision_value_states = self._reshape(vision_value_states, -1, batch_size)
+
+        text_value_states = self.values_text_proj(text_features)
+        text_value_states = self._reshape(text_value_states, -1, batch_size)
+
+        proj_shape = (batch_size * self.num_heads, -1, self.head_dim)
+
+        vision_query_states = vision_query_states.view(*proj_shape)
+        text_key_states = text_key_states.view(*proj_shape)
+        vision_value_states = vision_value_states.view(*proj_shape)
+        text_value_states = text_value_states.view(*proj_shape)
+
+        src_len = text_key_states.size(1)
+        attn_weights = torch.bmm(vision_query_states, text_key_states.transpose(1, 2))  # bs*nhead, nimg, ntxt
+
+        if attn_weights.size() != (batch_size * self.num_heads, tgt_len, src_len):
+            raise ValueError(
+                f"Attention weights should be of size {(batch_size * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}"
+            )
+
+        attn_weights = attn_weights - attn_weights.max()
+        # Do not widen the -50000/50000 clamp range; the half data type has quite a limited range
+        attn_weights = torch.clamp(attn_weights, min=-50000, max=50000)
+
+        attn_weights_transposed = attn_weights.transpose(1, 2)
+        text_attn_weights = attn_weights_transposed - torch.max(attn_weights_transposed, dim=-1, keepdim=True)[0]
+
+        # Do not widen the -50000/50000 clamp range; the half data type has quite a limited range
+        text_attn_weights = torch.clamp(text_attn_weights, min=-50000, max=50000)
+
+        # mask vision for language
+        if vision_attention_mask is not None:
+            vision_attention_mask = (
+                vision_attention_mask[:, None, None, :].repeat(1, self.num_heads, 1, 1).flatten(0, 1)
+            )
+            text_attn_weights.masked_fill_(vision_attention_mask, float("-inf"))
+
+        text_attn_weights = text_attn_weights.softmax(dim=-1)
+
+        # mask language for vision
+        if text_attention_mask is not None:
+            text_attention_mask = text_attention_mask[:, None, None, :].repeat(1, self.num_heads, 1, 1).flatten(0, 1)
+            attn_weights.masked_fill_(text_attention_mask, float("-inf"))
+        vision_attn_weights = attn_weights.softmax(dim=-1)
+
+        vision_attn_probs = F.dropout(vision_attn_weights, p=self.dropout, training=self.training)
+        text_attn_probs =
F.dropout(text_attn_weights, p=self.dropout, training=self.training) + + vision_attn_output = torch.bmm(vision_attn_probs, text_value_states) + text_attn_output = torch.bmm(text_attn_probs, vision_value_states) + + if vision_attn_output.size() != (batch_size * self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`vision_attn_output` should be of size {(batch_size, self.num_heads, tgt_len, self.head_dim)}, but is {vision_attn_output.size()}" + ) + + if text_attn_output.size() != (batch_size * self.num_heads, src_len, self.head_dim): + raise ValueError( + f"`text_attn_output` should be of size {(batch_size, self.num_heads, src_len, self.head_dim)}, but is {text_attn_output.size()}" + ) + + vision_attn_output = vision_attn_output.view(batch_size, self.num_heads, tgt_len, self.head_dim) + vision_attn_output = vision_attn_output.transpose(1, 2) + vision_attn_output = vision_attn_output.reshape(batch_size, tgt_len, self.embed_dim) + + text_attn_output = text_attn_output.view(batch_size, self.num_heads, src_len, self.head_dim) + text_attn_output = text_attn_output.transpose(1, 2) + text_attn_output = text_attn_output.reshape(batch_size, src_len, self.embed_dim) + + vision_attn_output = self.out_vision_proj(vision_attn_output) + text_attn_output = self.out_text_proj(text_attn_output) + + return (vision_attn_output, vision_attn_weights), (text_attn_output, text_attn_weights) + + +# Copied from transformers.models.beit.modeling_beit.drop_path +def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor: + """ + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks, + however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the + layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the + argument. 
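+
+    For example, with `drop_prob=0.1` each sample in the batch has its residual-branch output zeroed with
+    probability 0.1 during training, and the surviving samples are scaled by `1 / (1 - 0.1)` so that the expected
+    value of the output is unchanged.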
+ """ + if drop_prob == 0.0 or not training: + return input + keep_prob = 1 - drop_prob + shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device) + random_tensor.floor_() # binarize + output = input.div(keep_prob) * random_tensor + return output + + +# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->GroundingDino +class GroundingDinoDropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob: Optional[float] = None) -> None: + super().__init__() + self.drop_prob = drop_prob + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + return drop_path(hidden_states, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return f"p={self.drop_prob}" + + +class GroundingDinoFusionLayer(nn.Module): + def __init__(self, config): + super().__init__() + drop_path = config.fusion_droppath + + # pre layer norm + self.layer_norm_vision = nn.LayerNorm(config.d_model, config.layer_norm_eps) + self.layer_norm_text = nn.LayerNorm(config.d_model, config.layer_norm_eps) + self.attn = GroundingDinoBiMultiHeadAttention(config) + + # add layer scale for training stability + self.drop_path = GroundingDinoDropPath(drop_path) if drop_path > 0.0 else nn.Identity() + init_values = 1e-4 + self.vision_param = nn.Parameter(init_values * torch.ones(config.d_model), requires_grad=True) + self.text_param = nn.Parameter(init_values * torch.ones(config.d_model), requires_grad=True) + + def forward( + self, + vision_features: torch.FloatTensor, + text_features: torch.FloatTensor, + attention_mask_vision: Optional[torch.BoolTensor] = None, + attention_mask_text: Optional[torch.BoolTensor] = None, + ) -> tuple[tuple[torch.FloatTensor, torch.FloatTensor], tuple[torch.FloatTensor, torch.FloatTensor]]: + """Image and text features fusion + + Args: + vision_features (`torch.FloatTensor` of shape `(batch_size, vision_sequence_length, hidden_dim)`): + Projected flattened image features generated by the vision backbone. + text_features (`torch.FloatTensor` of shape `(batch_size, text_sequence_length, hidden_dim)`): + Projected text features generated by the text encoder. + attention_mask_vision (`torch.BoolTensor`, **optional**): + Attention mask for image-to-text cross-attention. False for real tokens and True for padding tokens. + attention_mask_text (`torch.BoolTensor`, **optional**): + Attention mask for text-to-image cross-attention. False for real tokens and True for padding tokens. + + Returns: + `tuple(tuple(torch.FloatTensor), tuple(torch.FloatTensor))` where each inner tuple comprises an enhanced + feature and attention output and weights: + - **vision_features** (`torch.FloatTensor` of shape `(batch_size, vision_sequence_length, vision_dim)`) -- + Updated vision features with attention output from image-to-text cross-attention layer. + - **vision_attn_weights** (`torch.FloatTensor` of shape `(batch_size, num_heads, vision_sequence_length, + vision_sequence_length)`) -- + Attention weights of the image-to-text cross-attention layer. + - **text_features** (`torch.FloatTensor` of shape `(batch_size, text_sequence_length, text_dim)`) -- + Updated text features with attention output from text-to-image cross-attention layer. 
+ - **text_attn_weights** (`torch.FloatTensor` of shape `(batch_size, num_heads, text_sequence_length, + text_sequence_length)`) -- + Attention weights of the text-to-image cross-attention layer. + """ + vision_features = self.layer_norm_vision(vision_features) + text_features = self.layer_norm_text(text_features) + (delta_v, vision_attn), (delta_t, text_attn) = self.attn( + vision_features, + text_features, + vision_attention_mask=attention_mask_vision, + text_attention_mask=attention_mask_text, + ) + vision_features = vision_features + self.drop_path(self.vision_param * delta_v) + text_features = text_features + self.drop_path(self.text_param * delta_t) + + return (vision_features, vision_attn), (text_features, text_attn) + + +class GroundingDinoDeformableLayer(nn.Module): + def __init__(self, config: GroundingDinoConfig): + super().__init__() + self.embed_dim = config.d_model + self.self_attn = GroundingDinoMultiscaleDeformableAttention( + config, num_heads=config.encoder_attention_heads, n_points=config.encoder_n_points + ) + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim, config.layer_norm_eps) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim) + self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim, config.layer_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + position_embeddings: Optional[torch.Tensor] = None, + reference_points=None, + spatial_shapes=None, + spatial_shapes_list=None, + level_start_index=None, + output_attentions: bool = False, + ): + """ + Args: + hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Input to the layer. + attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): + Attention mask. + position_embeddings (`torch.FloatTensor`, *optional*): + Position embeddings, to be added to `hidden_states`. + reference_points (`torch.FloatTensor`, *optional*): + Reference points. + spatial_shapes (`torch.LongTensor`, *optional*): + Spatial shapes of the backbone feature maps. + spatial_shapes_list (`list[tuple[int, int]]`, *optional*): + Spatial shapes of the backbone feature maps (but as list for export compatibility). + level_start_index (`torch.LongTensor`, *optional*): + Level start index. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + # Apply Multi-scale Deformable Attention Module on the multi-scale feature maps. 
+ hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + encoder_hidden_states=hidden_states, + encoder_attention_mask=attention_mask, + position_embeddings=position_embeddings, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + spatial_shapes_list=spatial_shapes_list, + level_start_index=level_start_index, + output_attentions=output_attentions, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + if self.training: + if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + return hidden_states, attn_weights + + +# Based on https://github.com/IDEA-Research/GroundingDINO/blob/2b62f419c292ca9c518daae55512fabc3fead4a4/groundingdino/models/GroundingDINO/utils.py#L24 +def get_sine_pos_embed( + pos_tensor: torch.Tensor, num_pos_feats: int = 128, temperature: int = 10000, exchange_xy: bool = True +) -> Tensor: + """ + Generate sine position embeddings from a position tensor. + + Args: + pos_tensor (torch.Tensor): + Tensor containing positions. Shape: [..., n]. + num_pos_feats (`int`, *optional*, defaults to 128): + Projected shape for each float in the tensor. + temperature (`int`, *optional*, defaults to 10000): + Temperature in the sine/cosine function. + exchange_xy (`bool`, *optional*, defaults to `True`): + Exchange pos x and pos y. For example, input tensor is [x,y], the results will be [pos(y), pos(x)]. + + Returns: + position_embeddings (torch.Tensor): shape: [..., n * hidden_size]. 
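+
+    Example (illustrative only; `boxes` is an arbitrary 3-d tensor of normalized coordinates):
+
+    >>> boxes = torch.rand(2, 100, 4)  # (batch_size, num_queries, 4)
+    >>> get_sine_pos_embed(boxes, num_pos_feats=128).shape  # each of the 4 floats maps to 128 features
+    torch.Size([2, 100, 512])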
+ """ + scale = 2 * math.pi + dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=pos_tensor.device) + dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats) + + def sine_func(x: torch.Tensor): + sin_x = x * scale / dim_t + sin_x = torch.stack((sin_x[..., 0::2].sin(), sin_x[..., 1::2].cos()), dim=3).flatten(2) + return sin_x + + pos_tensor = pos_tensor.split([1] * pos_tensor.shape[-1], dim=-1) + position_embeddings = [sine_func(x) for x in pos_tensor] + if exchange_xy: + position_embeddings[0], position_embeddings[1] = position_embeddings[1], position_embeddings[0] + position_embeddings = torch.cat(position_embeddings, dim=-1) + return position_embeddings + + +class GroundingDinoEncoderLayer(nn.Module): + def __init__(self, config) -> None: + super().__init__() + + self.d_model = config.d_model + + self.text_enhancer_layer = GroundingDinoTextEnhancerLayer(config) + self.fusion_layer = GroundingDinoFusionLayer(config) + self.deformable_layer = GroundingDinoDeformableLayer(config) + + def get_text_position_embeddings( + self, + text_features: Tensor, + text_position_embedding: Optional[torch.Tensor], + text_position_ids: Optional[torch.Tensor], + ) -> Tensor: + batch_size, seq_length, _ = text_features.shape + if text_position_embedding is None and text_position_ids is None: + text_position_embedding = torch.arange(seq_length, device=text_features.device) + text_position_embedding = text_position_embedding.float() + text_position_embedding = text_position_embedding.unsqueeze(0).unsqueeze(-1) + text_position_embedding = text_position_embedding.repeat(batch_size, 1, 1) + text_position_embedding = get_sine_pos_embed( + text_position_embedding, num_pos_feats=self.d_model, exchange_xy=False + ) + if text_position_ids is not None: + text_position_embedding = get_sine_pos_embed( + text_position_ids[..., None], num_pos_feats=self.d_model, exchange_xy=False + ) + + return text_position_embedding + + def forward( + self, + vision_features: Tensor, + vision_position_embedding: Tensor, + spatial_shapes: Tensor, + spatial_shapes_list: list[tuple[int, int]], + level_start_index: Tensor, + key_padding_mask: Tensor, + reference_points: Tensor, + text_features: Optional[Tensor] = None, + text_attention_mask: Optional[Tensor] = None, + text_position_embedding: Optional[Tensor] = None, + text_self_attention_masks: Optional[Tensor] = None, + text_position_ids: Optional[Tensor] = None, + ): + text_position_embedding = self.get_text_position_embeddings( + text_features, text_position_embedding, text_position_ids + ) + + (vision_features, vision_fused_attn), (text_features, text_fused_attn) = self.fusion_layer( + vision_features=vision_features, + text_features=text_features, + attention_mask_vision=key_padding_mask, + attention_mask_text=text_attention_mask, + ) + + (text_features, text_enhanced_attn) = self.text_enhancer_layer( + hidden_states=text_features, + attention_masks=~text_self_attention_masks, # note we use ~ for mask here + position_embeddings=(text_position_embedding if text_position_embedding is not None else None), + ) + + (vision_features, vision_deformable_attn) = self.deformable_layer( + hidden_states=vision_features, + attention_mask=~key_padding_mask, + position_embeddings=vision_position_embedding, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + spatial_shapes_list=spatial_shapes_list, + level_start_index=level_start_index, + ) + + return ( + (vision_features, text_features), + (vision_fused_attn, text_fused_attn, 
text_enhanced_attn, vision_deformable_attn), + ) + + +class GroundingDinoMultiheadAttention(nn.Module): + """Equivalent implementation of nn.MultiheadAttention with `batch_first=True`.""" + + def __init__(self, config, num_attention_heads=None): + super().__init__() + if config.hidden_size % num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({num_attention_heads})" + ) + + self.num_attention_heads = num_attention_heads + self.attention_head_size = int(config.hidden_size / num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.out_proj = nn.Linear(config.hidden_size, config.hidden_size) + + self.dropout = nn.Dropout(config.attention_dropout) + + def forward( + self, + queries: torch.Tensor, + keys: torch.Tensor, + values: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + ) -> tuple[torch.Tensor]: + batch_size, seq_length, _ = queries.shape + query_layer = ( + self.query(queries) + .view(batch_size, -1, self.num_attention_heads, self.attention_head_size) + .transpose(1, 2) + ) + key_layer = ( + self.key(keys).view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose(1, 2) + ) + value_layer = ( + self.value(values).view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose(1, 2) + ) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in GroundingDinoModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs = self.dropout(attention_probs) + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + context_layer = self.out_proj(context_layer) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + +class GroundingDinoDecoderLayer(nn.Module): + def __init__(self, config: GroundingDinoConfig): + super().__init__() + self.embed_dim = config.d_model + + # self-attention + self.self_attn = GroundingDinoMultiheadAttention(config, num_attention_heads=config.decoder_attention_heads) + + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim, config.layer_norm_eps) + # cross-attention text + self.encoder_attn_text = GroundingDinoMultiheadAttention( + config, num_attention_heads=config.decoder_attention_heads + ) + self.encoder_attn_text_layer_norm = nn.LayerNorm(self.embed_dim, config.layer_norm_eps) + # cross-attention + self.encoder_attn = GroundingDinoMultiscaleDeformableAttention( + config, + num_heads=config.decoder_attention_heads, + n_points=config.decoder_n_points, + ) + self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim, config.layer_norm_eps) + # feedforward neural networks + self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) + self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim, config.layer_norm_eps) + + def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]): + return tensor if position_embeddings is None else tensor + position_embeddings + + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: Optional[torch.Tensor] = None, + reference_points=None, + spatial_shapes=None, + spatial_shapes_list=None, + level_start_index=None, + vision_encoder_hidden_states: Optional[torch.Tensor] = None, + vision_encoder_attention_mask: Optional[torch.Tensor] = None, + text_encoder_hidden_states: Optional[torch.Tensor] = None, + text_encoder_attention_mask: Optional[torch.Tensor] = None, + self_attn_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ): + residual = hidden_states + + # Self Attention + queries = keys = self.with_pos_embed(hidden_states, position_embeddings) + hidden_states, self_attn_weights = self.self_attn( + queries=queries, + keys=keys, + values=hidden_states, + attention_mask=self_attn_mask, + output_attentions=True, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + second_residual = hidden_states + + # Cross-Attention Text + queries = self.with_pos_embed(hidden_states, position_embeddings) + hidden_states, text_cross_attn_weights = self.encoder_attn_text( + queries=queries, + keys=text_encoder_hidden_states, + values=text_encoder_hidden_states, + attention_mask=text_encoder_attention_mask, + output_attentions=True, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = second_residual + hidden_states + hidden_states = self.encoder_attn_text_layer_norm(hidden_states) + + 
third_residual = hidden_states + + # Cross-Attention + cross_attn_weights = None + hidden_states, cross_attn_weights = self.encoder_attn( + hidden_states=hidden_states, + attention_mask=vision_encoder_attention_mask, + encoder_hidden_states=vision_encoder_hidden_states, + encoder_attention_mask=vision_encoder_attention_mask, + position_embeddings=position_embeddings, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + spatial_shapes_list=spatial_shapes_list, + level_start_index=level_start_index, + output_attentions=output_attentions, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = third_residual + hidden_states + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + # Fully Connected + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights, text_cross_attn_weights, cross_attn_weights) + + return outputs + + +class GroundingDinoContrastiveEmbedding(nn.Module): + def __init__(self, config): + super().__init__() + self.max_text_len = config.max_text_len + + def forward( + self, + vision_hidden_state: torch.FloatTensor, + text_hidden_state: torch.FloatTensor, + text_token_mask: torch.BoolTensor, + ) -> torch.FloatTensor: + output = vision_hidden_state @ text_hidden_state.transpose(-1, -2) + output = output.masked_fill(~text_token_mask[:, None, :], float("-inf")) + + # padding to max_text_len + new_output = torch.full((*output.shape[:-1], self.max_text_len), float("-inf"), device=output.device) + new_output[..., : output.shape[-1]] = output + + return new_output + + +@auto_docstring +class GroundingDinoPreTrainedModel(PreTrainedModel): + config: GroundingDinoConfig + base_model_prefix = "model" + main_input_name = "pixel_values" + + def _init_weights(self, module): + std = self.config.init_std + + if isinstance(module, GroundingDinoLearnedPositionEmbedding): + nn.init.uniform_(module.row_embeddings.weight) + nn.init.uniform_(module.column_embeddings.weight) + elif isinstance(module, GroundingDinoMultiscaleDeformableAttention): + nn.init.constant_(module.sampling_offsets.weight.data, 0.0) + default_dtype = torch.get_default_dtype() + thetas = torch.arange(module.n_heads, dtype=torch.int64).to(default_dtype) * ( + 2.0 * math.pi / module.n_heads + ) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = ( + (grid_init / grid_init.abs().max(-1, keepdim=True)[0]) + .view(module.n_heads, 1, 1, 2) + .repeat(1, module.n_levels, module.n_points, 1) + ) + for i in range(module.n_points): + grid_init[:, :, i, :] *= i + 1 + with torch.no_grad(): + module.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) + nn.init.constant_(module.attention_weights.weight.data, 0.0) + nn.init.constant_(module.attention_weights.bias.data, 0.0) + nn.init.xavier_uniform_(module.value_proj.weight.data) + nn.init.constant_(module.value_proj.bias.data, 0.0) + nn.init.xavier_uniform_(module.output_proj.weight.data) + nn.init.constant_(module.output_proj.bias.data, 0.0) + elif isinstance(module, GroundingDinoBiMultiHeadAttention): + 
nn.init.xavier_uniform_(module.vision_proj.weight) + module.vision_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(module.text_proj.weight) + module.text_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(module.values_vision_proj.weight) + module.values_vision_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(module.values_text_proj.weight) + module.values_text_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(module.out_vision_proj.weight) + module.out_vision_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(module.out_text_proj.weight) + module.out_text_proj.bias.data.fill_(0) + elif isinstance(module, GroundingDinoFusionLayer): + module.vision_param.data.fill_(1e-4) + module.text_param.data.fill_(1e-4) + elif isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)): + module.weight.data.fill_(1.0) + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, GroundingDinoMLPPredictionHead): + nn.init.constant_(module.layers[-1].weight.data, 0) + nn.init.constant_(module.layers[-1].bias.data, 0) + + if hasattr(module, "reference_points") and not self.config.two_stage: + nn.init.xavier_uniform_(module.reference_points.weight.data, gain=1.0) + nn.init.constant_(module.reference_points.bias.data, 0.0) + if hasattr(module, "level_embed"): + nn.init.normal_(module.level_embed) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, GroundingDinoDecoder): + module.gradient_checkpointing = value + + +class GroundingDinoEncoder(GroundingDinoPreTrainedModel): + """ + Transformer encoder consisting of *config.encoder_layers* deformable attention layers. Each layer is a + [`GroundingDinoEncoderLayer`]. + + The encoder updates the flattened multi-scale feature maps through multiple deformable attention layers. + + Args: + config: GroundingDinoConfig + """ + + def __init__(self, config: GroundingDinoConfig): + super().__init__(config) + + self.dropout = config.dropout + self.layers = nn.ModuleList([GroundingDinoEncoderLayer(config) for _ in range(config.encoder_layers)]) + + # Initialize weights and apply final processing + self.post_init() + + @staticmethod + def get_reference_points(spatial_shapes, valid_ratios, device): + """ + Get reference points for each feature map. + + Args: + spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`): + Spatial shapes of each feature map. + valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`): + Valid ratios of each feature map. + device (`torch.device`): + Device on which to create the tensors. + Returns: + `torch.FloatTensor` of shape `(batch_size, num_queries, num_feature_levels, 2)` + """ + reference_points_list = [] + for level, (height, width) in enumerate(spatial_shapes): + ref_y, ref_x = meshgrid( + torch.linspace(0.5, height - 0.5, height, dtype=torch.float32, device=device), + torch.linspace(0.5, width - 0.5, width, dtype=torch.float32, device=device), + indexing="ij", + ) + # TODO: valid_ratios could be useless here. 
check https://github.com/fundamentalvision/Deformable-DETR/issues/36 + ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, level, 1] * height) + ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, level, 0] * width) + ref = torch.stack((ref_x, ref_y), -1) + reference_points_list.append(ref) + reference_points = torch.cat(reference_points_list, 1) + reference_points = reference_points[:, :, None] * valid_ratios[:, None] + return reference_points + + def forward( + self, + vision_features: Tensor, + vision_attention_mask: Tensor, + vision_position_embedding: Tensor, + spatial_shapes: Tensor, + spatial_shapes_list: list[tuple[int, int]], + level_start_index: Tensor, + valid_ratios=None, + text_features: Optional[Tensor] = None, + text_attention_mask: Optional[Tensor] = None, + text_position_embedding: Optional[Tensor] = None, + text_self_attention_masks: Optional[Tensor] = None, + text_position_ids: Optional[Tensor] = None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + vision_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Flattened feature map (output of the backbone + projection layer) that is passed to the encoder. + vision_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding pixel features. Mask values selected in `[0, 1]`: + - 0 for pixel features that are real (i.e. **not masked**), + - 1 for pixel features that are padding (i.e. **masked**). + [What are attention masks?](../glossary#attention-mask) + vision_position_embedding (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Position embeddings that are added to the queries and keys in each self-attention layer. + spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`): + Spatial shapes of each feature map. + spatial_shapes_list (`list[tuple[int, int]]`): + Spatial shapes of each feature map (but as list for export compatibility). + level_start_index (`torch.LongTensor` of shape `(num_feature_levels)`): + Starting index of each feature map. + valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`): + Ratio of valid area in each feature level. + text_features (`torch.FloatTensor` of shape `(batch_size, text_seq_len, hidden_size)`): + Flattened text features that are passed to the encoder. + text_attention_mask (`torch.Tensor` of shape `(batch_size, text_seq_len)`, *optional*): + Mask to avoid performing attention on padding text features. Mask values selected in `[0, 1]`: + - 0 for text features that are real (i.e. **not masked**), + - 1 for text features that are padding (i.e. **masked**). + [What are attention masks?](../glossary#attention-mask) + text_position_embedding (`torch.FloatTensor` of shape `(batch_size, text_seq_len)`): + Position embeddings that are added to the queries and keys in each self-attention layer. + text_self_attention_masks (`torch.BoolTensor` of shape `(batch_size, text_seq_len, text_seq_len)`): + Masks to avoid performing attention between padding text features. Mask values selected in `[0, 1]`: + - 1 for text features that are real (i.e. **not masked**), + - 0 for text features that are padding (i.e. **masked**). + text_position_ids (`torch.LongTensor` of shape `(batch_size, num_queries)`): + Position ids for text features. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. 
See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=vision_features.device) + + encoder_vision_states = () if output_hidden_states else None + encoder_text_states = () if output_hidden_states else None + all_attns = () if output_attentions else None + all_attn_fused_text = () if output_attentions else None + all_attn_fused_vision = () if output_attentions else None + all_attn_enhanced_text = () if output_attentions else None + all_attn_deformable = () if output_attentions else None + for i, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_vision_states += (vision_features,) + encoder_text_states += (text_features,) + + (vision_features, text_features), attentions = encoder_layer( + vision_features=vision_features, + vision_position_embedding=vision_position_embedding, + spatial_shapes=spatial_shapes, + spatial_shapes_list=spatial_shapes_list, + level_start_index=level_start_index, + key_padding_mask=vision_attention_mask, + reference_points=reference_points, + text_features=text_features, + text_attention_mask=text_attention_mask, + text_position_embedding=text_position_embedding, + text_self_attention_masks=text_self_attention_masks, + text_position_ids=text_position_ids, + ) + + if output_attentions: + all_attn_fused_vision += (attentions[0],) + all_attn_fused_text += (attentions[1],) + all_attn_enhanced_text += (attentions[2],) + all_attn_deformable += (attentions[3],) + + if output_hidden_states: + encoder_vision_states += (vision_features,) + encoder_text_states += (text_features,) + + if output_attentions: + all_attns = (all_attn_fused_vision, all_attn_fused_text, all_attn_enhanced_text, all_attn_deformable) + + if not return_dict: + enc_outputs = [vision_features, text_features, encoder_vision_states, encoder_text_states, all_attns] + return tuple(v for v in enc_outputs if v is not None) + return GroundingDinoEncoderOutput( + last_hidden_state_vision=vision_features, + last_hidden_state_text=text_features, + vision_hidden_states=encoder_vision_states, + text_hidden_states=encoder_text_states, + attentions=all_attns, + ) + + +class GroundingDinoDecoder(GroundingDinoPreTrainedModel): + """ + Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`GroundingDinoDecoderLayer`]. + + The decoder updates the query embeddings through multiple self-attention and cross-attention layers. + + Some tweaks for Grounding DINO: + + - `position_embeddings`, `reference_points`, `spatial_shapes` and `valid_ratios` are added to the forward pass. + - it also returns a stack of intermediate outputs and reference points from all decoding layers. 
+ + Args: + config: GroundingDinoConfig + """ + + def __init__(self, config: GroundingDinoConfig): + super().__init__(config) + + self.dropout = config.dropout + self.layer_norm = nn.LayerNorm(config.d_model, config.layer_norm_eps) + self.layers = nn.ModuleList([GroundingDinoDecoderLayer(config) for _ in range(config.decoder_layers)]) + self.reference_points_head = GroundingDinoMLPPredictionHead( + config.query_dim // 2 * config.d_model, config.d_model, config.d_model, 2 + ) + self.gradient_checkpointing = False + + # hack implementation for iterative bounding box refinement as in two-stage Deformable DETR + self.bbox_embed = None + self.class_embed = None + self.query_scale = None + + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + inputs_embeds, + vision_encoder_hidden_states, + vision_encoder_attention_mask=None, + text_encoder_hidden_states=None, + text_encoder_attention_mask=None, + reference_points=None, + spatial_shapes=None, + spatial_shapes_list=None, + level_start_index=None, + valid_ratios=None, + self_attn_mask=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`): + The query embeddings that are passed into the decoder. + vision_encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Last hidden state from encoder related to vision feature map. + vision_encoder_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding pixel features. Mask values selected in `[0, 1]`: + - 1 for pixel features that are real (i.e. **not masked**), + - 0 for pixel features that are padding (i.e. **masked**). + text_encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, text_seq_len, hidden_size)`): + Last hidden state from encoder related to text features. + text_encoder_attention_mask (`torch.Tensor` of shape `(batch_size, text_seq_len)`, *optional*): + Mask to avoid performing attention on padding text features. Mask values selected in `[0, 1]`: + - 0 for text features that are real (i.e. **not masked**), + - 1 for text features that are padding (i.e. **masked**). + reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)` is `as_two_stage` else `(batch_size, num_queries, 2)` or , *optional*): + Reference point in range `[0, 1]`, top-left (0,0), bottom-right (1, 1), including padding area. + spatial_shapes (`torch.FloatTensor` of shape `(num_feature_levels, 2)`): + Spatial shapes of the feature maps. + spatial_shapes_list (`list[tuple[int, int]]`): + Spatial shapes of the feature maps (but as list for export compatibility). + level_start_index (`torch.LongTensor` of shape `(num_feature_levels)`, *optional*): + Indexes for the start of each feature level. In range `[0, sequence_length]`. + valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`, *optional*): + Ratio of valid area in each feature level. + self_attn_mask (`torch.BoolTensor` of shape `(batch_size, text_seq_len)`): + Masks to avoid performing self-attention between vision hidden state. Mask values selected in `[0, 1]`: + - 1 for queries that are real (i.e. **not masked**), + - 0 for queries that are padding (i.e. **masked**). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. 
See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if inputs_embeds is not None: + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_attns = () if output_attentions else None + all_cross_attns_vision = () if (output_attentions and vision_encoder_hidden_states is not None) else None + all_cross_attns_text = () if (output_attentions and text_encoder_hidden_states is not None) else None + intermediate = () + intermediate_reference_points = () + + if text_encoder_attention_mask is not None: + dtype = text_encoder_hidden_states.dtype + + text_encoder_attention_mask = text_encoder_attention_mask[:, None, None, :] + text_encoder_attention_mask = text_encoder_attention_mask.repeat( + 1, self.config.decoder_attention_heads, self.config.num_queries, 1 + ) + text_encoder_attention_mask = text_encoder_attention_mask.to(dtype=dtype) + text_encoder_attention_mask = text_encoder_attention_mask * torch.finfo(dtype).min + + for idx, decoder_layer in enumerate(self.layers): + num_coordinates = reference_points.shape[-1] + if num_coordinates == 4: + reference_points_input = ( + reference_points[:, :, None] * torch.cat([valid_ratios, valid_ratios], -1)[:, None] + ) + elif num_coordinates == 2: + reference_points_input = reference_points[:, :, None] * valid_ratios[:, None] + else: + raise ValueError("Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}") + query_pos = get_sine_pos_embed(reference_points_input[:, :, 0, :], num_pos_feats=self.config.d_model // 2) + query_pos = self.reference_points_head(query_pos) + + # In original implementation they apply layer norm before outputting intermediate hidden states + # Though that's not through between layers so the layers use as input the output of the previous layer + # without layer norm + if output_hidden_states: + all_hidden_states += (self.layer_norm(hidden_states),) + + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(decoder_layer), + hidden_states, + query_pos, + reference_points_input, + spatial_shapes, + level_start_index, + vision_encoder_hidden_states, + vision_encoder_attention_mask, + text_encoder_hidden_states, + text_encoder_attention_mask, + self_attn_mask, + None, + ) + else: + layer_outputs = decoder_layer( + hidden_states=hidden_states, + position_embeddings=query_pos, + reference_points=reference_points_input, + spatial_shapes=spatial_shapes, + spatial_shapes_list=spatial_shapes_list, + level_start_index=level_start_index, + vision_encoder_hidden_states=vision_encoder_hidden_states, + vision_encoder_attention_mask=vision_encoder_attention_mask, + 
text_encoder_hidden_states=text_encoder_hidden_states, + text_encoder_attention_mask=text_encoder_attention_mask, + self_attn_mask=self_attn_mask, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + # hack implementation for iterative bounding box refinement + if self.bbox_embed is not None: + tmp = self.bbox_embed[idx](hidden_states) + num_coordinates = reference_points.shape[-1] + if num_coordinates == 4: + new_reference_points = tmp + torch.special.logit(reference_points, eps=1e-5) + new_reference_points = new_reference_points.sigmoid() + elif num_coordinates == 2: + new_reference_points = tmp + new_reference_points[..., :2] = tmp[..., :2] + torch.special.logit(reference_points, eps=1e-5) + new_reference_points = new_reference_points.sigmoid() + else: + raise ValueError( + f"Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}" + ) + reference_points = new_reference_points.detach() + + intermediate += (self.layer_norm(hidden_states),) + intermediate_reference_points += (reference_points,) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if text_encoder_hidden_states is not None: + all_cross_attns_text += (layer_outputs[2],) + + if vision_encoder_hidden_states is not None: + all_cross_attns_vision += (layer_outputs[3],) + + # Keep batch_size as first dimension + intermediate = torch.stack(intermediate, dim=1) + intermediate_reference_points = torch.stack(intermediate_reference_points, dim=1) + hidden_states = self.layer_norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if output_attentions: + all_attns += (all_self_attns, all_cross_attns_text, all_cross_attns_vision) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + intermediate, + intermediate_reference_points, + all_hidden_states, + all_attns, + ] + if v is not None + ) + return GroundingDinoDecoderOutput( + last_hidden_state=hidden_states, + intermediate_hidden_states=intermediate, + intermediate_reference_points=intermediate_reference_points, + hidden_states=all_hidden_states, + attentions=all_attns, + ) + + +# these correspond to [CLS], [SEP], . and ? +SPECIAL_TOKENS = [101, 102, 1012, 1029] + + +def generate_masks_with_special_tokens_and_transfer_map(input_ids: torch.LongTensor) -> tuple[Tensor, Tensor]: + """Generate attention mask between each pair of special tokens and positional ids. + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + Returns: + `tuple(torch.Tensor)` comprising attention mask between each special tokens and position_ids: + - **attention_mask** (`torch.BoolTensor` of shape `(batch_size, sequence_length, sequence_length)`) + - **position_ids** (`torch.LongTensor` of shape `(batch_size, sequence_length)`) + """ + batch_size, num_token = input_ids.shape + # special_tokens_mask: batch_size, num_token. 1 for special tokens. 
0 for normal tokens + special_tokens_mask = torch.zeros((batch_size, num_token), device=input_ids.device).bool() + for special_token in SPECIAL_TOKENS: + special_tokens_mask = torch.logical_or(special_tokens_mask, input_ids == special_token) + + # idxs: each row is a list of indices of special tokens + idxs = torch.nonzero(special_tokens_mask) + + # generate attention mask and positional ids + attention_mask = torch.eye(num_token, device=input_ids.device).bool().unsqueeze(0).repeat(batch_size, 1, 1) + position_ids = torch.zeros((batch_size, num_token), device=input_ids.device) + previous_col = 0 + for i in range(idxs.shape[0]): + row, col = idxs[i] + if (col == 0) or (col == num_token - 1): + attention_mask[row, col, col] = True + position_ids[row, col] = 0 + else: + attention_mask[row, previous_col + 1 : col + 1, previous_col + 1 : col + 1] = True + position_ids[row, previous_col + 1 : col + 1] = torch.arange( + 0, col - previous_col, device=input_ids.device + ) + + previous_col = col + + return attention_mask, position_ids.to(torch.long) + + +@auto_docstring( + custom_intro=""" + The bare Grounding DINO Model (consisting of a backbone and encoder-decoder Transformer) outputting raw + hidden-states without any specific head on top. + """ +) +class GroundingDinoModel(GroundingDinoPreTrainedModel): + def __init__(self, config: GroundingDinoConfig): + super().__init__(config) + + # Create backbone + positional encoding + backbone = GroundingDinoConvEncoder(config) + position_embeddings = build_position_encoding(config) + self.backbone = GroundingDinoConvModel(backbone, position_embeddings) + + # Create input projection layers + if config.num_feature_levels > 1: + num_backbone_outs = len(backbone.intermediate_channel_sizes) + input_proj_list = [] + for i in range(num_backbone_outs): + in_channels = backbone.intermediate_channel_sizes[i] + input_proj_list.append( + nn.Sequential( + nn.Conv2d(in_channels, config.d_model, kernel_size=1), + nn.GroupNorm(32, config.d_model), + ) + ) + for _ in range(config.num_feature_levels - num_backbone_outs): + input_proj_list.append( + nn.Sequential( + nn.Conv2d(in_channels, config.d_model, kernel_size=3, stride=2, padding=1), + nn.GroupNorm(32, config.d_model), + ) + ) + in_channels = config.d_model + self.input_proj_vision = nn.ModuleList(input_proj_list) + else: + self.input_proj_vision = nn.ModuleList( + [ + nn.Sequential( + nn.Conv2d(backbone.intermediate_channel_sizes[-1], config.d_model, kernel_size=1), + nn.GroupNorm(32, config.d_model), + ) + ] + ) + + # Create text backbone + self.text_backbone = AutoModel.from_config(config.text_config, add_pooling_layer=False) + self.text_projection = nn.Linear(config.text_config.hidden_size, config.d_model) + + if config.embedding_init_target or not config.two_stage: + self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model) + + self.encoder = GroundingDinoEncoder(config) + self.decoder = GroundingDinoDecoder(config) + + self.level_embed = nn.Parameter(torch.Tensor(config.num_feature_levels, config.d_model)) + + if config.two_stage: + self.enc_output = nn.Linear(config.d_model, config.d_model) + self.enc_output_norm = nn.LayerNorm(config.d_model, config.layer_norm_eps) + if ( + config.two_stage_bbox_embed_share + and config.decoder_bbox_embed_share + and self.decoder.bbox_embed is not None + ): + self.encoder_output_bbox_embed = self.decoder.bbox_embed + else: + self.encoder_output_bbox_embed = GroundingDinoMLPPredictionHead( + input_dim=config.d_model, hidden_dim=config.d_model, 
output_dim=4, num_layers=3 + ) + + self.encoder_output_class_embed = GroundingDinoContrastiveEmbedding(config) + else: + self.reference_points = nn.Embedding(config.num_queries, 4) + + self.post_init() + + def freeze_backbone(self): + for name, param in self.backbone.conv_encoder.model.named_parameters(): + param.requires_grad_(False) + + def unfreeze_backbone(self): + for name, param in self.backbone.conv_encoder.model.named_parameters(): + param.requires_grad_(True) + + def get_valid_ratio(self, mask): + """Get the valid ratio of all feature maps.""" + + _, height, width = mask.shape + valid_height = torch.sum(mask[:, :, 0], 1) + valid_width = torch.sum(mask[:, 0, :], 1) + valid_ratio_height = valid_height.float() / height + valid_ratio_width = valid_width.float() / width + valid_ratio = torch.stack([valid_ratio_width, valid_ratio_height], -1) + return valid_ratio + + def generate_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes): + """Generate the encoder output proposals from encoded enc_output. + + Args: + enc_output (`torch.Tensor[batch_size, sequence_length, hidden_size]`): Output of the encoder. + padding_mask (`torch.Tensor[batch_size, sequence_length]`): Padding mask for `enc_output`. + spatial_shapes (`torch.Tensor[num_feature_levels, 2]`): Spatial shapes of the feature maps. + + Returns: + `tuple(torch.FloatTensor)`: A tuple of feature map and bbox prediction. + - object_query (Tensor[batch_size, sequence_length, hidden_size]): Object query features. Later used to + directly predict a bounding box. (without the need of a decoder) + - output_proposals (Tensor[batch_size, sequence_length, 4]): Normalized proposals, after an inverse + sigmoid. + """ + batch_size = enc_output.shape[0] + proposals = [] + current_position = 0 + for level, (height, width) in enumerate(spatial_shapes): + mask_flatten_ = padding_mask[:, current_position : (current_position + height * width)] + mask_flatten_ = mask_flatten_.view(batch_size, height, width, 1) + valid_height = torch.sum(~mask_flatten_[:, :, 0, 0], 1) + valid_width = torch.sum(~mask_flatten_[:, 0, :, 0], 1) + + grid_y, grid_x = meshgrid( + torch.linspace(0, height - 1, height, dtype=torch.float32, device=enc_output.device), + torch.linspace(0, width - 1, width, dtype=torch.float32, device=enc_output.device), + indexing="ij", + ) + grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1) + + scale = torch.cat([valid_width.unsqueeze(-1), valid_height.unsqueeze(-1)], 1).view(batch_size, 1, 1, 2) + grid = (grid.unsqueeze(0).expand(batch_size, -1, -1, -1) + 0.5) / scale + width_height = torch.ones_like(grid) * 0.05 * (2.0**level) + proposal = torch.cat((grid, width_height), -1).view(batch_size, -1, 4) + proposals.append(proposal) + current_position += height * width + + output_proposals = torch.cat(proposals, 1) + output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True) + output_proposals = torch.log(output_proposals / (1 - output_proposals)) # inverse sigmoid + output_proposals = output_proposals.masked_fill(padding_mask.unsqueeze(-1), float("inf")) + output_proposals = output_proposals.masked_fill(~output_proposals_valid, float("inf")) + + # assign each pixel as an object query + object_query = enc_output + object_query = object_query.masked_fill(padding_mask.unsqueeze(-1), float(0)) + object_query = object_query.masked_fill(~output_proposals_valid, float(0)) + object_query = self.enc_output_norm(self.enc_output(object_query)) + return object_query, output_proposals 
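# A minimal sketch of the inverse-sigmoid encoding used for `output_proposals` above
# (tensor values below are made up purely for illustration): the proposals are stored in
# logit space, so a predicted delta can be added and sigmoid() applied later to bring the
# coordinates back into the normalized [0, 1] range.
import torch

proposals = torch.tensor([[0.25, 0.40, 0.05, 0.05]])      # (cx, cy, w, h), normalized to [0, 1]
proposal_logits = torch.log(proposals / (1 - proposals))  # inverse sigmoid, as in the code above
delta = torch.tensor([[0.10, -0.20, 0.00, 0.30]])         # e.g. output of a bbox regression head
refined_boxes = (proposal_logits + delta).sigmoid()       # refined proposal, back in [0, 1]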
+ + @auto_docstring + def forward( + self, + pixel_values: Tensor, + input_ids: Tensor, + token_type_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + pixel_mask: Optional[Tensor] = None, + encoder_outputs=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + input_ids (`torch.LongTensor` of shape `(batch_size, text_sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`BertTokenizer.__call__`] for details. + token_type_ids (`torch.LongTensor` of shape `(batch_size, text_sequence_length)`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: 0 corresponds to a `sentence A` token, 1 corresponds to a `sentence B` token + + [What are token type IDs?](../glossary#token-type-ids) + + Examples: + + ```python + >>> from transformers import AutoProcessor, AutoModel + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> text = "a cat." + + >>> processor = AutoProcessor.from_pretrained("IDEA-Research/grounding-dino-tiny") + >>> model = AutoModel.from_pretrained("IDEA-Research/grounding-dino-tiny") + + >>> inputs = processor(images=image, text=text, return_tensors="pt") + >>> outputs = model(**inputs) + + >>> last_hidden_states = outputs.last_hidden_state + >>> list(last_hidden_states.shape) + [1, 900, 256] + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + text_self_attention_masks, position_ids = generate_masks_with_special_tokens_and_transfer_map(input_ids) + + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) + + if token_type_ids is None: + token_type_ids = torch.zeros_like(input_ids) + + text_token_mask = attention_mask.bool() # just to avoid renaming everywhere + + max_text_len = self.config.max_text_len + if text_self_attention_masks.shape[1] > max_text_len: + text_self_attention_masks = text_self_attention_masks[:, :max_text_len, :max_text_len] + position_ids = position_ids[:, :max_text_len] + input_ids = input_ids[:, :max_text_len] + token_type_ids = token_type_ids[:, :max_text_len] + text_token_mask = text_token_mask[:, :max_text_len] + + # Extract text features from text backbone + text_outputs = self.text_backbone( + input_ids, text_self_attention_masks, token_type_ids, position_ids, return_dict=return_dict + ) + text_features = text_outputs.last_hidden_state if return_dict else text_outputs[0] + text_features = self.text_projection(text_features) + + batch_size, num_channels, height, width = pixel_values.shape + device = pixel_values.device + + if pixel_mask is None: + pixel_mask = torch.ones(((batch_size, height, width)), dtype=torch.long, device=device) + + # Extract multi-scale feature maps of same resolution `config.d_model` (cf Figure 4 in paper) + # First, sent pixel_values + pixel_mask through Backbone to obtain the features + # which is a list of tuples + vision_features, position_embeddings_list = self.backbone(pixel_values, pixel_mask) + + # Then, apply 1x1 convolution to 
reduce the channel dimension to d_model (256 by default) + feature_maps = [] + masks = [] + for level, (source, mask) in enumerate(vision_features): + feature_maps.append(self.input_proj_vision[level](source)) + masks.append(mask) + + # Lowest resolution feature maps are obtained via 3x3 stride 2 convolutions on the final stage + if self.config.num_feature_levels > len(feature_maps): + _len_sources = len(feature_maps) + for level in range(_len_sources, self.config.num_feature_levels): + if level == _len_sources: + source = self.input_proj_vision[level](vision_features[-1][0]) + else: + source = self.input_proj_vision[level](feature_maps[-1]) + mask = nn.functional.interpolate(pixel_mask[None].float(), size=source.shape[-2:]).to(torch.bool)[0] + pos_l = self.backbone.position_embedding(source, mask).to(source.dtype) + feature_maps.append(source) + masks.append(mask) + position_embeddings_list.append(pos_l) + + # Create queries + query_embeds = None + if self.config.embedding_init_target or self.config.two_stage: + query_embeds = self.query_position_embeddings.weight + + # Prepare encoder inputs (by flattening) + source_flatten = [] + mask_flatten = [] + lvl_pos_embed_flatten = [] + spatial_shapes_list = [] + for level, (source, mask, pos_embed) in enumerate(zip(feature_maps, masks, position_embeddings_list)): + batch_size, num_channels, height, width = source.shape + spatial_shape = (height, width) + spatial_shapes_list.append(spatial_shape) + source = source.flatten(2).transpose(1, 2) + mask = mask.flatten(1) + pos_embed = pos_embed.flatten(2).transpose(1, 2) + lvl_pos_embed = pos_embed + self.level_embed[level].view(1, 1, -1) + lvl_pos_embed_flatten.append(lvl_pos_embed) + source_flatten.append(source) + mask_flatten.append(mask) + source_flatten = torch.cat(source_flatten, 1) + mask_flatten = torch.cat(mask_flatten, 1) + lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1) + spatial_shapes = torch.as_tensor(spatial_shapes_list, dtype=torch.long, device=source_flatten.device) + level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1])) + valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1) + valid_ratios = valid_ratios.float() + + # Fourth, sent source_flatten + mask_flatten + lvl_pos_embed_flatten (backbone + proj layer output) through encoder + # Also provide spatial_shapes, level_start_index and valid_ratios + if encoder_outputs is None: + encoder_outputs = self.encoder( + vision_features=source_flatten, + vision_attention_mask=~mask_flatten, + vision_position_embedding=lvl_pos_embed_flatten, + spatial_shapes=spatial_shapes, + spatial_shapes_list=spatial_shapes_list, + level_start_index=level_start_index, + valid_ratios=valid_ratios, + text_features=text_features, + text_attention_mask=~text_token_mask, + text_position_embedding=None, + text_self_attention_masks=~text_self_attention_masks, + text_position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a GroundingDinoEncoderOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, GroundingDinoEncoderOutput): + encoder_outputs = GroundingDinoEncoderOutput( + last_hidden_state_vision=encoder_outputs[0], + last_hidden_state_text=encoder_outputs[1], + vision_hidden_states=encoder_outputs[2] if output_hidden_states else None, + text_hidden_states=encoder_outputs[3] if output_hidden_states else 
None, + attentions=encoder_outputs[-1] if output_attentions else None, + ) + + # Fifth, prepare decoder inputs + topk_proposals = None + enc_outputs_class = None + enc_outputs_coord_logits = None + encoder_logits = None + encoder_pred_boxes = None + if self.config.two_stage: + object_query_embedding, output_proposals = self.generate_encoder_output_proposals( + encoder_outputs[0], ~mask_flatten, spatial_shapes + ) + + # hack implementation as in two-stage Deformable DETR + # apply a detection head to each pixel (A.4 in paper) + # linear projection for bounding box binary classification (i.e. foreground and background) + enc_outputs_class = self.encoder_output_class_embed( + object_query_embedding, encoder_outputs[1], text_token_mask + ) + # 3-layer FFN to predict bounding boxes coordinates (bbox regression branch) + delta_bbox = self.encoder_output_bbox_embed(object_query_embedding) + enc_outputs_coord_logits = delta_bbox + output_proposals + + # only keep top scoring `config.num_queries` proposals + topk = self.config.num_queries + topk_logits = enc_outputs_class.max(-1)[0] + topk_proposals = torch.topk(topk_logits, topk, dim=1)[1] + topk_coords_logits = torch.gather( + enc_outputs_coord_logits, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4) + ) + + topk_coords_logits = topk_coords_logits.detach() + reference_points = topk_coords_logits.sigmoid() + init_reference_points = reference_points + if query_embeds is not None: + target = query_embeds.unsqueeze(0).repeat(batch_size, 1, 1) + else: + target = torch.gather( + object_query_embedding, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, self.d_model) + ).detach() + + # Set intermediate topk proposals (coords and class) for loss computation + encoder_pred_boxes = reference_points + encoder_logits = self.encoder_output_class_embed(target, text_features, text_token_mask) + else: + target = query_embeds.unsqueeze(0).repeat(batch_size, 1, 1) + reference_points = self.reference_points.weight.unsqueeze(0).repeat(batch_size, 1, 1).sigmoid() + init_reference_points = reference_points + + decoder_outputs = self.decoder( + inputs_embeds=target, + vision_encoder_hidden_states=encoder_outputs[0], + vision_encoder_attention_mask=mask_flatten, + text_encoder_hidden_states=encoder_outputs[1], + text_encoder_attention_mask=~text_token_mask, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + spatial_shapes_list=spatial_shapes_list, + level_start_index=level_start_index, + valid_ratios=valid_ratios, + self_attn_mask=None, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + enc_outputs = tuple( + value + for value in [ + enc_outputs_class, + enc_outputs_coord_logits, + encoder_logits, + encoder_pred_boxes, + ] + if value is not None + ) + tuple_outputs = ( + (decoder_outputs[0], init_reference_points) + decoder_outputs[1:] + encoder_outputs + enc_outputs + ) + + return tuple_outputs + + return GroundingDinoModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + init_reference_points=init_reference_points, + intermediate_hidden_states=decoder_outputs.intermediate_hidden_states, + intermediate_reference_points=decoder_outputs.intermediate_reference_points, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + encoder_last_hidden_state_vision=encoder_outputs.last_hidden_state_vision, + encoder_last_hidden_state_text=encoder_outputs.last_hidden_state_text, + 
encoder_vision_hidden_states=encoder_outputs.vision_hidden_states, + encoder_text_hidden_states=encoder_outputs.text_hidden_states, + encoder_attentions=encoder_outputs.attentions, + enc_outputs_class=enc_outputs_class, + enc_outputs_coord_logits=enc_outputs_coord_logits, + encoder_logits=encoder_logits, + encoder_pred_boxes=encoder_pred_boxes, + ) + + +# Copied from transformers.models.detr.modeling_detr.DetrMLPPredictionHead +class GroundingDinoMLPPredictionHead(nn.Module): + """ + Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates, + height and width of a bounding box w.r.t. an image. + + Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py + + """ + + def __init__(self, input_dim, hidden_dim, output_dim, num_layers): + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x + + +def build_label_maps(logits: torch.FloatTensor, input_ids: torch.LongTensor) -> tuple[torch.FloatTensor]: + """ + Computes a mapping between tokens and their corresponding labels, where `num_labels` is determined by the number of classes in the input prompt. + The function identifies segments of tokens between specific delimiter tokens and generates label maps for those segments. + Args: + logits (`torch.Tensor` of shape `(batch_size, seq_length, hidden_size)`): + The output logits from the model, where `hidden_size` corresponds to the dimension of the model's output features. + + input_ids (`torch.Tensor` of shape `(batch_size, seq_length)`): + The input token IDs corresponding to the input prompt. For example, given the prompt "fish. shark.", + `input_ids` might look like `[101, 3869, 1012, 11420, 1012, 102]` where each number corresponds to a token including special tokens. + Returns: + tuple: A tuple containing label maps for each instance in the batch. + - label_maps (tuple of `torch.Tensor`): + A tuple of tensors, where each tensor in the tuple corresponds to an instance in the batch. Each tensor + has shape `(num_labels, hidden_size)` and contains binary values (0 or 1), where `1` indicates the tokens + that are associated with a specific label (class) between delimiter tokens, and `0` elsewhere. + Example: + Given an input prompt "fish. shark." and corresponding `input_ids` as `[101, 3869, 1012, 11420, 1012, 102]`: + - The function identifies the tokens for "fish" (IDs `[3869]`) and "shark" (IDs `[11420]`). + - The function then constructs label maps for these tokens, where each label map indicates which tokens + correspond to which label between the delimiter tokens (e.g., between the period `.`). + - The output is a tuple of label maps, one for each instance in the batch. + Note: + - `SPECIAL_TOKENS` should be a predefined list of tokens that are considered special (e.g., `[CLS]`, `[SEP]`, etc.). 
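        - For illustration (hypothetical values, following the "fish. shark." example above): with
          `input_ids = [101, 3869, 1012, 11420, 1012, 102]`, each returned tensor has shape
          `(num_labels, max_seq_len)`; here that would be a `(2, max_seq_len)` tensor whose first row
          marks the "fish" token (`[0, 1, 0, 0, 0, 0, ...]`) and whose second row marks the "shark"
          token (`[0, 0, 0, 1, 0, 0, ...]`), zero-padded up to `max_seq_len`.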
+ """ + max_seq_len = logits.shape[-1] + # Add [PAD] token to the list of special tokens + delimiter_tokens = torch.tensor(SPECIAL_TOKENS + [0], device=input_ids.device) + + delimiter_token_masks = torch.isin(input_ids, delimiter_tokens) + label_groups = torch.cumsum(delimiter_token_masks, dim=1) * (~delimiter_token_masks).to(torch.int32) + + label_maps = () + + # Iterate over batch dimension as we can have different number of labels + for label_group in label_groups: + # `label_group` is a tensor of shape `(seq_len,)` with zeros for non-label tokens and integers for label tokens + # label tokens with same integer value are part of the same label group + + # Get unique labels and exclude 0 (i.e. non-label tokens) + unique_labels = torch.unique(label_group)[1:, None] + num_labels = unique_labels.shape[0] + + # Create one-hot encoding for each label group + label_map = label_group.unsqueeze(0).repeat(num_labels, 1) + label_map = torch.where(label_map == unique_labels, 1, 0) + + # Pad label_map to match `max_seq_len` + label_map = F.pad(label_map, (0, max_seq_len - label_map.shape[1]), value=0) + + label_maps += (label_map,) + + return label_maps + + +def build_text_mask(logits, attention_mask): + """ + Create text_mask based on the matching indices + """ + seq_len = attention_mask.shape[1] + text_mask = torch.zeros_like(logits, device=logits.device, dtype=attention_mask.dtype) + text_mask[:, :, :seq_len] = attention_mask[:, None, :] + + return text_mask.bool() + + +@auto_docstring( + custom_intro=""" + Grounding DINO Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on top, + for tasks such as COCO detection. + """ +) +class GroundingDinoForObjectDetection(GroundingDinoPreTrainedModel): + # When using clones, all layers > 0 will be clones, but layer 0 *is* required + # the bbox_embed in the decoder are all clones though + _tied_weights_keys = [r"bbox_embed\.[1-9]\d*", r"model\.decoder\.bbox_embed\.[0-9]\d*"] + + def __init__(self, config: GroundingDinoConfig): + super().__init__(config) + + self.model = GroundingDinoModel(config) + _class_embed = GroundingDinoContrastiveEmbedding(config) + + if config.decoder_bbox_embed_share: + # a single shared instance + shared_head = GroundingDinoMLPPredictionHead( + input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3 + ) + self.bbox_embed = nn.ModuleList([shared_head] * config.decoder_layers) + else: + # each layer has its own head (implicit deep copy through a new instance) + self.bbox_embed = nn.ModuleList( + [ + GroundingDinoMLPPredictionHead( + input_dim=config.d_model, + hidden_dim=config.d_model, + output_dim=4, + num_layers=3, + ) + for _ in range(config.decoder_layers) + ] + ) + + self.class_embed = nn.ModuleList([_class_embed for _ in range(config.decoder_layers)]) + # hack for box-refinement + self.model.decoder.bbox_embed = self.bbox_embed + # hack implementation for two-stage + self.model.decoder.class_embed = self.class_embed + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + pixel_values: torch.FloatTensor, + input_ids: torch.LongTensor, + token_type_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + pixel_mask: Optional[torch.BoolTensor] = None, + encoder_outputs: Optional[Union[GroundingDinoEncoderOutput, tuple]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: 
Optional[list[dict[str, Union[torch.LongTensor, torch.FloatTensor]]]] = None, + ): + r""" + input_ids (`torch.LongTensor` of shape `(batch_size, text_sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`BertTokenizer.__call__`] for details. + token_type_ids (`torch.LongTensor` of shape `(batch_size, text_sequence_length)`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: 0 corresponds to a `sentence A` token, 1 corresponds to a `sentence B` token + + [What are token type IDs?](../glossary#token-type-ids) + labels (`list[Dict]` of len `(batch_size,)`, *optional*): + Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the + following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch + respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes + in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`. + + Examples: + + ```python + >>> import requests + + >>> import torch + >>> from PIL import Image + >>> from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection + + >>> model_id = "IDEA-Research/grounding-dino-tiny" + >>> device = "cuda" + + >>> processor = AutoProcessor.from_pretrained(model_id) + >>> model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device) + + >>> image_url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(image_url, stream=True).raw) + >>> # Check for cats and remote controls + >>> text_labels = [["a cat", "a remote control"]] + + >>> inputs = processor(images=image, text=text_labels, return_tensors="pt").to(device) + >>> with torch.no_grad(): + ... outputs = model(**inputs) + + >>> results = processor.post_process_grounded_object_detection( + ... outputs, + ... threshold=0.4, + ... text_threshold=0.3, + ... target_sizes=[(image.height, image.width)] + ... ) + >>> # Retrieve the first image result + >>> result = results[0] + >>> for box, score, text_label in zip(result["boxes"], result["scores"], result["text_labels"]): + ... box = [round(x, 2) for x in box.tolist()] + ... 
print(f"Detected {text_label} with confidence {round(score.item(), 3)} at location {box}") + Detected a cat with confidence 0.479 at location [344.7, 23.11, 637.18, 374.28] + Detected a cat with confidence 0.438 at location [12.27, 51.91, 316.86, 472.44] + Detected a remote control with confidence 0.478 at location [38.57, 70.0, 176.78, 118.18] + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) + + # First, sent images through Grounding DINO base model to obtain encoder + decoder outputs + outputs = self.model( + pixel_values=pixel_values, + input_ids=input_ids, + token_type_ids=token_type_ids, + attention_mask=attention_mask, + pixel_mask=pixel_mask, + encoder_outputs=encoder_outputs, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + idx = 5 + (1 if output_attentions else 0) + (1 if output_hidden_states else 0) + enc_text_hidden_state = outputs.encoder_last_hidden_state_text if return_dict else outputs[idx] + hidden_states = outputs.intermediate_hidden_states if return_dict else outputs[2] + init_reference_points = outputs.init_reference_points if return_dict else outputs[1] + inter_references_points = outputs.intermediate_reference_points if return_dict else outputs[3] + + # class logits + predicted bounding boxes + outputs_classes = [] + outputs_coords = [] + + # hidden_states are of shape (batch_size, num_stages, height, width) + # predict class and bounding box deltas for each stage + num_levels = hidden_states.shape[1] + for level in range(num_levels): + if level == 0: + reference = init_reference_points + else: + reference = inter_references_points[:, level - 1] + reference = torch.special.logit(reference, eps=1e-5) + outputs_class = self.class_embed[level]( + vision_hidden_state=hidden_states[:, level], + text_hidden_state=enc_text_hidden_state, + text_token_mask=attention_mask.bool(), + ) + delta_bbox = self.bbox_embed[level](hidden_states[:, level]) + + reference_coordinates = reference.shape[-1] + if reference_coordinates == 4: + outputs_coord_logits = delta_bbox + reference + elif reference_coordinates == 2: + delta_bbox[..., :2] += reference + outputs_coord_logits = delta_bbox + else: + raise ValueError(f"reference.shape[-1] should be 4 or 2, but got {reference.shape[-1]}") + outputs_coord = outputs_coord_logits.sigmoid() + outputs_classes.append(outputs_class) + outputs_coords.append(outputs_coord) + outputs_class = torch.stack(outputs_classes) + outputs_coord = torch.stack(outputs_coords) + + logits = outputs_class[-1] + pred_boxes = outputs_coord[-1] + + loss, loss_dict, auxiliary_outputs = None, None, None + if labels is not None: + label_maps = build_label_maps(logits, input_ids) + text_mask = build_text_mask(logits, attention_mask) + loss, loss_dict, auxiliary_outputs = self.loss_function( + logits, + labels, + self.device, + pred_boxes, + self.config, + label_maps, + text_mask, + outputs_class=outputs_class, + outputs_coord=outputs_coord, + encoder_logits=outputs[-2], + encoder_pred_boxes=outputs[-1], + ) + + if not return_dict: + auxiliary_outputs = auxiliary_outputs if auxiliary_outputs is not None else [] + output = [loss, loss_dict, logits, pred_boxes, *auxiliary_outputs, *outputs, input_ids] + output = tuple(out for out in output if out is not None) + return output + + dict_outputs = GroundingDinoObjectDetectionOutput( + loss=loss, + loss_dict=loss_dict, + logits=logits, + 
pred_boxes=pred_boxes, + last_hidden_state=outputs.last_hidden_state, + auxiliary_outputs=auxiliary_outputs, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + encoder_last_hidden_state_vision=outputs.encoder_last_hidden_state_vision, + encoder_last_hidden_state_text=outputs.encoder_last_hidden_state_text, + encoder_vision_hidden_states=outputs.encoder_vision_hidden_states, + encoder_text_hidden_states=outputs.encoder_text_hidden_states, + encoder_attentions=outputs.encoder_attentions, + intermediate_hidden_states=outputs.intermediate_hidden_states, + intermediate_reference_points=outputs.intermediate_reference_points, + init_reference_points=outputs.init_reference_points, + enc_outputs_class=outputs.enc_outputs_class, + enc_outputs_coord_logits=outputs.enc_outputs_coord_logits, + encoder_logits=outputs.encoder_logits, + encoder_pred_boxes=outputs.encoder_pred_boxes, + input_ids=input_ids, + ) + + return dict_outputs + + +__all__ = ["GroundingDinoForObjectDetection", "GroundingDinoModel", "GroundingDinoPreTrainedModel"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/grounding_dino/modular_grounding_dino.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/grounding_dino/modular_grounding_dino.py new file mode 100644 index 0000000000000000000000000000000000000000..a7b9c570e7b0add68209f6b2933be7b813269327 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/grounding_dino/modular_grounding_dino.py @@ -0,0 +1,125 @@ +from typing import TYPE_CHECKING, Optional, Union + +import torch + +from transformers.models.detr.image_processing_detr_fast import DetrImageProcessorFast + +from ...image_transforms import center_to_corners_format +from ...utils import ( + TensorType, + logging, +) + + +if TYPE_CHECKING: + from .modeling_grounding_dino import GroundingDinoObjectDetectionOutput + + +logger = logging.get_logger(__name__) + + +def _scale_boxes(boxes, target_sizes): + """ + Scale batch of bounding boxes to the target sizes. + + Args: + boxes (`torch.Tensor` of shape `(batch_size, num_boxes, 4)`): + Bounding boxes to scale. Each box is expected to be in (x1, y1, x2, y2) format. + target_sizes (`list[tuple[int, int]]` or `torch.Tensor` of shape `(batch_size, 2)`): + Target sizes to scale the boxes to. Each target size is expected to be in (height, width) format. + + Returns: + `torch.Tensor` of shape `(batch_size, num_boxes, 4)`: Scaled bounding boxes. + """ + + if isinstance(target_sizes, (list, tuple)): + image_height = torch.tensor([i[0] for i in target_sizes]) + image_width = torch.tensor([i[1] for i in target_sizes]) + elif isinstance(target_sizes, torch.Tensor): + image_height, image_width = target_sizes.unbind(1) + else: + raise TypeError("`target_sizes` must be a list, tuple or torch.Tensor") + + scale_factor = torch.stack([image_width, image_height, image_width, image_height], dim=1) + scale_factor = scale_factor.unsqueeze(1).to(boxes.device) + boxes = boxes * scale_factor + return boxes + + +class GroundingDinoImageProcessorFast(DetrImageProcessorFast): + def post_process_object_detection( + self, + outputs: "GroundingDinoObjectDetectionOutput", + threshold: float = 0.1, + target_sizes: Optional[Union[TensorType, list[tuple]]] = None, + ): + """ + Converts the raw output of [`GroundingDinoForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y, + bottom_right_x, bottom_right_y) format. 
+ + Args: + outputs ([`GroundingDinoObjectDetectionOutput`]): + Raw outputs of the model. + threshold (`float`, *optional*, defaults to 0.1): + Score threshold to keep object detection predictions. + target_sizes (`torch.Tensor` or `list[tuple[int, int]]`, *optional*): + Tensor of shape `(batch_size, 2)` or list of tuples (`tuple[int, int]`) containing the target size + `(height, width)` of each image in the batch. If unset, predictions will not be resized. + + Returns: + `list[Dict]`: A list of dictionaries, each dictionary containing the following keys: + - "scores": The confidence scores for each predicted box on the image. + - "labels": Indexes of the classes predicted by the model on the image. + - "boxes": Image bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format. + """ + batch_logits, batch_boxes = outputs.logits, outputs.pred_boxes + batch_size = len(batch_logits) + + if target_sizes is not None and len(target_sizes) != batch_size: + raise ValueError("Make sure that you pass in as many target sizes as images") + + # batch_logits of shape (batch_size, num_queries, num_classes) + batch_class_logits = torch.max(batch_logits, dim=-1) + batch_scores = torch.sigmoid(batch_class_logits.values) + batch_labels = batch_class_logits.indices + + # Convert to [x0, y0, x1, y1] format + batch_boxes = center_to_corners_format(batch_boxes) + + # Convert from relative [0, 1] to absolute [0, height] coordinates + if target_sizes is not None: + batch_boxes = _scale_boxes(batch_boxes, target_sizes) + + results = [] + for scores, labels, boxes in zip(batch_scores, batch_labels, batch_boxes): + keep = scores > threshold + scores = scores[keep] + labels = labels[keep] + boxes = boxes[keep] + results.append({"scores": scores, "labels": labels, "boxes": boxes}) + + return results + + def post_process(self): + raise NotImplementedError("Post-processing is not implemented for Grounding-Dino yet.") + + def post_process_segmentation(self): + raise NotImplementedError("Segmentation post-processing is not implemented for Grounding-Dino yet.") + + def post_process_instance(self): + raise NotImplementedError("Instance post-processing is not implemented for Grounding-Dino yet.") + + def post_process_panoptic(self): + raise NotImplementedError("Panoptic post-processing is not implemented for Grounding-Dino yet.") + + def post_process_instance_segmentation(self): + raise NotImplementedError("Segmentation post-processing is not implemented for Grounding-Dino yet.") + + def post_process_semantic_segmentation(self): + raise NotImplementedError("Semantic segmentation post-processing is not implemented for Grounding-Dino yet.") + + def post_process_panoptic_segmentation(self): + raise NotImplementedError("Panoptic segmentation post-processing is not implemented for Grounding-Dino yet.") + + +__all__ = ["GroundingDinoImageProcessorFast"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/grounding_dino/processing_grounding_dino.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/grounding_dino/processing_grounding_dino.py new file mode 100644 index 0000000000000000000000000000000000000000..ea0e288f3eecf473149e7751c563906236d8811a --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/grounding_dino/processing_grounding_dino.py @@ -0,0 +1,277 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Processor class for Grounding DINO. +""" + +import pathlib +import warnings +from typing import TYPE_CHECKING, Optional, Union + +from ...image_transforms import center_to_corners_format +from ...image_utils import AnnotationFormat, ImageInput +from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack +from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput +from ...utils import TensorType, is_torch_available + + +if is_torch_available(): + import torch + +if TYPE_CHECKING: + from .modeling_grounding_dino import GroundingDinoObjectDetectionOutput + + +AnnotationType = dict[str, Union[int, str, list[dict]]] + + +def get_phrases_from_posmap(posmaps, input_ids): + """Get token ids of phrases from posmaps and input_ids. + + Args: + posmaps (`torch.BoolTensor` of shape `(num_boxes, hidden_size)`): + A boolean tensor of text-thresholded logits related to the detected bounding boxes. + input_ids (`torch.LongTensor`) of shape `(sequence_length, )`): + A tensor of token ids. + """ + left_idx = 0 + right_idx = posmaps.shape[-1] - 1 + + # Avoiding altering the input tensor + posmaps = posmaps.clone() + + posmaps[:, 0 : left_idx + 1] = False + posmaps[:, right_idx:] = False + + token_ids = [] + for posmap in posmaps: + non_zero_idx = posmap.nonzero(as_tuple=True)[0].tolist() + token_ids.append([input_ids[i] for i in non_zero_idx]) + + return token_ids + + +def _is_list_of_candidate_labels(text) -> bool: + """Check that text is list/tuple of strings and each string is a candidate label and not merged candidate labels text. + Merged candidate labels text is a string with candidate labels separated by a dot. + """ + if isinstance(text, (list, tuple)): + return all(isinstance(t, str) and "." not in t for t in text) + return False + + +def _merge_candidate_labels_text(text: list[str]) -> str: + """ + Merge candidate labels text into a single string. Ensure all labels are lowercase. + For example, ["A cat", "a dog"] -> "a cat. a dog." + """ + labels = [t.strip().lower() for t in text] # ensure lowercase + merged_labels_str = ". ".join(labels) + "." # join with dot and add a dot at the end + return merged_labels_str + + +class DictWithDeprecationWarning(dict): + message = ( + "The key `labels` is will return integer ids in `GroundingDinoProcessor.post_process_grounded_object_detection` " + "output since v4.51.0. Use `text_labels` instead to retrieve string object names." 
+ ) + + def __getitem__(self, key): + if key == "labels": + warnings.warn(self.message, FutureWarning) + return super().__getitem__(key) + + def get(self, key, *args, **kwargs): + if key == "labels": + warnings.warn(self.message, FutureWarning) + return super().get(key, *args, **kwargs) + + +class GroundingDinoImagesKwargs(ImagesKwargs, total=False): + annotations: Optional[Union[AnnotationType, list[AnnotationType]]] + return_segmentation_masks: Optional[bool] + masks_path: Optional[Union[str, pathlib.Path]] + do_convert_annotations: Optional[bool] + format: Optional[Union[str, AnnotationFormat]] + + +class GroundingDinoProcessorKwargs(ProcessingKwargs, total=False): + images_kwargs: GroundingDinoImagesKwargs + _defaults = { + "text_kwargs": { + "add_special_tokens": True, + "padding": False, + "stride": 0, + "return_overflowing_tokens": False, + "return_special_tokens_mask": False, + "return_offsets_mapping": False, + "return_token_type_ids": True, + "return_length": False, + "verbose": True, + } + } + + +class GroundingDinoProcessor(ProcessorMixin): + r""" + Constructs a Grounding DINO processor which wraps a Deformable DETR image processor and a BERT tokenizer into a + single processor. + + [`GroundingDinoProcessor`] offers all the functionalities of [`GroundingDinoImageProcessor`] and + [`AutoTokenizer`]. See the docstring of [`~GroundingDinoProcessor.__call__`] and [`~GroundingDinoProcessor.decode`] + for more information. + + Args: + image_processor (`GroundingDinoImageProcessor`): + An instance of [`GroundingDinoImageProcessor`]. The image processor is a required input. + tokenizer (`AutoTokenizer`): + An instance of ['PreTrainedTokenizer`]. The tokenizer is a required input. + """ + + attributes = ["image_processor", "tokenizer"] + image_processor_class = "GroundingDinoImageProcessor" + tokenizer_class = "AutoTokenizer" + valid_processor_kwargs = GroundingDinoProcessorKwargs + + def __init__(self, image_processor, tokenizer): + super().__init__(image_processor, tokenizer) + + def __call__( + self, + images: Optional[ImageInput] = None, + text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, + **kwargs: Unpack[GroundingDinoProcessorKwargs], + ) -> BatchEncoding: + """ + This method uses [`GroundingDinoImageProcessor.__call__`] method to prepare image(s) for the model, and + [`BertTokenizerFast.__call__`] to prepare text for the model. + + Args: + images (`ImageInput`, `list[ImageInput]`, *optional*): + The image or batch of images to be processed. The image might be either PIL image, numpy array or a torch tensor. + text (`TextInput`, `PreTokenizedInput`, `list[TextInput]`, `list[PreTokenizedInput]`, *optional*): + Candidate labels to be detected on the image. The text might be one of the following: + - A list of candidate labels (strings) to be detected on the image (e.g. ["a cat", "a dog"]). + - A batch of candidate labels to be detected on the batch of images (e.g. [["a cat", "a dog"], ["a car", "a person"]]). + - A merged candidate labels string to be detected on the image, separated by "." (e.g. "a cat. a dog."). + - A batch of merged candidate labels text to be detected on the batch of images (e.g. ["a cat. a dog.", "a car. a person."]). + """ + if text is not None: + text = self._preprocess_input_text(text) + return super().__call__(images=images, text=text, **kwargs) + + def _preprocess_input_text(self, text): + """ + Preprocess input text to ensure that labels are in the correct format for the model. 
+ If the text is a list of candidate labels, merge the candidate labels into a single string, + for example, ["a cat", "a dog"] -> "a cat. a dog.". In case candidate labels are already in a form of + "a cat. a dog.", the text is returned as is. + """ + + if _is_list_of_candidate_labels(text): + text = _merge_candidate_labels_text(text) + + # for batched input + elif isinstance(text, (list, tuple)) and all(_is_list_of_candidate_labels(t) for t in text): + text = [_merge_candidate_labels_text(sample) for sample in text] + + return text + + def post_process_grounded_object_detection( + self, + outputs: "GroundingDinoObjectDetectionOutput", + input_ids: Optional[TensorType] = None, + threshold: float = 0.25, + text_threshold: float = 0.25, + target_sizes: Optional[Union[TensorType, list[tuple]]] = None, + text_labels: Optional[list[list[str]]] = None, + ): + """ + Converts the raw output of [`GroundingDinoForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y, + bottom_right_x, bottom_right_y) format and get the associated text label. + + Args: + outputs ([`GroundingDinoObjectDetectionOutput`]): + Raw outputs of the model. + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + The token ids of the input text. If not provided will be taken from the model output. + threshold (`float`, *optional*, defaults to 0.25): + Threshold to keep object detection predictions based on confidence score. + text_threshold (`float`, *optional*, defaults to 0.25): + Score threshold to keep text detection predictions. + target_sizes (`torch.Tensor` or `list[tuple[int, int]]`, *optional*): + Tensor of shape `(batch_size, 2)` or list of tuples (`tuple[int, int]`) containing the target size + `(height, width)` of each image in the batch. If unset, predictions will not be resized. + text_labels (`list[list[str]]`, *optional*): + List of candidate labels to be detected on each image. At the moment it's *NOT used*, but required + to be in signature for the zero-shot object detection pipeline. Text labels are instead extracted + from the `input_ids` tensor provided in `outputs`. 
+ + Returns: + `list[Dict]`: A list of dictionaries, each dictionary containing the + - **scores**: tensor of confidence scores for detected objects + - **boxes**: tensor of bounding boxes in [x0, y0, x1, y1] format + - **labels**: list of text labels for each detected object (will be replaced with integer ids in v4.51.0) + - **text_labels**: list of text labels for detected objects + """ + batch_logits, batch_boxes = outputs.logits, outputs.pred_boxes + input_ids = input_ids if input_ids is not None else outputs.input_ids + + if target_sizes is not None and len(target_sizes) != len(batch_logits): + raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits") + + batch_probs = torch.sigmoid(batch_logits) # (batch_size, num_queries, 256) + batch_scores = torch.max(batch_probs, dim=-1)[0] # (batch_size, num_queries) + + # Convert to [x0, y0, x1, y1] format + batch_boxes = center_to_corners_format(batch_boxes) + + # Convert from relative [0, 1] to absolute [0, height] coordinates + if target_sizes is not None: + if isinstance(target_sizes, list): + img_h = torch.Tensor([i[0] for i in target_sizes]) + img_w = torch.Tensor([i[1] for i in target_sizes]) + else: + img_h, img_w = target_sizes.unbind(1) + + scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(batch_boxes.device) + batch_boxes = batch_boxes * scale_fct[:, None, :] + + results = [] + for idx, (scores, boxes, probs) in enumerate(zip(batch_scores, batch_boxes, batch_probs)): + keep = scores > threshold + scores = scores[keep] + boxes = boxes[keep] + + # extract text labels + prob = probs[keep] + label_ids = get_phrases_from_posmap(prob > text_threshold, input_ids[idx]) + objects_text_labels = self.batch_decode(label_ids) + + result = DictWithDeprecationWarning( + { + "scores": scores, + "boxes": boxes, + "text_labels": objects_text_labels, + # TODO: @pavel, set labels to None since v4.51.0 or find a way to extract ids + "labels": objects_text_labels, + } + ) + results.append(result) + + return results + + +__all__ = ["GroundingDinoProcessor"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/helium/__pycache__/__init__.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/helium/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..014be320c316c3a476db31b5db27eacfbb6677b8 Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/helium/__pycache__/__init__.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/helium/__pycache__/configuration_helium.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/helium/__pycache__/configuration_helium.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5aa5a5791345075729381a9c0b27aa50f4d264b0 Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/helium/__pycache__/configuration_helium.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/helium/__pycache__/modeling_helium.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/helium/__pycache__/modeling_helium.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7deb5c05546faa9af65fc6325a444840c0f37fad Binary files /dev/null and 
b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/helium/__pycache__/modeling_helium.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/helium/__pycache__/modular_helium.cpython-312.pyc b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/helium/__pycache__/modular_helium.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8adb09335af815342961c1314f82fd289d10813f Binary files /dev/null and b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/helium/__pycache__/modular_helium.cpython-312.pyc differ diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/herbert/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/herbert/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e0d0794a06e8cfbe4178a72e2b09d5292ddbc4fb --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/herbert/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .tokenization_herbert import * + from .tokenization_herbert_fast import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/herbert/tokenization_herbert.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/herbert/tokenization_herbert.py new file mode 100644 index 0000000000000000000000000000000000000000..c1c6bacc87fc68bf5a4d998236e878fdf602a8f0 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/herbert/tokenization_herbert.py @@ -0,0 +1,617 @@ +# coding=utf-8 +# Copyright 2020 The Google AI Language Team Authors, Allegro.pl, Facebook Inc. and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
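The `herbert/__init__.py` above routes the package through `_LazyModule`, so the tokenizer submodules are only imported when one of their names is first accessed. A minimal sketch of that idea, using only the standard library; this is an illustration of the pattern, not the actual `transformers` `_LazyModule` implementation:

```python
import importlib
import types


class LazySubmodules(types.ModuleType):
    """Illustrative stand-in for the lazy-module pattern: defer submodule
    imports until an exported name or submodule is actually accessed."""

    def __init__(self, name, exports):
        super().__init__(name)
        # e.g. {"tokenization_herbert": ["HerbertTokenizer"]}
        self._exports = exports

    def __getattr__(self, item):
        for submodule, names in self._exports.items():
            if item == submodule or item in names:
                module = importlib.import_module(f"{self.__name__}.{submodule}")
                return module if item == submodule else getattr(module, item)
        raise AttributeError(f"module {self.__name__!r} has no attribute {item!r}")


# Usage sketch, mirroring the `sys.modules[__name__] = _LazyModule(...)` line above:
# sys.modules[__name__] = LazySubmodules(__name__, {"tokenization_herbert": ["HerbertTokenizer"]})
```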
+import json +import os +import re +import unicodedata +from typing import Optional + +from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace +from ...utils import logging + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = { + "vocab_file": "vocab.json", + "merges_file": "merges.txt", +} + + +# Copied from transformers.models.xlm.tokenization_xlm.get_pairs +def get_pairs(word): + """ + Return set of symbol pairs in a word. word is represented as tuple of symbols (symbols being variable-length + strings) + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +# Copied from transformers.models.xlm.tokenization_xlm.replace_unicode_punct +def replace_unicode_punct(text): + """ + Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/replace-unicode-punctuation.perl + """ + text = text.replace(",", ",") + text = re.sub(r"。\s*", ". ", text) + text = text.replace("、", ",") + text = text.replace("”", '"') + text = text.replace("“", '"') + text = text.replace("∶", ":") + text = text.replace(":", ":") + text = text.replace("?", "?") + text = text.replace("《", '"') + text = text.replace("》", '"') + text = text.replace(")", ")") + text = text.replace("!", "!") + text = text.replace("(", "(") + text = text.replace(";", ";") + text = text.replace("1", "1") + text = text.replace("」", '"') + text = text.replace("「", '"') + text = text.replace("0", "0") + text = text.replace("3", "3") + text = text.replace("2", "2") + text = text.replace("5", "5") + text = text.replace("6", "6") + text = text.replace("9", "9") + text = text.replace("7", "7") + text = text.replace("8", "8") + text = text.replace("4", "4") + text = re.sub(r".\s*", ". ", text) + text = text.replace("~", "~") + text = text.replace("’", "'") + text = text.replace("…", "...") + text = text.replace("━", "-") + text = text.replace("〈", "<") + text = text.replace("〉", ">") + text = text.replace("【", "[") + text = text.replace("】", "]") + text = text.replace("%", "%") + return text + + +# Copied from transformers.models.xlm.tokenization_xlm.remove_non_printing_char +def remove_non_printing_char(text): + """ + Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/remove-non-printing-char.perl + """ + output = [] + for char in text: + cat = unicodedata.category(char) + if cat.startswith("C"): + continue + output.append(char) + return "".join(output) + + +# Copied from transformers.models.bert.tokenization_bert.whitespace_tokenize +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a piece of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +# Copied from transformers.models.bert.tokenization_bert.BasicTokenizer +class BasicTokenizer: + """ + Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.). + + Args: + do_lower_case (`bool`, *optional*, defaults to `True`): + Whether or not to lowercase the input when tokenizing. + never_split (`Iterable`, *optional*): + Collection of tokens which will never be split during tokenization. Only has an effect when + `do_basic_tokenize=True` + tokenize_chinese_chars (`bool`, *optional*, defaults to `True`): + Whether or not to tokenize Chinese characters. + + This should likely be deactivated for Japanese (see this + [issue](https://github.com/huggingface/transformers/issues/328)). 
+ strip_accents (`bool`, *optional*): + Whether or not to strip all accents. If this option is not specified, then it will be determined by the + value for `lowercase` (as in the original BERT). + do_split_on_punc (`bool`, *optional*, defaults to `True`): + In some instances we want to skip the basic punctuation splitting so that later tokenization can capture + the full context of the words, such as contractions. + """ + + def __init__( + self, + do_lower_case=True, + never_split=None, + tokenize_chinese_chars=True, + strip_accents=None, + do_split_on_punc=True, + ): + if never_split is None: + never_split = [] + self.do_lower_case = do_lower_case + self.never_split = set(never_split) + self.tokenize_chinese_chars = tokenize_chinese_chars + self.strip_accents = strip_accents + self.do_split_on_punc = do_split_on_punc + + def tokenize(self, text, never_split=None): + """ + Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer. + + Args: + never_split (`List[str]`, *optional*) + Kept for backward compatibility purposes. Now implemented directly at the base class level (see + [`PreTrainedTokenizer.tokenize`]) List of token not to split. + """ + # union() returns a new set by concatenating the two sets. + never_split = self.never_split.union(set(never_split)) if never_split else self.never_split + text = self._clean_text(text) + + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). 
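The comment above explains why `_tokenize_chinese_chars` (defined just below) pads CJK ideographs with spaces before the whitespace split. A tiny standalone sketch of that step, covering only the main CJK Unified Ideographs block for brevity; the real method checks every range listed in `_is_chinese_char`:

```python
def pad_cjk(text: str) -> str:
    # Surround each character from the main CJK block with spaces so that a
    # later whitespace split treats every ideograph as its own token.
    out = []
    for ch in text:
        if 0x4E00 <= ord(ch) <= 0x9FFF:
            out.extend([" ", ch, " "])
        else:
            out.append(ch)
    return "".join(out)


print(pad_cjk("ab中文cd").split())  # ['ab', '中', '文', 'cd']
```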
+ if self.tokenize_chinese_chars: + text = self._tokenize_chinese_chars(text) + # prevents treating the same character with different unicode codepoints as different characters + unicode_normalized_text = unicodedata.normalize("NFC", text) + orig_tokens = whitespace_tokenize(unicode_normalized_text) + split_tokens = [] + for token in orig_tokens: + if token not in never_split: + if self.do_lower_case: + token = token.lower() + if self.strip_accents is not False: + token = self._run_strip_accents(token) + elif self.strip_accents: + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token, never_split)) + + output_tokens = whitespace_tokenize(" ".join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text, never_split=None): + """Splits punctuation on a piece of text.""" + if not self.do_split_on_punc or (never_split is not None and text in never_split): + return [text] + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. + if ( + (cp >= 0x4E00 and cp <= 0x9FFF) + or (cp >= 0x3400 and cp <= 0x4DBF) + or (cp >= 0x20000 and cp <= 0x2A6DF) + or (cp >= 0x2A700 and cp <= 0x2B73F) + or (cp >= 0x2B740 and cp <= 0x2B81F) + or (cp >= 0x2B820 and cp <= 0x2CEAF) + or (cp >= 0xF900 and cp <= 0xFAFF) + or (cp >= 0x2F800 and cp <= 0x2FA1F) + ): + return True + + return False + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xFFFD or _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + + +class HerbertTokenizer(PreTrainedTokenizer): + """ + Construct a BPE tokenizer for HerBERT. + + Peculiarities: + + - uses BERT's pre-tokenizer: BaseTokenizer splits tokens on spaces, and also on punctuation. Each occurrence of a + punctuation character will be treated separately. + + - Such pretokenized input is BPE subtokenized + + This tokenizer inherits from [`XLMTokenizer`] which contains most of the methods. 
Users should refer to the + superclass for more information regarding methods. + """ + + vocab_files_names = VOCAB_FILES_NAMES + + def __init__( + self, + vocab_file, + merges_file, + tokenizer_file=None, + cls_token="<s>", + unk_token="<unk>", + pad_token="<pad>", + mask_token="<mask>", + sep_token="</s>", + bos_token="<s>", + do_lowercase_and_remove_accent=False, + additional_special_tokens=[ + "<special0>", + "<special1>", + "<special2>", + "<special3>", + "<special4>", + "<special5>", + "<special6>", + "<special7>", + "<special8>", + "<special9>", + ], + lang2id=None, + id2lang=None, + **kwargs, + ): + try: + import sacremoses + except ImportError: + raise ImportError( + "You need to install sacremoses to use HerbertTokenizer. " + "See https://pypi.org/project/sacremoses/ for installation." + ) + + self.sm = sacremoses + + # cache of sm.MosesPunctNormalizer instance + self.cache_moses_punct_normalizer = {} + # cache of sm.MosesTokenizer instance + self.cache_moses_tokenizer = {} + self.lang_with_custom_tokenizer = {"zh", "th", "ja"} + # True for current supported model (v1.2.0), False for XLM-17 & 100 + self.do_lowercase_and_remove_accent = do_lowercase_and_remove_accent + self.lang2id = lang2id + self.id2lang = id2lang + if lang2id is not None and id2lang is not None: + assert len(lang2id) == len(id2lang) + + self.ja_word_tokenizer = None + self.zh_word_tokenizer = None + + with open(vocab_file, encoding="utf-8") as vocab_handle: + self.encoder = json.load(vocab_handle) + self.decoder = {v: k for k, v in self.encoder.items()} + with open(merges_file, encoding="utf-8") as merges_handle: + merges = merges_handle.read().split("\n")[:-1] + merges = [tuple(merge.split()[:2]) for merge in merges] + self.bpe_ranks = dict(zip(merges, range(len(merges)))) + self.cache = {} + + super().__init__( + unk_token=unk_token, + bos_token=bos_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + additional_special_tokens=additional_special_tokens, + lang2id=lang2id, + id2lang=id2lang, + do_lowercase_and_remove_accent=do_lowercase_and_remove_accent, + tokenizer_file=None, + **kwargs, + ) + + self.bert_pre_tokenizer = BasicTokenizer( + do_lower_case=False, + never_split=self.all_special_tokens, + tokenize_chinese_chars=False, + strip_accents=False, + ) + + @property + # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.do_lower_case + def do_lower_case(self): + return self.do_lowercase_and_remove_accent + + # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.moses_punct_norm + def moses_punct_norm(self, text, lang): + if lang not in self.cache_moses_punct_normalizer: + punct_normalizer = self.sm.MosesPunctNormalizer(lang=lang) + self.cache_moses_punct_normalizer[lang] = punct_normalizer + else: + punct_normalizer = self.cache_moses_punct_normalizer[lang] + return punct_normalizer.normalize(text) + + # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.moses_tokenize + def moses_tokenize(self, text, lang): + if lang not in self.cache_moses_tokenizer: + moses_tokenizer = self.sm.MosesTokenizer(lang=lang) + self.cache_moses_tokenizer[lang] = moses_tokenizer + else: + moses_tokenizer = self.cache_moses_tokenizer[lang] + return moses_tokenizer.tokenize(text, return_str=False, escape=False) + + # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.moses_pipeline + def moses_pipeline(self, text, lang): + text = replace_unicode_punct(text) + text = self.moses_punct_norm(text, lang) + text = remove_non_printing_char(text) + return text + + # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.ja_tokenize + def
ja_tokenize(self, text): + if self.ja_word_tokenizer is None: + try: + import Mykytea + + self.ja_word_tokenizer = Mykytea.Mykytea( + f"-model {os.path.expanduser('~')}/local/share/kytea/model.bin" + ) + except (AttributeError, ImportError): + logger.error( + "Make sure you install KyTea (https://github.com/neubig/kytea) and it's python wrapper" + " (https://github.com/chezou/Mykytea-python) with the following steps" + ) + logger.error("1. git clone git@github.com:neubig/kytea.git && cd kytea") + logger.error("2. autoreconf -i") + logger.error("3. ./configure --prefix=$HOME/local") + logger.error("4. make && make install") + logger.error("5. pip install kytea") + raise + return list(self.ja_word_tokenizer.getWS(text)) + + @property + # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.vocab_size + def vocab_size(self): + return len(self.encoder) + + # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.get_vocab + def get_vocab(self): + return dict(self.encoder, **self.added_tokens_encoder) + + # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.bpe + def bpe(self, token): + word = tuple(token[:-1]) + (token[-1] + "</w>",) + if token in self.cache: + return self.cache[token] + pairs = get_pairs(word) + + if not pairs: + return token + "</w>" + + while True: + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + except ValueError: + new_word.extend(word[i:]) + break + else: + new_word.extend(word[i:j]) + i = j + + if word[i] == first and i < len(word) - 1 and word[i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = " ".join(word) + if word == "\n  </w>": + word = "\n</w>" + self.cache[token] = word + return word + + def _tokenize(self, text): + pre_tokens = self.bert_pre_tokenizer.tokenize(text) + + split_tokens = [] + for token in pre_tokens: + if token: + split_tokens.extend(list(self.bpe(token).split(" "))) + + return split_tokens + + # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer._convert_token_to_id + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.encoder.get(token, self.encoder.get(self.unk_token)) + + # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer._convert_id_to_token + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.decoder.get(index, self.unk_token) + + # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.convert_tokens_to_string + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + out_string = "".join(tokens).replace("</w>", " ").strip() + return out_string + + # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.build_inputs_with_special_tokens + def build_inputs_with_special_tokens( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens.
An XLM sequence has the following format: + + - single sequence: ` X ` + - pair of sequences: ` A B ` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + + """ + bos = [self.bos_token_id] + sep = [self.sep_token_id] + + if token_ids_1 is None: + return bos + token_ids_0 + sep + return bos + token_ids_0 + sep + token_ids_1 + sep + + # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.get_special_tokens_mask + def get_special_tokens_mask( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False + ) -> list[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + + # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.save_vocabulary + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + merge_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"] + ) + + with open(vocab_file, "w", encoding="utf-8") as f: + f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n") + + index = 0 + with open(merge_file, "w", encoding="utf-8") as writer: + for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning( + f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive." + " Please check that the tokenizer is not corrupted!" + ) + index = token_index + writer.write(" ".join(bpe_tokens) + "\n") + index += 1 + + return vocab_file, merge_file + + # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.__getstate__ + def __getstate__(self): + state = self.__dict__.copy() + state["sm"] = None + return state + + # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.__setstate__ + def __setstate__(self, d): + self.__dict__ = d + + try: + import sacremoses + except ImportError: + raise ImportError( + "You need to install sacremoses to use XLMTokenizer. " + "See https://pypi.org/project/sacremoses/ for installation." 
+ ) + + self.sm = sacremoses + + +__all__ = ["HerbertTokenizer"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/herbert/tokenization_herbert_fast.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/herbert/tokenization_herbert_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..fdc24e3c6a6e20a847043379f898fead1beb819f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/herbert/tokenization_herbert_fast.py @@ -0,0 +1,133 @@ +# coding=utf-8 +# Copyright 2020 The Google AI Language Team Authors, Allegro.pl, Facebook Inc. and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional + +from ...tokenization_utils_fast import PreTrainedTokenizerFast +from ...utils import logging +from .tokenization_herbert import HerbertTokenizer + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"} + + +class HerbertTokenizerFast(PreTrainedTokenizerFast): + """ + Construct a "Fast" BPE tokenizer for HerBERT (backed by HuggingFace's *tokenizers* library). + + Peculiarities: + + - uses BERT's pre-tokenizer: BertPreTokenizer splits tokens on spaces, and also on punctuation. Each occurrence of + a punctuation character will be treated separately. + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the methods. Users should refer to the + superclass for more information regarding methods. + + Args: + vocab_file (`str`): + Path to the vocabulary file. + merges_file (`str`): + Path to the merges file. + """ + + vocab_files_names = VOCAB_FILES_NAMES + slow_tokenizer_class = HerbertTokenizer + + def __init__( + self, + vocab_file=None, + merges_file=None, + tokenizer_file=None, + cls_token="", + unk_token="", + pad_token="", + mask_token="", + sep_token="", + **kwargs, + ): + super().__init__( + vocab_file, + merges_file, + tokenizer_file=tokenizer_file, + cls_token=cls_token, + unk_token=unk_token, + pad_token=pad_token, + mask_token=mask_token, + sep_token=sep_token, + **kwargs, + ) + + def build_inputs_with_special_tokens( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None + ) -> list[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. An HerBERT, like BERT sequence has the following format: + + - single sequence: ` X ` + - pair of sequences: ` A B ` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. 
+ """ + + cls = [self.cls_token_id] + sep = [self.sep_token_id] + if token_ids_1 is None: + return cls + token_ids_0 + sep + + return cls + token_ids_0 + sep + token_ids_1 + sep + + def get_special_tokens_mask( + self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False + ) -> list[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + files = self._tokenizer.model.save(save_directory, name=filename_prefix) + return tuple(files) + + +__all__ = ["HerbertTokenizerFast"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/hunyuan_v1_moe/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/hunyuan_v1_moe/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..cd107ee7a3c16d2527806035f67702e9220cae51 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/hunyuan_v1_moe/__init__.py @@ -0,0 +1,14 @@ +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_hunyuan_v1_moe import * + from .modeling_hunyuan import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py new file mode 100644 index 0000000000000000000000000000000000000000..386ddac1d3ebb37de330a3940ad5ac556be5bcf6 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py @@ -0,0 +1,204 @@ +# coding=utf-8 +# Copyright (C) 2025 THL A29 Limited, a Tencent company and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""HunYuanMoEV1 model configuration""" + +from typing import Union + +from transformers.configuration_utils import PretrainedConfig +from transformers.utils import logging + + +logger = logging.get_logger(__name__) + + +class HunYuanMoEV1Config(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`HunYuanMoEV1Model`]. It is used to instantiate an + HunYuan model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the HunYuan-7B. + Hunyuan-A13B-Instruct [tencent/Hunyuan-A13B-Instruct](https://huggingface.co/tencent/Hunyuan-A13B-Instruct). + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 290943): + Vocabulary size of the HunYuan model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`HunYuanMoEV1Model`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 11008): + Dimension of the MLP representations or shared MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer decoder. + num_key_value_heads (`int`, *optional*): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to + `num_attention_heads`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + pad_token_id (`int`, *optional*, defaults to 0): + Padding token id. + bos_token_id (`int`, *optional*, defaults to 1): + Beginning of stream token id. + eos_token_id (`int`, *optional*, defaults to 2): + End of stream token id. + eod_token_id (int, *optional*, defaults to 3): + Token ID representing the end-of-document marker. Used to indicate the termination of a text sequence. + Example: In multi-document processing, this token helps the model distinguish between separate documents. 
+ sep_token_id (`int`, *optional*, defaults to 4): + Token ID representing the separator token (`[SEP]`), used to demarcate boundaries between different text segments. + pretraining_tp (`int`, *optional*, defaults to 1): + Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this + document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is + necessary to ensure exact reproducibility of the pretraining results. Please refer to [this + issue](https://github.com/pytorch/pytorch/issues/76232). + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + rope_scaling (`Dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling + strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is + `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update + `max_position_embeddings` to the expected new maximum. See the following thread for more information on how + these scaling strategies behave: + https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an + experimental feature, subject to breaking API changes in future versions. + attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): + Whether to use a bias in the query, key, value and output projection layers during self-attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + num_experts (`int` or `List`, *optional*, defaults to 1): + The number of experts for moe. If it is a list, it will be used as the number of experts for each layer. + moe_topk (int or List, *optional*, defaults to 1): + Number of experts selected per token (Top-K routing). List form enables layer-wise customization. + head_dim (`int`, *optional*, defaults to 128): + The attention head dimension. 
+ """ + + model_type = "hunyuan_v1_moe" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=290943, + hidden_size=4096, + intermediate_size: int = 11008, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=None, + hidden_act="silu", + max_position_embeddings=2048, + initializer_range=0.02, + rms_norm_eps=1e-5, + use_cache=True, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + eod_token_id=3, + sep_token_id=4, + pretraining_tp=1, + tie_word_embeddings=False, + rope_theta=10000.0, + rope_scaling=None, + attention_bias=False, + attention_dropout=0.0, + num_experts: Union[int, list] = 1, + moe_topk: Union[int, list] = 1, + head_dim=None, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_experts = num_experts + self.moe_topk = moe_topk + + self.head_dim = head_dim + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.pretraining_tp = pretraining_tp + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + # self._rope_scaling_validation() # TODO: Need validation? + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + sep_token_id=sep_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + def _rope_scaling_validation(self): + """ + Validate the `rope_scaling` configuration. 
+ """ + if self.rope_scaling is None: + return + + if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2: + raise ValueError( + "`rope_scaling` must be a dictionary with with two fields, `type` and `factor` or `type` and `alpha`, " + f"got {self.rope_scaling}" + ) + rope_scaling_type = self.rope_scaling.get("type", None) + rope_scaling_factor = self.rope_scaling.get("factor", None) + rope_scaling_alpha = self.rope_scaling.get("alpha", None) + if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]: + raise ValueError( + f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}" + ) + if rope_scaling_factor is None and rope_scaling_alpha is None: + raise ValueError("`rope_scaling`'s factor or alpha field must be have one, got both of none") + if rope_scaling_factor is not None: + if not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0: + raise ValueError(f"`rope_scaling`'s factor field must be a float > 1.0, got {rope_scaling_factor}") + if rope_scaling_alpha is not None: + if not isinstance(rope_scaling_alpha, float) or rope_scaling_alpha <= 1.0: + raise ValueError(f"`rope_scaling`'s alpha field must be a float > 1.0, got {rope_scaling_alpha}") + + +__all__ = ["HunYuanMoEV1Config"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py new file mode 100644 index 0000000000000000000000000000000000000000..043d1f8243a35638747f3ff44946748dc472fcf1 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py @@ -0,0 +1,584 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_hunyuan_v1_moe.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright (C) 2025 THL A29 Limited, a Tencent company and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Callable, Optional, Union + +import torch +import torch.nn.functional as F +from torch import nn + +from transformers.cache_utils import Cache + +from ...activations import ACT2FN +from ...cache_utils import DynamicCache +from ...generation import GenerationMixin +from ...integrations import use_kernel_forward_from_hub +from ...masking_utils import create_causal_mask +from ...modeling_layers import GenericForSequenceClassification, GradientCheckpointingLayer +from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast +from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...processing_utils import Unpack +from ...utils import TransformersKwargs, auto_docstring, can_return_tuple +from ...utils.deprecation import deprecate_kwarg +from ...utils.generic import check_model_inputs +from .configuration_hunyuan_v1_moe import HunYuanMoEV1Config + + +@use_kernel_forward_from_hub("RMSNorm") +class HunYuanMoEV1RMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + HunYuanMoEV1RMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + def extra_repr(self): + return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" + + +class HunYuanMoEV1MLP(nn.Module): + def __init__(self, config: HunYuanMoEV1Config, layer_idx=None, is_shared_mlp=False): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.act_fn = ACT2FN[config.hidden_act] + self.layer_idx = layer_idx + + def forward(self, x): + down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + return down_proj + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. 
Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + dropout: float = 0.0, + **kwargs: Unpack[TransformersKwargs], +): + key_states = repeat_kv(key, module.num_key_value_groups) + value_states = repeat_kv(value, module.num_key_value_groups) + + attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling + if attention_mask is not None: + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + attn_output = torch.matmul(attn_weights, value_states) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +class HunYuanMoEV1Attention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: HunYuanMoEV1Config, layer_idx: int): + super().__init__() + self.config = config + self.layer_idx = layer_idx + self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) + self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads + self.scaling = self.head_dim**-0.5 + self.attention_dropout = config.attention_dropout + self.is_causal = True + + self.q_proj = nn.Linear( + config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias + ) + self.k_proj = nn.Linear( + config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias + ) + self.v_proj = nn.Linear( + config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias + ) + self.o_proj = nn.Linear( + config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias + ) + self.query_layernorm = HunYuanMoEV1RMSNorm(self.head_dim, eps=config.rms_norm_eps) + self.key_layernorm = HunYuanMoEV1RMSNorm(self.head_dim, eps=config.rms_norm_eps) + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + 
attention_mask: Optional[torch.Tensor], + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> tuple[torch.Tensor, torch.Tensor]: + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, self.head_dim) + + query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2) + key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2) + value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + query_states = self.query_layernorm(query_states) + key_states = self.key_layernorm(key_states) + + if past_key_values is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=0.0 if not self.training else self.attention_dropout, + scaling=self.scaling, + **kwargs, + ) + + attn_output = attn_output.reshape(*input_shape, -1).contiguous() + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights + + +class HunYuanMoEV1Gate(nn.Module): + def __init__(self, config: HunYuanMoEV1Config, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + num_experts = config.num_experts if isinstance(config.num_experts, int) else config.num_experts[layer_idx] + self.wg = nn.Linear(config.hidden_size, num_experts, bias=False, dtype=torch.float32) + + def forward(self, hidden_states): + bsz, seq_len, hidden_size = hidden_states.shape + hidden_states = hidden_states.reshape(-1, hidden_size) + if self.wg.weight.dtype == torch.float32: + hidden_states = hidden_states.float() + logits = self.wg(hidden_states) + return logits + + +class HunYuanMoEV1Moe(nn.Module): + def __init__(self, config: HunYuanMoEV1Config, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + self.num_experts = config.num_experts if isinstance(config.num_experts, int) else config.num_experts[layer_idx] + self.top_k = config.moe_topk if isinstance(config.moe_topk, int) else config.moe_topk[layer_idx] + self.gate = HunYuanMoEV1Gate(config, layer_idx=layer_idx) + # self.wg = nn.Linear(config.hidden_size, config.num_experts, bias=False, dtype=torch.float32) + self.experts = nn.ModuleList( + [HunYuanMoEV1MLP(config, layer_idx=layer_idx, is_shared_mlp=False) for _ in range(self.num_experts)] + ) + + self.shared_mlp = HunYuanMoEV1MLP(config, layer_idx=layer_idx, is_shared_mlp=True) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + batch_size, sequence_length, hidden_dim = hidden_states.shape + hidden_states_mlp = self.shared_mlp(hidden_states) + router_logits = self.gate(hidden_states) + hidden_states = hidden_states.view(-1, hidden_dim) + # router_logits: (batch * sequence_length, n_experts) + + routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float) + routing_weights, selected_experts = 
torch.topk(routing_weights, self.top_k, dim=-1) + routing_weights /= routing_weights.sum(dim=-1, keepdim=True) + # we cast back to the input dtype + routing_weights = routing_weights.to(hidden_states.dtype) + + final_hidden_states = torch.zeros( + (batch_size * sequence_length, hidden_dim), dtype=hidden_states.dtype, device=hidden_states.device + ) + + # One hot encode the selected experts to create an expert mask + # this will be used to easily index which expert is going to be sollicitated + expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0) + + # Loop over all available experts in the model and perform the computation on each expert + expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero() + for expert_idx in expert_hit: + expert_layer = self.experts[expert_idx] + idx, top_x = torch.where(expert_mask[expert_idx].squeeze(0)) + + # Index the correct hidden states and compute the expert hidden state for + # the current expert. We need to make sure to multiply the output hidden + # states by `routing_weights` on the corresponding tokens (top-1 and top-2) + current_state = hidden_states[None, top_x].reshape(-1, hidden_dim) + current_hidden_states = expert_layer(current_state) * routing_weights[top_x, idx, None] + + # However `index_add_` only support torch tensors for indexing so we'll use + # the `top_x` tensor here. + final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states.dtype)) + final_hidden_states = final_hidden_states.reshape(batch_size, sequence_length, hidden_dim) + return final_hidden_states + hidden_states_mlp + + +class HunYuanMoEV1DecoderLayer(GradientCheckpointingLayer): + def __init__(self, config: HunYuanMoEV1Config, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + self.self_attn = HunYuanMoEV1Attention(config=config, layer_idx=layer_idx) + self.mlp = HunYuanMoEV1Moe(config, layer_idx=layer_idx) + self.input_layernorm = HunYuanMoEV1RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = HunYuanMoEV1RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.layer_idx = layer_idx + + @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + **kwargs: Unpack[TransformersKwargs], + ) -> torch.Tensor: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + # Self Attention + hidden_states, _ = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + **kwargs, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + return hidden_states + + +@auto_docstring +class HunYuanMoEV1PreTrainedModel(PreTrainedModel): + config: HunYuanMoEV1Config + base_model_prefix = "model" + supports_gradient_checkpointing = True + 
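As an aside before the rest of the pretrained-model plumbing: the routing loop in `HunYuanMoEV1Moe.forward` above (softmax over router logits, top-k selection, renormalisation, one-hot expert mask, per-expert gather/scatter) can be exercised in isolation. A toy sketch with made-up shapes and randomly initialised experts:

```python
import torch
import torch.nn.functional as F

torch.manual_seed(0)
num_tokens, hidden, num_experts, top_k = 6, 4, 3, 2
x = torch.randn(num_tokens, hidden)
router_logits = torch.randn(num_tokens, num_experts)
experts = torch.nn.ModuleList(torch.nn.Linear(hidden, hidden) for _ in range(num_experts))

weights = F.softmax(router_logits, dim=-1)
weights, selected = torch.topk(weights, top_k, dim=-1)   # (tokens, top_k)
weights = weights / weights.sum(dim=-1, keepdim=True)    # renormalise over the chosen experts

out = torch.zeros_like(x)
# (experts, top_k, tokens): expert_mask[e] says which tokens picked expert e and in which slot
expert_mask = F.one_hot(selected, num_classes=num_experts).permute(2, 1, 0)
for e in range(num_experts):
    slot, tok = torch.where(expert_mask[e])
    if tok.numel():
        out[tok] += experts[e](x[tok]) * weights[tok, slot, None]

print(out.shape)  # torch.Size([6, 4])
```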
_no_split_modules = ["HunYuanMoEV1DecoderLayer"] + _skip_keys_device_placement = ["past_key_values"] + _supports_flash_attn = True + _supports_sdpa = True + _supports_flex_attn = True + _can_compile_fullgraph = False + _supports_attention_backend = True + _can_record_outputs = { + "hidden_states": HunYuanMoEV1DecoderLayer, + "attentions": HunYuanMoEV1Attention, + } + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +class HunYuanMoEV1RotaryEmbedding(nn.Module): + inv_freq: torch.Tensor # fix linting for `register_buffer` + + def __init__(self, config: HunYuanMoEV1Config, device=None): + super().__init__() + # BC: "rope_type" was originally "type" + if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): + self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) + else: + self.rope_type = "default" + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + if self.rope_type == "dynamic" and config.rope_scaling["alpha"]: + # DynamicNTKAlphaRotary + self.dim = config.head_dim + base = config.rope_theta * config.rope_scaling.get("alpha") ** (self.dim / (self.dim - 2)) + inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + self.attention_scaling = 1.0 + else: + inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = self.inv_freq + + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) + def forward(self, x, position_ids): + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) + position_ids_expanded = position_ids[:, None, :].float() + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() * self.attention_scaling + sin = emb.sin() * self.attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +@auto_docstring +class HunYuanMoEV1Model(HunYuanMoEV1PreTrainedModel): + def __init__(self, config: HunYuanMoEV1Config): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList( + [HunYuanMoEV1DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.norm = HunYuanMoEV1RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.rotary_emb = HunYuanMoEV1RotaryEmbedding(config=config) + self.gradient_checkpointing = False + + # Initialize weights and apply final processing + self.post_init() + + @check_model_inputs + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + cache_position: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> BaseModelOutputWithPast: + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds: torch.Tensor = self.embed_tokens(input_ids) + + if use_cache and past_key_values is None: + past_key_values = DynamicCache(config=self.config) + + if cache_position is None: + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + cache_position: torch.Tensor = torch.arange( + past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device + ) + + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + causal_mask = create_causal_mask( + config=self.config, + input_embeds=inputs_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + past_key_values=past_key_values, + position_ids=position_ids, + ) + + hidden_states = inputs_embeds + position_embeddings = self.rotary_emb(hidden_states, position_ids) + + for decoder_layer in self.layers[: self.config.num_hidden_layers]: + hidden_states = decoder_layer( + hidden_states, + attention_mask=causal_mask, + position_ids=position_ids, + past_key_values=past_key_values, + cache_position=cache_position, + position_embeddings=position_embeddings, + **kwargs, + ) + + hidden_states = self.norm(hidden_states) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=past_key_values, + ) + + +@auto_docstring +class HunYuanMoEV1ForCausalLM(HunYuanMoEV1PreTrainedModel, GenerationMixin): + _tied_weights_keys = ["lm_head.weight"] + _tp_plan = {"lm_head": "colwise_rep"} + _pp_plan = {"lm_head": (["hidden_states"], ["logits"])} + + def 
__init__(self, config): + super().__init__(config) + self.model = HunYuanMoEV1Model(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **kwargs: Unpack[TransformersKwargs], + ) -> CausalLMOutputWithPast: + r""" + Example: + + ```python + >>> from transformers import AutoTokenizer, HunYuanMoEV1ForCausalLM + + >>> model = HunYuanMoEV1ForCausalLM.from_pretrained("meta-hunyuan_v1_moe/HunYuanMoEV1-2-7b-hf") + >>> tokenizer = AutoTokenizer.from_pretrained("meta-hunyuan_v1_moe/HunYuanMoEV1-2-7b-hf") + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + outputs: BaseModelOutputWithPast = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs.last_hidden_state + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) + + loss = None + if labels is not None: + loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs) + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class HunYuanMoEV1ForSequenceClassification(GenericForSequenceClassification, HunYuanMoEV1PreTrainedModel): + pass + + +__all__ = [ + "HunYuanMoEV1ForCausalLM", + "HunYuanMoEV1Model", + "HunYuanMoEV1PreTrainedModel", + "HunYuanMoEV1ForSequenceClassification", +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py new file mode 100644 index 0000000000000000000000000000000000000000..a72d6268fe7021b472a141beee79ea3eadca5c84 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py @@ -0,0 +1,273 @@ +# coding=utf-8 +# Copyright (C) 2025 THL A29 Limited, a Tencent company and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch HunYuanMoEV1 model.""" + +from typing import Callable, Optional + +import torch +import torch.nn.functional as F +from torch import nn + +from transformers.cache_utils import Cache +from transformers.utils import ( + logging, +) + +from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS +from ...processing_utils import Unpack +from ...utils import TransformersKwargs +from ..llama.modeling_llama import ( + LlamaAttention, + LlamaDecoderLayer, + LlamaForCausalLM, + LlamaForSequenceClassification, + LlamaMLP, + LlamaModel, + LlamaPreTrainedModel, + LlamaRMSNorm, + apply_rotary_pos_emb, + eager_attention_forward, +) +from .configuration_hunyuan_v1_moe import HunYuanMoEV1Config + + +logger = logging.get_logger(__name__) + + +class HunYuanMoEV1RMSNorm(LlamaRMSNorm): + pass + + +class HunYuanMoEV1MLP(LlamaMLP): + def __init__(self, config: HunYuanMoEV1Config, layer_idx=None, is_shared_mlp=False): + super().__init__(config) + self.layer_idx = layer_idx + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + + +class HunYuanMoEV1Attention(LlamaAttention): + def __init__(self, config: HunYuanMoEV1Config, layer_idx: int): + super().__init__(config, layer_idx) + self.query_layernorm = HunYuanMoEV1RMSNorm(self.head_dim, eps=config.rms_norm_eps) + self.key_layernorm = HunYuanMoEV1RMSNorm(self.head_dim, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + attention_mask: Optional[torch.Tensor], + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> tuple[torch.Tensor, torch.Tensor]: + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, self.head_dim) + + query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2) + key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2) + value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + query_states = self.query_layernorm(query_states) + key_states = self.key_layernorm(key_states) + + if past_key_values is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=0.0 if 
not self.training else self.attention_dropout, + scaling=self.scaling, + **kwargs, + ) + + attn_output = attn_output.reshape(*input_shape, -1).contiguous() + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights + + +class HunYuanMoEV1Gate(nn.Module): + def __init__(self, config: HunYuanMoEV1Config, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + num_experts = config.num_experts if isinstance(config.num_experts, int) else config.num_experts[layer_idx] + self.wg = nn.Linear(config.hidden_size, num_experts, bias=False, dtype=torch.float32) + + def forward(self, hidden_states): + bsz, seq_len, hidden_size = hidden_states.shape + hidden_states = hidden_states.reshape(-1, hidden_size) + if self.wg.weight.dtype == torch.float32: + hidden_states = hidden_states.float() + logits = self.wg(hidden_states) + return logits + + +class HunYuanMoEV1Moe(nn.Module): + def __init__(self, config: HunYuanMoEV1Config, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + self.num_experts = config.num_experts if isinstance(config.num_experts, int) else config.num_experts[layer_idx] + self.top_k = config.moe_topk if isinstance(config.moe_topk, int) else config.moe_topk[layer_idx] + self.gate = HunYuanMoEV1Gate(config, layer_idx=layer_idx) + # self.wg = nn.Linear(config.hidden_size, config.num_experts, bias=False, dtype=torch.float32) + self.experts = nn.ModuleList( + [HunYuanMoEV1MLP(config, layer_idx=layer_idx, is_shared_mlp=False) for _ in range(self.num_experts)] + ) + + self.shared_mlp = HunYuanMoEV1MLP(config, layer_idx=layer_idx, is_shared_mlp=True) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + batch_size, sequence_length, hidden_dim = hidden_states.shape + hidden_states_mlp = self.shared_mlp(hidden_states) + router_logits = self.gate(hidden_states) + hidden_states = hidden_states.view(-1, hidden_dim) + # router_logits: (batch * sequence_length, n_experts) + + routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float) + routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1) + routing_weights /= routing_weights.sum(dim=-1, keepdim=True) + # we cast back to the input dtype + routing_weights = routing_weights.to(hidden_states.dtype) + + final_hidden_states = torch.zeros( + (batch_size * sequence_length, hidden_dim), dtype=hidden_states.dtype, device=hidden_states.device + ) + + # One hot encode the selected experts to create an expert mask + # this will be used to easily index which expert is going to be sollicitated + expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0) + + # Loop over all available experts in the model and perform the computation on each expert + expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero() + for expert_idx in expert_hit: + expert_layer = self.experts[expert_idx] + idx, top_x = torch.where(expert_mask[expert_idx].squeeze(0)) + + # Index the correct hidden states and compute the expert hidden state for + # the current expert. We need to make sure to multiply the output hidden + # states by `routing_weights` on the corresponding tokens (top-1 and top-2) + current_state = hidden_states[None, top_x].reshape(-1, hidden_dim) + current_hidden_states = expert_layer(current_state) * routing_weights[top_x, idx, None] + + # However `index_add_` only support torch tensors for indexing so we'll use + # the `top_x` tensor here. 
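+            # Illustrative example of the scatter step below: if, say, top_x = tensor([0, 3]),
+            # `index_add_` adds current_hidden_states[0] into final_hidden_states[0] and
+            # current_hidden_states[1] into final_hidden_states[3], i.e. each expert's output
+            # is accumulated back onto exactly the token rows that were routed to that expert.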
+ final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states.dtype)) + final_hidden_states = final_hidden_states.reshape(batch_size, sequence_length, hidden_dim) + return final_hidden_states + hidden_states_mlp + + +class HunYuanMoEV1DecoderLayer(LlamaDecoderLayer): + def __init__(self, config: HunYuanMoEV1Config, layer_idx: int): + super().__init__(config, layer_idx) + self.hidden_size = config.hidden_size + self.self_attn = HunYuanMoEV1Attention(config=config, layer_idx=layer_idx) + self.mlp = HunYuanMoEV1Moe(config, layer_idx=layer_idx) + self.input_layernorm = HunYuanMoEV1RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = HunYuanMoEV1RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.layer_idx = layer_idx + + +class HunYuanMoEV1PreTrainedModel(LlamaPreTrainedModel): + _can_compile_fullgraph = False + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +class HunYuanMoEV1RotaryEmbedding(nn.Module): + inv_freq: torch.Tensor # fix linting for `register_buffer` + + def __init__(self, config: HunYuanMoEV1Config, device=None): + super().__init__() + # BC: "rope_type" was originally "type" + if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): + self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) + else: + self.rope_type = "default" + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + if self.rope_type == "dynamic" and config.rope_scaling["alpha"]: + # DynamicNTKAlphaRotary + self.dim = config.head_dim + base = config.rope_theta * config.rope_scaling.get("alpha") ** (self.dim / (self.dim - 2)) + inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + self.attention_scaling = 1.0 + else: + inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = self.inv_freq + + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) + def forward(self, x, position_ids): + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) + position_ids_expanded = position_ids[:, None, :].float() + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() * self.attention_scaling + sin = emb.sin() * self.attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +class HunYuanMoEV1Model(LlamaModel): + pass + + +class HunYuanMoEV1ForCausalLM(LlamaForCausalLM): + pass + + +class HunYuanMoEV1ForSequenceClassification(LlamaForSequenceClassification): + pass + + +__all__ = [ + "HunYuanMoEV1ForCausalLM", + "HunYuanMoEV1Model", + "HunYuanMoEV1PreTrainedModel", + "HunYuanMoEV1ForSequenceClassification", +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/ijepa/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/ijepa/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8923af1de116219405577646ae2dcedee5602ccc --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/ijepa/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_ijepa import * + from .modeling_ijepa import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/ijepa/configuration_ijepa.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/ijepa/configuration_ijepa.py new file mode 100644 index 0000000000000000000000000000000000000000..5f528adad0d55711f40ea21f58e1e0196822f449 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/ijepa/configuration_ijepa.py @@ -0,0 +1,121 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""I-JEPA model configuration""" + +from ...configuration_utils import PretrainedConfig + + +class IJepaConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`IJepaModel`]. It is used to instantiate an IJEPA + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the I-JEPA + [facebook/ijepa_vith14_1k](https://huggingface.co/facebook/ijepa_vith14_1k) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 16): + The size (resolution) of each patch. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether to add a bias to the queries, keys and values. + pooler_output_size (`int`, *optional*): + Dimensionality of the pooler layer. If None, defaults to `hidden_size`. + pooler_act (`str`, *optional*, defaults to `"tanh"`): + The activation function to be used by the pooler. Keys of ACT2FN are supported for Flax and + Pytorch, and elements of https://www.tensorflow.org/api_docs/python/tf/keras/activations are + supported for Tensorflow. 
+ + Example: + + ```python + >>> from transformers import IJepaConfig, IJepaModel + + >>> # Initializing a IJEPA ijepa-base-patch16-224 style configuration + >>> configuration = IJepaConfig() + + >>> # Initializing a model (with random weights) from the ijepa-base-patch16-224 style configuration + >>> model = IJepaModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "ijepa" + + def __init__( + self, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + initializer_range=0.02, + layer_norm_eps=1e-12, + image_size=224, + patch_size=16, + num_channels=3, + qkv_bias=True, + pooler_output_size=None, + pooler_act="tanh", + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.qkv_bias = qkv_bias + self.pooler_output_size = pooler_output_size if pooler_output_size else hidden_size + self.pooler_act = pooler_act + + +__all__ = ["IJepaConfig"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/ijepa/modeling_ijepa.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/ijepa/modeling_ijepa.py new file mode 100644 index 0000000000000000000000000000000000000000..b26a78e49ab0e251cd5bdee256a01b8d9da61451 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/ijepa/modeling_ijepa.py @@ -0,0 +1,540 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/ijepa/modular_ijepa.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_ijepa.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +import collections.abc +from typing import Callable, Optional, Union + +import torch +import torch.nn as nn + +from ...activations import ACT2FN +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...processing_utils import Unpack +from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import TransformersKwargs, auto_docstring, torch_int +from ...utils.generic import can_return_tuple, check_model_inputs +from .configuration_ijepa import IJepaConfig + + +class IJepaPatchEmbeddings(nn.Module): + """ + This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial + `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a + Transformer. 
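+    For example, with the default `image_size=224` and `patch_size=16`, the convolutional
+    projection yields (224 // 16) * (224 // 16) = 196 patches, so `seq_length` is 196.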
+ """ + + def __init__(self, config: IJepaConfig): + super().__init__() + image_size, patch_size = config.image_size, config.patch_size + num_channels, hidden_size = config.num_channels, config.hidden_size + + image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size) + patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.num_patches = num_patches + + self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size) + + def forward(self, pixel_values: torch.Tensor, interpolate_pos_encoding: bool = False) -> torch.Tensor: + batch_size, num_channels, height, width = pixel_values.shape + if num_channels != self.num_channels: + raise ValueError( + "Make sure that the channel dimension of the pixel values match with the one set in the configuration." + f" Expected {self.num_channels} but got {num_channels}." + ) + if not interpolate_pos_encoding: + if height != self.image_size[0] or width != self.image_size[1]: + raise ValueError( + f"Input image size ({height}*{width}) doesn't match model" + f" ({self.image_size[0]}*{self.image_size[1]})." + ) + embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2) + return embeddings + + +class IJepaEmbeddings(nn.Module): + """ + Construct the CLS token, position and patch embeddings. Optionally, also the mask token. + """ + + def __init__(self, config: IJepaConfig, use_mask_token: bool = False) -> None: + super().__init__() + self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) if use_mask_token else None + self.patch_embeddings = IJepaPatchEmbeddings(config) + num_patches = self.patch_embeddings.num_patches + self.position_embeddings = nn.Parameter(torch.randn(1, num_patches, config.hidden_size)) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.patch_size = config.patch_size + self.config = config + + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution + images. This method is also adapted to support torch.jit tracing. 
+ + Adapted from: + - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and + - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 + """ + + num_patches = embeddings.shape[1] + num_positions = self.position_embeddings.shape[1] + + # always interpolate when tracing to ensure the exported model works for dynamic input shapes + if not torch.jit.is_tracing() and num_patches == num_positions and height == width: + return self.position_embeddings + + patch_pos_embed = self.position_embeddings + + dim = embeddings.shape[-1] + + new_height = height // self.patch_size + new_width = width // self.patch_size + + sqrt_num_positions = torch_int(num_positions**0.5) + patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed, + size=(new_height, new_width), + mode="bicubic", + align_corners=False, + ) + + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + + return patch_pos_embed + + def forward( + self, + pixel_values: torch.Tensor, + bool_masked_pos: Optional[torch.BoolTensor] = None, + interpolate_pos_encoding: bool = False, + ) -> torch.Tensor: + batch_size, _, height, width = pixel_values.shape + embeddings = self.patch_embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) + + if bool_masked_pos is not None: + seq_length = embeddings.shape[1] + mask_tokens = self.mask_token.expand(batch_size, seq_length, -1) + # replace the masked visual tokens by mask_tokens + mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens) + embeddings = embeddings * (1.0 - mask) + mask_tokens * mask + + # add positional encoding to each token + if interpolate_pos_encoding: + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) + else: + embeddings = embeddings + self.position_embeddings + + embeddings = self.dropout(embeddings) + + return embeddings + + +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + dropout: float = 0.0, + **kwargs, +): + # Take the dot product between "query" and "key" to get the raw attention scores. + attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling + + # Normalize the attention scores to probabilities. + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + + # Mask heads if we want to + if attention_mask is not None: + attn_weights = attn_weights * attention_mask + + attn_output = torch.matmul(attn_weights, value) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +class IJepaSelfAttention(nn.Module): + def __init__(self, config: IJepaConfig): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size {config.hidden_size} is not a multiple of the number of attention " + f"heads {config.num_attention_heads}." 
+ ) + + self.config = config + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.dropout_prob = config.attention_probs_dropout_prob + self.scaling = self.attention_head_size**-0.5 + self.is_causal = False + + self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + + def forward( + self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None + ) -> tuple[torch.Tensor, torch.Tensor]: + batch_size = hidden_states.shape[0] + new_shape = batch_size, -1, self.num_attention_heads, self.attention_head_size + + key_layer = self.key(hidden_states).view(*new_shape).transpose(1, 2) + value_layer = self.value(hidden_states).view(*new_shape).transpose(1, 2) + query_layer = self.query(hidden_states).view(*new_shape).transpose(1, 2) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + context_layer, attention_probs = attention_interface( + self, + query_layer, + key_layer, + value_layer, + head_mask, + is_causal=self.is_causal, + scaling=self.scaling, + dropout=0.0 if not self.training else self.dropout_prob, + ) + + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.reshape(new_context_layer_shape) + + return context_layer, attention_probs + + +class IJepaSelfOutput(nn.Module): + """ + The residual connection is defined in IJepaLayer instead of here (as is the case with other models), due to the + layernorm applied before each block. 
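+    In other words, this module only applies the output projection and dropout; the
+    `attention_output + hidden_states` sum itself happens in `IJepaLayer.forward`.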
+ """ + + def __init__(self, config: IJepaConfig): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + return hidden_states + + +class IJepaAttention(nn.Module): + def __init__(self, config: IJepaConfig): + super().__init__() + self.attention = IJepaSelfAttention(config) + self.output = IJepaSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads: set[int]): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.attention.query = prune_linear_layer(self.attention.query, index) + self.attention.key = prune_linear_layer(self.attention.key, index) + self.attention.value = prune_linear_layer(self.attention.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads) + self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward(self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None) -> torch.Tensor: + self_attn_output, _ = self.attention(hidden_states, head_mask) + output = self.output(self_attn_output, hidden_states) + return output + + +class IJepaIntermediate(nn.Module): + def __init__(self, config: IJepaConfig): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class IJepaOutput(nn.Module): + def __init__(self, config: IJepaConfig): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = hidden_states + input_tensor + return hidden_states + + +class IJepaLayer(GradientCheckpointingLayer): + """This corresponds to the Block class in the timm implementation.""" + + def __init__(self, config: IJepaConfig): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = IJepaAttention(config) + self.intermediate = IJepaIntermediate(config) + self.output = IJepaOutput(config) + self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None) -> torch.Tensor: + hidden_states_norm = self.layernorm_before(hidden_states) + attention_output = self.attention(hidden_states_norm, head_mask) + + # first residual connection + 
hidden_states = attention_output + hidden_states + + # in IJepa, layernorm is also applied after self-attention + layer_output = self.layernorm_after(hidden_states) + layer_output = self.intermediate(layer_output) + + # second residual connection is done here + layer_output = self.output(layer_output, hidden_states) + + return layer_output + + +@auto_docstring +class IJepaPreTrainedModel(PreTrainedModel): + config: IJepaConfig + base_model_prefix = "ijepa" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + _no_split_modules = ["IJepaEmbeddings", "IJepaLayer"] + _supports_sdpa = True + _supports_flash_attn = True + _supports_flex_attn = True + _supports_attention_backend = True + _can_record_outputs = { + "hidden_states": IJepaLayer, + "attentions": IJepaSelfAttention, + } + + def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None: + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d)): + # Upcast the input in `fp32` and cast it back to desired `dtype` to avoid + # `trunc_normal_cpu` not implemented in `half` issues + module.weight.data = nn.init.trunc_normal_( + module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range + ).to(module.weight.dtype) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, IJepaEmbeddings): + module.position_embeddings.data = nn.init.trunc_normal_( + module.position_embeddings.data.to(torch.float32), + mean=0.0, + std=self.config.initializer_range, + ).to(module.position_embeddings.dtype) + if module.mask_token is not None: + module.mask_token.data.zero_() + + +class IJepaEncoder(nn.Module): + def __init__(self, config: IJepaConfig): + super().__init__() + self.config = config + self.layer = nn.ModuleList([IJepaLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward(self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None) -> BaseModelOutput: + for i, layer_module in enumerate(self.layer): + layer_head_mask = head_mask[i] if head_mask is not None else None + hidden_states = layer_module(hidden_states, layer_head_mask) + + return BaseModelOutput(last_hidden_state=hidden_states) + + +class IJepaPooler(nn.Module): + def __init__(self, config: IJepaConfig): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.pooler_output_size) + self.activation = ACT2FN[config.pooler_act] + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +@auto_docstring +class IJepaModel(IJepaPreTrainedModel): + def __init__(self, config: IJepaConfig, add_pooling_layer: bool = False, use_mask_token: bool = False): + r""" + add_pooling_layer (bool, *optional*, defaults to `True`): + Whether to add a pooling layer + use_mask_token (`bool`, *optional*, defaults to `False`): + Whether to use a mask token for masked image modeling. 
+ """ + super().__init__(config) + self.config = config + self.embeddings = IJepaEmbeddings(config, use_mask_token=use_mask_token) + self.encoder = IJepaEncoder(config) + + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.pooler = IJepaPooler(config) if add_pooling_layer else None + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> IJepaPatchEmbeddings: + return self.embeddings.patch_embeddings + + def _prune_heads(self, heads_to_prune: dict[int, list[int]]): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @check_model_inputs(tie_last_hidden_states=False) + @auto_docstring + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + bool_masked_pos: Optional[torch.BoolTensor] = None, + head_mask: Optional[torch.Tensor] = None, + interpolate_pos_encoding: Optional[bool] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> BaseModelOutputWithPooling: + r""" + bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*): + Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). + """ + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + # TODO: maybe have a cleaner way to cast the input (from `ImageProcessor` side?) + expected_dtype = self.embeddings.patch_embeddings.projection.weight.dtype + if pixel_values.dtype != expected_dtype: + pixel_values = pixel_values.to(expected_dtype) + + embedding_output = self.embeddings( + pixel_values, bool_masked_pos=bool_masked_pos, interpolate_pos_encoding=interpolate_pos_encoding + ) + + encoder_outputs: BaseModelOutput = self.encoder(embedding_output, head_mask=head_mask) + + sequence_output = encoder_outputs.last_hidden_state + sequence_output = self.layernorm(sequence_output) + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + return BaseModelOutputWithPooling(last_hidden_state=sequence_output, pooler_output=pooled_output) + + +@auto_docstring( + custom_intro=""" + IJepa Model transformer with an image classification head on top (a linear layer on top of the final hidden states) + e.g. for ImageNet. + + + + Note that it's possible to fine-tune IJepa on higher resolution images than the ones it has been trained on, by + setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained + position embeddings to the higher resolution. 
+ + + """ +) +class IJepaForImageClassification(IJepaPreTrainedModel): + def __init__(self, config: IJepaConfig): + super().__init__(config) + + self.num_labels = config.num_labels + self.ijepa = IJepaModel(config, add_pooling_layer=False) + + # Classifier head + self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity() + + # Initialize weights and apply final processing + self.post_init() + + @can_return_tuple + @auto_docstring + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + interpolate_pos_encoding: Optional[bool] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> ImageClassifierOutput: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the image classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + + outputs: BaseModelOutputWithPooling = self.ijepa( + pixel_values, + head_mask=head_mask, + interpolate_pos_encoding=interpolate_pos_encoding, + **kwargs, + ) + sequence_output = outputs.last_hidden_state + logits = self.classifier(sequence_output.mean(dim=1)) + + loss = None + if labels is not None: + loss = self.loss_function(labels, logits, self.config, **kwargs) + + return ImageClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +__all__ = ["IJepaPreTrainedModel", "IJepaModel", "IJepaForImageClassification"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/ijepa/modular_ijepa.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/ijepa/modular_ijepa.py new file mode 100644 index 0000000000000000000000000000000000000000..7b8e6e152f3c115db4a1712895239467ea45e7df --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/ijepa/modular_ijepa.py @@ -0,0 +1,186 @@ +from typing import Optional, Union + +import torch +import torch.nn as nn + +from transformers.models.ijepa.configuration_ijepa import IJepaConfig + +from ...modeling_outputs import BaseModelOutputWithPooling, ImageClassifierOutput +from ...processing_utils import Unpack +from ...utils import TransformersKwargs, auto_docstring, torch_int +from ..vit.modeling_vit import ViTEmbeddings, ViTForImageClassification, ViTModel, ViTPreTrainedModel + + +class IJepaEmbeddings(ViTEmbeddings): + def __init__(self, config: IJepaConfig, use_mask_token: bool = False) -> None: + super().__init__(config, use_mask_token) + # Remove cls_token from IJepaEmbeddings, as it is not used in the model + del self.cls_token + num_patches = self.patch_embeddings.num_patches + self.position_embeddings = nn.Parameter(torch.randn(1, num_patches, config.hidden_size)) + + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution + images. This method is also adapted to support torch.jit tracing. 
+ + Adapted from: + - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and + - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 + """ + + num_patches = embeddings.shape[1] + num_positions = self.position_embeddings.shape[1] + + # always interpolate when tracing to ensure the exported model works for dynamic input shapes + if not torch.jit.is_tracing() and num_patches == num_positions and height == width: + return self.position_embeddings + + patch_pos_embed = self.position_embeddings + + dim = embeddings.shape[-1] + + new_height = height // self.patch_size + new_width = width // self.patch_size + + sqrt_num_positions = torch_int(num_positions**0.5) + patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed, + size=(new_height, new_width), + mode="bicubic", + align_corners=False, + ) + + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + + return patch_pos_embed + + def forward( + self, + pixel_values: torch.Tensor, + bool_masked_pos: Optional[torch.BoolTensor] = None, + interpolate_pos_encoding: bool = False, + ) -> torch.Tensor: + batch_size, _, height, width = pixel_values.shape + embeddings = self.patch_embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) + + if bool_masked_pos is not None: + seq_length = embeddings.shape[1] + mask_tokens = self.mask_token.expand(batch_size, seq_length, -1) + # replace the masked visual tokens by mask_tokens + mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens) + embeddings = embeddings * (1.0 - mask) + mask_tokens * mask + + # add positional encoding to each token + if interpolate_pos_encoding: + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) + else: + embeddings = embeddings + self.position_embeddings + + embeddings = self.dropout(embeddings) + + return embeddings + + +@auto_docstring +class IJepaPreTrainedModel(ViTPreTrainedModel): + def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None: + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d)): + # Upcast the input in `fp32` and cast it back to desired `dtype` to avoid + # `trunc_normal_cpu` not implemented in `half` issues + module.weight.data = nn.init.trunc_normal_( + module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range + ).to(module.weight.dtype) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, IJepaEmbeddings): + module.position_embeddings.data = nn.init.trunc_normal_( + module.position_embeddings.data.to(torch.float32), + mean=0.0, + std=self.config.initializer_range, + ).to(module.position_embeddings.dtype) + if module.mask_token is not None: + module.mask_token.data.zero_() + + +class IJepaModel(IJepaPreTrainedModel, ViTModel): + def __init__(self, config: IJepaConfig, add_pooling_layer: bool = False, use_mask_token: bool = False): + r""" + add_pooling_layer (bool, *optional*, defaults to `True`): + Whether to add a pooling layer + use_mask_token (`bool`, *optional*, defaults to `False`): + Whether to use a mask token for masked image modeling. 
+ """ + super().__init__(config) + self.config = config + self.embeddings = IJepaEmbeddings(config, use_mask_token=use_mask_token) + + +@auto_docstring( + custom_intro=""" + IJepa Model transformer with an image classification head on top (a linear layer on top of the final hidden states) + e.g. for ImageNet. + + + + Note that it's possible to fine-tune IJepa on higher resolution images than the ones it has been trained on, by + setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained + position embeddings to the higher resolution. + + + """ +) +class IJepaForImageClassification(IJepaPreTrainedModel, ViTForImageClassification): + def __init__(self, config: IJepaConfig): + super().__init__(config) + self.ijepa = IJepaModel(config, add_pooling_layer=False) + self.post_init() + + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + interpolate_pos_encoding: Optional[bool] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> ImageClassifierOutput: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the image classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + + outputs: BaseModelOutputWithPooling = self.ijepa( + pixel_values, + head_mask=head_mask, + interpolate_pos_encoding=interpolate_pos_encoding, + **kwargs, + ) + sequence_output = outputs.last_hidden_state + logits = self.classifier(sequence_output.mean(dim=1)) + + loss = None + if labels is not None: + loss = self.loss_function(labels, logits, self.config, **kwargs) + + return ImageClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +__all__ = [ + "IJepaPreTrainedModel", + "IJepaModel", + "IJepaForImageClassification", +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/imagegpt/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/imagegpt/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..098ffb6296f547e6dd9f1f990d21e28bc5cb0f7b --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/imagegpt/__init__.py @@ -0,0 +1,30 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_imagegpt import * + from .feature_extraction_imagegpt import * + from .image_processing_imagegpt import * + from .image_processing_imagegpt_fast import * + from .modeling_imagegpt import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/imagegpt/configuration_imagegpt.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/imagegpt/configuration_imagegpt.py new file mode 100644 index 0000000000000000000000000000000000000000..8cfa8d5e47826e4323f74b33430fa872005c88ea --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/imagegpt/configuration_imagegpt.py @@ -0,0 +1,200 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""OpenAI ImageGPT configuration""" + +from collections import OrderedDict +from collections.abc import Mapping +from typing import TYPE_CHECKING, Any, Optional + +from ...configuration_utils import PretrainedConfig +from ...onnx import OnnxConfig +from ...utils import logging + + +if TYPE_CHECKING: + from ... import FeatureExtractionMixin, TensorType + +logger = logging.get_logger(__name__) + + +class ImageGPTConfig(PretrainedConfig): + """ + This is the configuration class to store the configuration of a [`ImageGPTModel`] or a [`TFImageGPTModel`]. It is + used to instantiate a GPT-2 model according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar configuration to that of the ImageGPT + [openai/imagegpt-small](https://huggingface.co/openai/imagegpt-small) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 512): + Vocabulary size of the GPT-2 model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`ImageGPTModel`] or [`TFImageGPTModel`]. + n_positions (`int`, *optional*, defaults to 32*32): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + n_embd (`int`, *optional*, defaults to 512): + Dimensionality of the embeddings and hidden states. + n_layer (`int`, *optional*, defaults to 24): + Number of hidden layers in the Transformer encoder. + n_head (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer encoder. + n_inner (`int`, *optional*, defaults to None): + Dimensionality of the inner feed-forward layers. 
`None` will set it to 4 times n_embd + activation_function (`str`, *optional*, defaults to `"quick_gelu"`): + Activation function (can be one of the activation functions defined in src/transformers/activations.py). + Defaults to "quick_gelu". + resid_pdrop (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + embd_pdrop (`int`, *optional*, defaults to 0.1): + The dropout ratio for the embeddings. + attn_pdrop (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention. + layer_norm_epsilon (`float`, *optional*, defaults to 1e-5): + The epsilon to use in the layer normalization layers. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + scale_attn_weights (`bool`, *optional*, defaults to `True`): + Scale attention weights by dividing by sqrt(hidden_size).. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). + scale_attn_by_inverse_layer_idx (`bool`, *optional*, defaults to `False`): + Whether to additionally scale attention weights by `1 / layer_idx + 1`. + reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`): + Whether to scale keys (K) prior to computing attention (dot-product) and upcast attention + dot-product/softmax to float() when training with mixed precision. + + Example: + + ```python + >>> from transformers import ImageGPTConfig, ImageGPTModel + + >>> # Initializing a ImageGPT configuration + >>> configuration = ImageGPTConfig() + + >>> # Initializing a model (with random weights) from the configuration + >>> model = ImageGPTModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "imagegpt" + keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = { + "hidden_size": "n_embd", + "max_position_embeddings": "n_positions", + "num_attention_heads": "n_head", + "num_hidden_layers": "n_layer", + } + + def __init__( + self, + vocab_size=512 + 1, # add one for start of sentence (sos) token + n_positions=32 * 32, + n_embd=512, + n_layer=24, + n_head=8, + n_inner=None, + activation_function="quick_gelu", + resid_pdrop=0.1, + embd_pdrop=0.1, + attn_pdrop=0.1, + layer_norm_epsilon=1e-5, + initializer_range=0.02, + scale_attn_weights=True, + use_cache=True, + tie_word_embeddings=False, + scale_attn_by_inverse_layer_idx=False, + reorder_and_upcast_attn=False, + **kwargs, + ): + self.vocab_size = vocab_size + self.n_positions = n_positions + self.n_embd = n_embd + self.n_layer = n_layer + self.n_head = n_head + self.n_inner = n_inner + self.activation_function = activation_function + self.resid_pdrop = resid_pdrop + self.embd_pdrop = embd_pdrop + self.attn_pdrop = attn_pdrop + self.layer_norm_epsilon = layer_norm_epsilon + self.initializer_range = initializer_range + self.scale_attn_weights = scale_attn_weights + self.use_cache = use_cache + self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx + self.reorder_and_upcast_attn = reorder_and_upcast_attn + self.tie_word_embeddings = tie_word_embeddings + + super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) + + +class ImageGPTOnnxConfig(OnnxConfig): + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + return OrderedDict( + [ + ("input_ids", {0: "batch", 1: "sequence"}), + ] + ) + + def 
generate_dummy_inputs( + self, + preprocessor: "FeatureExtractionMixin", + batch_size: int = 1, + seq_length: int = -1, + is_pair: bool = False, + framework: Optional["TensorType"] = None, + num_channels: int = 3, + image_width: int = 32, + image_height: int = 32, + ) -> Mapping[str, Any]: + """ + Generate inputs to provide to the ONNX exporter for the specific framework. + + Args: + preprocessor ([`PreTrainedTokenizerBase`] or [`FeatureExtractionMixin`]): + The preprocessor associated with this model configuration. + batch_size (`int`, *optional*, defaults to 1): + The batch size to export the model for. + seq_length (`int`, *optional*, defaults to -1): + The sequence length to export the model for (-1 means dynamic axis). + is_pair (`bool`, *optional*, defaults to `False`): + Indicate if the input is a pair (sentence 1, sentence 2). + framework (`TensorType`, *optional*, defaults to `None`): + The framework (PyTorch or TensorFlow) that the tokenizer will generate tensors for. + num_channels (`int`, *optional*, defaults to 3): + The number of channels of the generated images. + image_width (`int`, *optional*, defaults to 32): + The width of the generated images. + image_height (`int`, *optional*, defaults to 32): + The height of the generated images. + + Returns: + Mapping[str, Tensor] holding the kwargs to provide to the model's forward function + """ + + input_image = self._generate_dummy_images(batch_size, num_channels, image_height, image_width) + inputs = dict(preprocessor(images=input_image, return_tensors=framework)) + + return inputs + + +__all__ = ["ImageGPTConfig", "ImageGPTOnnxConfig"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/imagegpt/feature_extraction_imagegpt.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/imagegpt/feature_extraction_imagegpt.py new file mode 100644 index 0000000000000000000000000000000000000000..46787f139f10a0c339e4cea9524d34c00a03ceb6 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/imagegpt/feature_extraction_imagegpt.py @@ -0,0 +1,38 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Feature extractor class for ImageGPT.""" + +import warnings + +from ...utils import logging +from ...utils.import_utils import requires +from .image_processing_imagegpt import ImageGPTImageProcessor + + +logger = logging.get_logger(__name__) + + +@requires(backends=("vision",)) +class ImageGPTFeatureExtractor(ImageGPTImageProcessor): + def __init__(self, *args, **kwargs) -> None: + warnings.warn( + "The class ImageGPTFeatureExtractor is deprecated and will be removed in version 5 of Transformers."
+ " Please use ImageGPTImageProcessor instead.", + FutureWarning, + ) + super().__init__(*args, **kwargs) + + +__all__ = ["ImageGPTFeatureExtractor"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/imagegpt/image_processing_imagegpt.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/imagegpt/image_processing_imagegpt.py new file mode 100644 index 0000000000000000000000000000000000000000..aa2114509f7026d9f266be952dc220f2de1b632d --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/imagegpt/image_processing_imagegpt.py @@ -0,0 +1,314 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Image processor class for ImageGPT.""" + +from typing import Optional, Union + +import numpy as np + +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ...image_transforms import rescale, resize, to_channel_dimension_format +from ...image_utils import ( + ChannelDimension, + ImageInput, + PILImageResampling, + infer_channel_dimension_format, + is_scaled_image, + make_list_of_images, + to_numpy_array, + valid_images, + validate_preprocess_arguments, +) +from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging +from ...utils.import_utils import requires + + +if is_vision_available(): + import PIL + + +logger = logging.get_logger(__name__) + + +def squared_euclidean_distance(a, b): + b = b.T + a2 = np.sum(np.square(a), axis=1) + b2 = np.sum(np.square(b), axis=0) + ab = np.matmul(a, b) + d = a2[:, None] - 2 * ab + b2[None, :] + return d + + +def color_quantize(x, clusters): + x = x.reshape(-1, 3) + d = squared_euclidean_distance(x, clusters) + return np.argmin(d, axis=1) + + +@requires(backends=("vision",)) +class ImageGPTImageProcessor(BaseImageProcessor): + r""" + Constructs a ImageGPT image processor. This image processor can be used to resize images to a smaller resolution + (such as 32x32 or 64x64), normalize them and finally color quantize them to obtain sequences of "pixel values" + (color clusters). + + Args: + clusters (`np.ndarray` or `list[list[int]]`, *optional*): + The color clusters to use, of shape `(n_clusters, 3)` when color quantizing. Can be overridden by `clusters` + in `preprocess`. + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's dimensions to `(size["height"], size["width"])`. Can be overridden by + `do_resize` in `preprocess`. + size (`dict[str, int]` *optional*, defaults to `{"height": 256, "width": 256}`): + Size of the image after resizing. Can be overridden by `size` in `preprocess`. + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`): + Resampling filter to use if resizing the image. Can be overridden by `resample` in `preprocess`. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image pixel value to between [-1, 1]. 
Can be overridden by `do_normalize` in + `preprocess`. + do_color_quantize (`bool`, *optional*, defaults to `True`): + Whether to color quantize the image. Can be overridden by `do_color_quantize` in `preprocess`. + """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + # clusters is a first argument to maintain backwards compatibility with the old ImageGPTImageProcessor + clusters: Optional[Union[list[list[int]], np.ndarray]] = None, + do_resize: bool = True, + size: Optional[dict[str, int]] = None, + resample: PILImageResampling = PILImageResampling.BILINEAR, + do_normalize: bool = True, + do_color_quantize: bool = True, + **kwargs, + ) -> None: + super().__init__(**kwargs) + size = size if size is not None else {"height": 256, "width": 256} + size = get_size_dict(size) + self.clusters = np.array(clusters) if clusters is not None else None + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_normalize = do_normalize + self.do_color_quantize = do_color_quantize + + # Copied from transformers.models.vit.image_processing_vit.ViTImageProcessor.resize + def resize( + self, + image: np.ndarray, + size: dict[str, int], + resample: PILImageResampling = PILImageResampling.BILINEAR, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize an image to `(size["height"], size["width"])`. + + Args: + image (`np.ndarray`): + Image to resize. + size (`dict[str, int]`): + Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): + `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`. + data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + + Returns: + `np.ndarray`: The resized image. + """ + size = get_size_dict(size) + if "height" not in size or "width" not in size: + raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}") + output_size = (size["height"], size["width"]) + return resize( + image, + size=output_size, + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + **kwargs, + ) + + def normalize( + self, + image: np.ndarray, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> np.ndarray: + """ + Normalizes an images' pixel values to between [-1, 1]. + + Args: + image (`np.ndarray`): + Image to normalize. 
+ data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. + """ + image = rescale(image=image, scale=1 / 127.5, data_format=data_format, input_data_format=input_data_format) + image = image - 1 + return image + + @filter_out_non_signature_kwargs() + def preprocess( + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Optional[dict[str, int]] = None, + resample: Optional[PILImageResampling] = None, + do_normalize: Optional[bool] = None, + do_color_quantize: Optional[bool] = None, + clusters: Optional[Union[list[list[int]], np.ndarray]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> PIL.Image.Image: + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_normalize=False`. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`dict[str, int]`, *optional*, defaults to `self.size`): + Size of the image after resizing. + resample (`int`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`, Only + has an effect if `do_resize` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image + do_color_quantize (`bool`, *optional*, defaults to `self.do_color_quantize`): + Whether to color quantize the image. + clusters (`np.ndarray` or `list[list[int]]`, *optional*, defaults to `self.clusters`): + Clusters used to quantize the image of shape `(n_clusters, 3)`. Only has an effect if + `do_color_quantize` is set to `True`. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `ChannelDimension.LAST`: image in (height, width, num_channels) format. + Only has an effect if `do_color_quantize` is set to `False`. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. 
+ """ + do_resize = do_resize if do_resize is not None else self.do_resize + size = size if size is not None else self.size + size = get_size_dict(size) + resample = resample if resample is not None else self.resample + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + do_color_quantize = do_color_quantize if do_color_quantize is not None else self.do_color_quantize + clusters = clusters if clusters is not None else self.clusters + clusters = np.array(clusters) + + images = make_list_of_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + + # Here, normalize() is using a constant factor to divide pixel values. + # hence, the method does not need image_mean and image_std. + validate_preprocess_arguments( + do_resize=do_resize, + size=size, + resample=resample, + ) + + if do_color_quantize and clusters is None: + raise ValueError("Clusters must be specified if do_color_quantize is True.") + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if do_normalize and is_scaled_image(images[0]): + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If you wish to do this, " + "make sure to set `do_normalize` to `False` and that pixel values are between [-1, 1].", + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. + input_data_format = infer_channel_dimension_format(images[0]) + + if do_resize: + images = [ + self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format) + for image in images + ] + + if do_normalize: + images = [self.normalize(image=image, input_data_format=input_data_format) for image in images] + + if do_color_quantize: + images = [to_channel_dimension_format(image, ChannelDimension.LAST, input_data_format) for image in images] + # color quantize from (batch_size, height, width, 3) to (batch_size, height, width) + images = np.array(images) + images = color_quantize(images, clusters).reshape(images.shape[:-1]) + + # flatten to (batch_size, height*width) + batch_size = images.shape[0] + images = images.reshape(batch_size, -1) + + # We need to convert back to a list of images to keep consistent behaviour across processors. 
+ images = list(images) + data = {"input_ids": images} + else: + images = [to_channel_dimension_format(image, data_format, input_data_format) for image in images] + data = {"pixel_values": images} + return BatchFeature(data=data, tensor_type=return_tensors) + + def to_dict(self): + output = super().to_dict() + # Ensure clusters are JSON/equality friendly + if output.get("clusters") is not None and isinstance(output["clusters"], np.ndarray): + output["clusters"] = output["clusters"].tolist() + # Need to set missing keys from slow processor to match the expected behavior in save/load tests compared to fast processor + missing_keys = ["image_mean", "image_std", "rescale_factor", "do_rescale"] + for key in missing_keys: + if key in output: + output[key] = None + + return output + + +__all__ = ["ImageGPTImageProcessor"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/imagegpt/image_processing_imagegpt_fast.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/imagegpt/image_processing_imagegpt_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..7a6bcc53ae1acaefa1f4a923f7d5b422c7a62d8e --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/imagegpt/image_processing_imagegpt_fast.py @@ -0,0 +1,198 @@ +# coding=utf-8 +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Fast Image processor class for ImageGPT.""" + +from typing import Optional, Union + +import numpy as np +import torch +from torchvision.transforms.v2 import functional as F + +from ...image_processing_utils import BatchFeature +from ...image_processing_utils_fast import ( + BaseImageProcessorFast, + DefaultFastImageProcessorKwargs, +) +from ...image_transforms import group_images_by_shape, reorder_images +from ...image_utils import PILImageResampling +from ...processing_utils import Unpack +from ...utils import ( + TensorType, + auto_docstring, +) + + +def squared_euclidean_distance_torch(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor: + """ + Compute squared Euclidean distances between all pixels and clusters. + + Args: + a: (N, 3) tensor of pixel RGB values + b: (M, 3) tensor of cluster RGB values + + Returns: + (N, M) tensor of squared distances + """ + b = b.t() # (3, M) + a2 = torch.sum(a**2, dim=1) # (N,) + b2 = torch.sum(b**2, dim=0) # (M,) + ab = torch.matmul(a, b) # (N, M) + d = a2[:, None] - 2 * ab + b2[None, :] # Squared Euclidean Distance: a^2 - 2ab + b^2 + return d # (N, M) tensor of squared distances + + +def color_quantize_torch(x: torch.Tensor, clusters: torch.Tensor) -> torch.Tensor: + """ + Assign each pixel to its nearest color cluster. 
+ + Args: + x: (H*W, 3) tensor of flattened pixel RGB values + clusters: (n_clusters, 3) tensor of cluster RGB values + + Returns: + (H*W,) tensor of cluster indices + """ + d = squared_euclidean_distance_torch(x, clusters) + return torch.argmin(d, dim=1) + + +class ImageGPTFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): + """ + clusters (`np.ndarray` or `list[list[int]]` or `torch.Tensor`, *optional*): + The color clusters to use, of shape `(n_clusters, 3)` when color quantizing. Can be overridden by `clusters` + in `preprocess`. + do_color_quantize (`bool`, *optional*, defaults to `True`): + Controls whether to apply color quantization to convert continuous pixel values to discrete cluster indices. + When True, each pixel is assigned to its nearest color cluster, enabling ImageGPT's discrete token modeling. + """ + + clusters: Optional[Union[np.ndarray, list[list[int]], torch.Tensor]] + do_color_quantize: Optional[bool] + + +@auto_docstring +class ImageGPTImageProcessorFast(BaseImageProcessorFast): + model_input_names = ["input_ids"] + resample = PILImageResampling.BILINEAR + do_color_quantize = True + clusters = None + image_mean = [0.5, 0.5, 0.5] + image_std = [0.5, 0.5, 0.5] + do_rescale = True + do_normalize = True + valid_kwargs = ImageGPTFastImageProcessorKwargs + + def __init__( + self, + clusters: Optional[Union[list, np.ndarray, torch.Tensor]] = None, # keep as arg for backwards compatibility + **kwargs: Unpack[ImageGPTFastImageProcessorKwargs], + ): + r""" + clusters (`np.ndarray` or `list[list[int]]` or `torch.Tensor`, *optional*): + The color clusters to use, of shape `(n_clusters, 3)` when color quantizing. Can be overridden by `clusters` + in `preprocess`. + """ + clusters = torch.as_tensor(clusters, dtype=torch.float32) if clusters is not None else None + super().__init__(clusters=clusters, **kwargs) + + def _preprocess( + self, + images: list["torch.Tensor"], + do_resize: bool, + size: dict[str, int], + interpolation: Optional["F.InterpolationMode"], + do_center_crop: bool, + crop_size: dict[str, int], + do_rescale: bool, + rescale_factor: float, + do_normalize: bool, + image_mean: Optional[Union[float, list[float]]], + image_std: Optional[Union[float, list[float]]], + do_color_quantize: Optional[bool] = None, + clusters: Optional[Union[list, np.ndarray, torch.Tensor]] = None, + disable_grouping: Optional[bool] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + **kwargs, + ): + # Group images by size for batched resizing + grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping) + resized_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + if do_resize: + stacked_images = self.resize(image=stacked_images, size=size, interpolation=interpolation) + resized_images_grouped[shape] = stacked_images + resized_images = reorder_images(resized_images_grouped, grouped_images_index) + + # Group images by size for further processing + # Needed in case do_resize is False, or resize returns images with different sizes + grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping) + processed_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + if do_center_crop: + stacked_images = self.center_crop(stacked_images, crop_size) + # Fused rescale and normalize + stacked_images = self.rescale_and_normalize( + stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std + ) + 
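# Cache the processed batch under its shape key so reorder_images can restore the original image order +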
processed_images_grouped[shape] = stacked_images + + pixel_values = reorder_images(processed_images_grouped, grouped_images_index) + + # If color quantization is requested, perform it; otherwise return pixel values + if do_color_quantize: + # Prepare clusters + if clusters is None: + raise ValueError("Clusters must be provided for color quantization.") + # Convert to torch tensor if needed (clusters might be passed as list/numpy) + clusters_torch = ( + torch.as_tensor(clusters, dtype=torch.float32) if not isinstance(clusters, torch.Tensor) else clusters + ).to(pixel_values[0].device, dtype=pixel_values[0].dtype) + + # Group images by shape for batch processing + # We need to check if the pixel values are a tensor or a list of tensors + grouped_images, grouped_images_index = group_images_by_shape( + pixel_values, disable_grouping=disable_grouping + ) + # Process each group + input_ids_grouped = {} + + for shape, stacked_images in grouped_images.items(): + input_ids = color_quantize_torch( + stacked_images.permute(0, 2, 3, 1).reshape(-1, 3), clusters_torch + ) # (B*H*W,) flat cluster indices + input_ids_grouped[shape] = input_ids.reshape(stacked_images.shape[0], -1) # (B, H*W) + + input_ids = reorder_images(input_ids_grouped, grouped_images_index) + + return BatchFeature( + data={"input_ids": torch.stack(input_ids, dim=0) if return_tensors else input_ids}, + tensor_type=return_tensors, + ) + + pixel_values = torch.stack(pixel_values, dim=0) if return_tensors else pixel_values + return BatchFeature(data={"pixel_values": pixel_values}, tensor_type=return_tensors) + + def to_dict(self): + # Convert torch tensors to lists for JSON serialization + output = super().to_dict() + if output.get("clusters") is not None and isinstance(output["clusters"], torch.Tensor): + output["clusters"] = output["clusters"].tolist() + + return output + + +__all__ = ["ImageGPTImageProcessorFast"] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/imagegpt/modeling_imagegpt.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/imagegpt/modeling_imagegpt.py new file mode 100644 index 0000000000000000000000000000000000000000..a962141e447946633a2598b3c0bfe43b46f10e6d --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/imagegpt/modeling_imagegpt.py @@ -0,0 +1,1024 @@ +# coding=utf-8 +# Copyright 2021 The OpenAI Team Authors and HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+"""PyTorch OpenAI ImageGPT model.""" + +import math +import os +from typing import Any, Optional, Union + +import torch +from torch import nn +from torch.nn import CrossEntropyLoss + +from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache +from ...generation import GenerationMixin +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + SequenceClassifierOutputWithPast, +) +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import Conv1D, find_pruneable_heads_and_indices, prune_conv1d_layer +from ...utils import ( + auto_docstring, + logging, + torch_float, +) +from .configuration_imagegpt import ImageGPTConfig + + +logger = logging.get_logger(__name__) + + +def load_tf_weights_in_imagegpt(model, config, imagegpt_checkpoint_path): + """ + Load tf checkpoints in a pytorch model + """ + try: + import re + + import tensorflow as tf + except ImportError: + logger.error( + "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions." + ) + raise + tf_path = os.path.abspath(imagegpt_checkpoint_path) + logger.info(f"Converting TensorFlow checkpoint from {tf_path}") + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + names = [] + arrays = [] + + for name, shape in init_vars: + logger.info(f"Loading TF weight {name} with shape {shape}") + array = tf.train.load_variable(tf_path, name) + names.append(name) + arrays.append(array.squeeze()) + + for name, array in zip(names, arrays): + name = name[6:] # skip "model/" + name = name.split("/") + + # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v + # which are not required for using pretrained model + if any( + n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"] + for n in name + ) or name[-1] in ["_step"]: + logger.info("Skipping {}".format("/".join(name))) + continue + + pointer = model + if name[-1] not in ["wtet"]: + pointer = getattr(pointer, "transformer") + + for m_name in name: + if re.fullmatch(r"[A-Za-z]+\d+", m_name): + scope_names = re.split(r"(\d+)", m_name) + else: + scope_names = [m_name] + + if scope_names[0] == "w" or scope_names[0] == "g": + pointer = getattr(pointer, "weight") + elif scope_names[0] == "b": + pointer = getattr(pointer, "bias") + elif scope_names[0] == "wpe" or scope_names[0] == "wte": + pointer = getattr(pointer, scope_names[0]) + pointer = getattr(pointer, "weight") + elif scope_names[0] in ["q_proj", "k_proj", "v_proj"]: + pointer = getattr(pointer, "c_attn") + pointer = getattr(pointer, "weight") + elif len(name) == 3 and name[1] == "attn" and scope_names[0] == "c_proj": + pointer = getattr(pointer, scope_names[0]) + pointer = getattr(pointer, "weight") + elif scope_names[0] == "wtet": + pointer = getattr(pointer, "lm_head") + pointer = getattr(pointer, "weight") + elif scope_names[0] == "sos": + pointer = getattr(pointer, "wte") + pointer = getattr(pointer, "weight") + else: + pointer = getattr(pointer, scope_names[0]) + if len(scope_names) >= 2: + num = int(scope_names[1]) + pointer = pointer[num] + + if len(name) > 1 and name[1] == "attn" or name[-1] == "wtet" or name[-1] == "sos" or name[-1] == "wte": + pass # array is used to initialize only part of the pointer so sizes won't match + else: + try: + assert pointer.shape 
== array.shape + except AssertionError as e: + e.args += (pointer.shape, array.shape) + raise + + logger.info(f"Initialize PyTorch weight {name}") + + if name[-1] == "q_proj": + pointer.data[:, : config.n_embd] = torch.from_numpy(array.reshape(config.n_embd, config.n_embd)).T + elif name[-1] == "k_proj": + pointer.data[:, config.n_embd : 2 * config.n_embd] = torch.from_numpy( + array.reshape(config.n_embd, config.n_embd) + ).T + elif name[-1] == "v_proj": + pointer.data[:, 2 * config.n_embd :] = torch.from_numpy(array.reshape(config.n_embd, config.n_embd)).T + elif len(name) == 3 and name[1] == "attn" and name[2] == "c_proj": + pointer.data = torch.from_numpy(array.reshape(config.n_embd, config.n_embd)) + elif name[-1] == "wtet": + pointer.data = torch.from_numpy(array) + elif name[-1] == "wte": + pointer.data[: config.vocab_size - 1, :] = torch.from_numpy(array) + elif name[-1] == "sos": + pointer.data[-1] = torch.from_numpy(array) + else: + pointer.data = torch.from_numpy(array) + + return model + + +class ImageGPTLayerNorm(nn.Module): + def __init__(self, hidden_size: tuple[int], eps: float = 1e-5): + super().__init__() + self.eps = eps + self.weight = nn.Parameter(torch.Tensor(hidden_size)) + + def forward(self, tensor: torch.Tensor) -> torch.Tensor: + # input is not mean centered + tensor = tensor / torch.sqrt(torch.mean(torch.square(tensor), axis=-1, keepdim=True) + self.eps) + tensor = tensor * self.weight + return tensor + + +class ImageGPTAttention(nn.Module): + def __init__(self, config, is_cross_attention: Optional[bool] = False, layer_idx: Optional[int] = None): + super().__init__() + + max_positions = config.max_position_embeddings + self.register_buffer( + "bias", + torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view( + 1, 1, max_positions, max_positions + ), + persistent=False, + ) + self.register_buffer("masked_bias", torch.tensor(-1e4), persistent=False) + + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + self.split_size = self.embed_dim + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"`embed_dim` must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." 
+ ) + + self.scale_attn_weights = config.scale_attn_weights + self.is_cross_attention = is_cross_attention + + # Layer-wise attention scaling, reordering, and upcasting + self.scale_attn_by_inverse_layer_idx = config.scale_attn_by_inverse_layer_idx + self.layer_idx = layer_idx + self.reorder_and_upcast_attn = config.reorder_and_upcast_attn + + if self.is_cross_attention: + self.c_attn = Conv1D(2 * self.embed_dim, self.embed_dim) + self.q_attn = Conv1D(self.embed_dim, self.embed_dim) + else: + self.c_attn = Conv1D(3 * self.embed_dim, self.embed_dim) + self.c_proj = Conv1D(self.embed_dim, self.embed_dim) + + self.attn_dropout = nn.Dropout(config.attn_pdrop) + self.resid_dropout = nn.Dropout(config.resid_pdrop) + + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices(heads, self.num_heads, self.head_dim, self.pruned_heads) + index_attn = torch.cat([index, index + self.split_size, index + (2 * self.split_size)]) + + # Prune conv1d layers + self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1) + self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0) + + # Update hyper params + self.split_size = (self.split_size // self.num_heads) * (self.num_heads - len(heads)) + self.num_heads = self.num_heads - len(heads) + self.pruned_heads = self.pruned_heads.union(heads) + + def _attn(self, query, key, value, attention_mask=None, head_mask=None): + attn_weights = torch.matmul(query, key.transpose(-1, -2)) + + if self.scale_attn_weights: + attn_weights = attn_weights / torch_float(value.size(-1) ** 0.5) + + # Layer-wise attention scaling + if self.scale_attn_by_inverse_layer_idx: + attn_weights = attn_weights / float(self.layer_idx + 1) + + if not self.is_cross_attention: + # if only "normal" attention layer implements causal mask + query_length, key_length = query.size(-2), key.size(-2) + causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length] + mask_value = torch.finfo(attn_weights.dtype).min + # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`. 
+ # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device` + mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype, device=attn_weights.device) + attn_weights = torch.where(causal_mask, attn_weights, mask_value) + + if attention_mask is not None: + # Apply the attention mask + attn_weights = attn_weights + attention_mask + + attn_weights = nn.Softmax(dim=-1)(attn_weights) + + # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op otherwise + attn_weights = attn_weights.type(value.dtype) + attn_weights = self.attn_dropout(attn_weights) + + # Mask heads if we want to + if head_mask is not None: + attn_weights = attn_weights * head_mask + + attn_output = torch.matmul(attn_weights, value) + + return attn_output, attn_weights + + def _upcast_and_reordered_attn(self, query, key, value, attention_mask=None, head_mask=None): + # Use `torch.baddbmm` (a bit more efficient w/ alpha param for scaling -- from Megatron-LM) + bsz, num_heads, q_seq_len, dk = query.size() + _, _, k_seq_len, _ = key.size() + + # Preallocate attn_weights for `baddbmm` + attn_weights = torch.empty(bsz * num_heads, q_seq_len, k_seq_len, dtype=torch.float32, device=query.device) + + # Compute Scale Factor + scale_factor = 1.0 + if self.scale_attn_weights: + scale_factor /= float(value.size(-1)) ** 0.5 + + if self.scale_attn_by_inverse_layer_idx: + scale_factor /= float(self.layer_idx + 1) + + # Upcast (turn off autocast) and reorder (Scale K by 1 / root(dk)) + with torch.autocast(query.device.type, enabled=False): + q, k = query.reshape(-1, q_seq_len, dk), key.transpose(-1, -2).reshape(-1, dk, k_seq_len) + attn_weights = torch.baddbmm(attn_weights, q.float(), k.float(), beta=0, alpha=scale_factor) + attn_weights = attn_weights.reshape(bsz, num_heads, q_seq_len, k_seq_len) + + if not self.is_cross_attention: + # if only "normal" attention layer implements causal mask + query_length, key_length = query.size(-2), key.size(-2) + causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length] + mask_value = torch.finfo(attn_weights.dtype).min + # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`. 
+ # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device` + mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype, device=attn_weights.device) + attn_weights = torch.where(causal_mask, attn_weights, mask_value) + + if attention_mask is not None: + # Apply the attention mask + attn_weights = attn_weights + attention_mask + + attn_weights = nn.Softmax(dim=-1)(attn_weights) + + # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op if otherwise + if attn_weights.dtype != torch.float32: + raise RuntimeError("Error with upcasting, attn_weights does not have dtype torch.float32") + attn_weights = attn_weights.type(value.dtype) + attn_weights = self.attn_dropout(attn_weights) + + # Mask heads if we want to + if head_mask is not None: + attn_weights = attn_weights * head_mask + + attn_output = torch.matmul(attn_weights, value) + + return attn_output, attn_weights + + def _split_heads(self, tensor, num_heads, attn_head_size): + """ + Splits hidden_size dim into attn_head_size and num_heads + """ + new_shape = tensor.size()[:-1] + (num_heads, attn_head_size) + tensor = tensor.view(*new_shape) + return tensor.permute(0, 2, 1, 3) # (batch, head, seq_length, head_features) + + def _merge_heads(self, tensor, num_heads, attn_head_size): + """ + Merges attn_head_size dim and num_attn_heads dim into hidden_size + """ + tensor = tensor.permute(0, 2, 1, 3).contiguous() + new_shape = tensor.size()[:-2] + (num_heads * attn_head_size,) + return tensor.view(new_shape) + + def forward( + self, + hidden_states: torch.Tensor, + layer_past: Optional[Cache] = None, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = False, + output_attentions: Optional[bool] = False, + cache_position: Optional[torch.Tensor] = None, + ) -> tuple: + is_cross_attention = encoder_hidden_states is not None + bsz, seq_len, _ = hidden_states.shape + + if layer_past is not None: + if isinstance(layer_past, EncoderDecoderCache): + is_updated = layer_past.is_updated.get(self.layer_idx) + if is_cross_attention: + # after the first generated id, we can subsequently re-use all key/value_states from cache + curr_past_key_value = layer_past.cross_attention_cache + else: + curr_past_key_value = layer_past.self_attention_cache + else: + curr_past_key_value = layer_past + + current_states = encoder_hidden_states if is_cross_attention else hidden_states + if is_cross_attention: + if not hasattr(self, "q_attn"): + raise ValueError( + "If class is used as cross attention, the weights `q_attn` have to be defined. " + "Please make sure to instantiate class with `ImageGPTAttention(..., is_cross_attention=True)`." 
+ ) + + if layer_past is not None and is_updated: + # reuse k,v, cross_attentions, and compute only q + query = self.q_attn(hidden_states) + key = curr_past_key_value.layers[self.layer_idx].keys + value = curr_past_key_value.layers[self.layer_idx].values + else: + query = self.q_attn(hidden_states) + key, value = self.c_attn(current_states).split(self.split_size, dim=2) + key = key.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2) + value = value.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2) + else: + query, key, value = self.c_attn(current_states).split(self.split_size, dim=2) + key = key.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2) + value = value.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2) + + if layer_past is not None: + # save all key/value_states to cache to be re-used for fast auto-regressive generation + cache_position = cache_position if not is_cross_attention else None + key, value = curr_past_key_value.update(key, value, self.layer_idx, {"cache_position": cache_position}) + # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls + if is_cross_attention: + layer_past.is_updated[self.layer_idx] = True + + query = query.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2) + + if self.reorder_and_upcast_attn: + attn_output, attn_weights = self._upcast_and_reordered_attn(query, key, value, attention_mask, head_mask) + else: + attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask) + + attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim) + attn_output = self.c_proj(attn_output) + attn_output = self.resid_dropout(attn_output) + + return attn_output, attn_weights + + +class ImageGPTMLP(nn.Module): + def __init__(self, intermediate_size, config): + super().__init__() + embed_dim = config.hidden_size + self.c_fc = Conv1D(intermediate_size, embed_dim) + self.c_proj = Conv1D(embed_dim, intermediate_size) + self.act = ACT2FN[config.activation_function] + self.dropout = nn.Dropout(config.resid_pdrop) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.c_fc(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.c_proj(hidden_states) + hidden_states = self.dropout(hidden_states) + return hidden_states + + +class ImageGPTBlock(GradientCheckpointingLayer): + def __init__(self, config, layer_idx=None): + super().__init__() + hidden_size = config.hidden_size + inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size + + self.ln_1 = ImageGPTLayerNorm(hidden_size, eps=config.layer_norm_epsilon) + self.attn = ImageGPTAttention(config, layer_idx=layer_idx) + self.ln_2 = ImageGPTLayerNorm(hidden_size, eps=config.layer_norm_epsilon) + + if config.add_cross_attention: + self.crossattention = ImageGPTAttention(config, is_cross_attention=True, layer_idx=layer_idx) + self.ln_cross_attn = ImageGPTLayerNorm(hidden_size, eps=config.layer_norm_epsilon) + + self.mlp = ImageGPTMLP(inner_dim, config) + + def forward( + self, + hidden_states: torch.Tensor, + layer_past: Optional[Cache] = None, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = False, + output_attentions: Optional[bool] = False, + cache_position: Optional[torch.Tensor] = None, + ) -> tuple: + residual = hidden_states + 
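# Pre-norm residual block: normalization is applied to the attention input while the raw hidden states are kept for the skip connection +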
hidden_states = self.ln_1(hidden_states) + attn_outputs = self.attn( + hidden_states, + layer_past=layer_past, + attention_mask=attention_mask, + head_mask=head_mask, + use_cache=use_cache, + output_attentions=output_attentions, + cache_position=cache_position, + ) + attn_output = attn_outputs[0] + outputs = attn_outputs[1:] + # residual connection + hidden_states = attn_output + residual + + if encoder_hidden_states is not None: + # add one self-attention block for cross-attention + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with " + "cross-attention layers by setting `config.add_cross_attention=True`" + ) + residual = hidden_states + hidden_states = self.ln_cross_attn(hidden_states) + cross_attn_outputs = self.crossattention( + hidden_states, + layer_past=layer_past, + attention_mask=attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + cache_position=cache_position, + ) + attn_output = cross_attn_outputs[0] + # residual connection + hidden_states = residual + attn_output + outputs = outputs + cross_attn_outputs[1:] # add cross attentions if we output attention weights + + residual = hidden_states + hidden_states = self.ln_2(hidden_states) + feed_forward_hidden_states = self.mlp(hidden_states) + # residual connection + hidden_states = residual + feed_forward_hidden_states + + return (hidden_states,) + outputs + + +@auto_docstring +class ImageGPTPreTrainedModel(PreTrainedModel): + config: ImageGPTConfig + load_tf_weights = load_tf_weights_in_imagegpt + base_model_prefix = "transformer" + main_input_name = "input_ids" + supports_gradient_checkpointing = True + _no_split_modules = ["ImageGPTBlock"] + + def __init__(self, *inputs, **kwargs): + super().__init__(*inputs, **kwargs) + + def _init_weights(self, module): + """Initialize the weights.""" + if isinstance(module, (nn.Linear, Conv1D)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, ImageGPTLayerNorm): + module.weight.data.fill_(1.0) + + # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme: + # > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale + # > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers. 
+ # > -- GPT-2 :: https://openai.com/blog/better-language-models/ + # + # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py + for name, p in module.named_parameters(): + if "c_proj" in name and "weight" in name: + # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block + p.data.normal_(mean=0.0, std=(self.config.initializer_range / math.sqrt(2 * self.config.n_layer))) + + +@auto_docstring +class ImageGPTModel(ImageGPTPreTrainedModel): + def __init__(self, config: ImageGPTConfig): + super().__init__(config) + + self.embed_dim = config.hidden_size + + self.wte = nn.Embedding(config.vocab_size, self.embed_dim) + self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim) + + self.drop = nn.Dropout(config.embd_pdrop) + self.h = nn.ModuleList([ImageGPTBlock(config, layer_idx=i) for i in range(config.num_hidden_layers)]) + self.ln_f = ImageGPTLayerNorm(self.embed_dim, eps=config.layer_norm_epsilon) + + # Model parallel + self.model_parallel = False + self.device_map = None + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.wte + + def set_input_embeddings(self, new_embeddings): + self.wte = new_embeddings + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} + """ + for layer, heads in heads_to_prune.items(): + self.h[layer].attn.prune_heads(heads) + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.Tensor] = None, + **kwargs: Any, + ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]: + r""" + input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`): + `input_ids_length` = `sequence_length` if `past_key_values` is `None` else + `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input + sequence tokens in the vocabulary. + + If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as + `input_ids`. + + Indices can be obtained using [`AutoImageProcessor`]. See [`ImageGPTImageProcessor.__call__`] for details. 
+ + Examples: + + ```python + >>> from transformers import AutoImageProcessor, ImageGPTModel + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("openai/imagegpt-small") + >>> model = ImageGPTModel.from_pretrained("openai/imagegpt-small") + + >>> inputs = image_processor(images=image, return_tensors="pt") + >>> outputs = model(**inputs) + >>> last_hidden_states = outputs.last_hidden_state + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + batch_size = input_ids.shape[0] + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + batch_size = inputs_embeds.shape[0] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + device = input_ids.device if input_ids is not None else inputs_embeds.device + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + if use_cache and past_key_values is None: + past_key_values = EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config)) + if use_cache and isinstance(past_key_values, tuple): + logger.warning_once( + "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. " + "You should pass an instance of `EncoderDecoderCache` instead, e.g. " + "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`." + ) + past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values) + + past_length = past_key_values.get_seq_length() if past_key_values is not None else past_key_values + + if token_type_ids is not None: + token_type_ids = token_type_ids.view(-1, input_shape[-1]) + + if position_ids is None: + position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device) + position_ids = position_ids.unsqueeze(0) + + # ImageGPTAttention mask. + if attention_mask is not None: + if batch_size <= 0: + raise ValueError("batch_size has to be defined and > 0") + attention_mask = attention_mask.view(batch_size, -1) + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. 
+ attention_mask = attention_mask[:, None, None, :] + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and the dtype's smallest value for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + attention_mask = attention_mask.to(dtype=self.dtype) # fp16 compatibility + attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.add_cross_attention and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # head_mask has shape n_layer x batch x n_heads x N x N + head_mask = self.get_head_mask(head_mask, self.config.n_layer) + + if inputs_embeds is None: + inputs_embeds = self.wte(input_ids) + position_embeds = self.wpe(position_ids) + hidden_states = inputs_embeds + position_embeds.to(inputs_embeds.device) + + if token_type_ids is not None: + token_type_embeds = self.wte(token_type_ids) + hidden_states = hidden_states + token_type_embeds + + hidden_states = self.drop(hidden_states) + output_shape = input_shape + (hidden_states.size(-1),) + + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + all_hidden_states = () if output_hidden_states else None + for i, block in enumerate(self.h): + # Model parallel + if self.model_parallel: + torch.cuda.set_device(hidden_states.device) + # Ensure that attention_mask is always on the same device as hidden_states + if attention_mask is not None: + attention_mask = attention_mask.to(hidden_states.device) + if isinstance(head_mask, torch.Tensor): + head_mask = head_mask.to(hidden_states.device) + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + outputs = block( + hidden_states, + past_key_values, + attention_mask, + head_mask[i], + encoder_hidden_states, # as a positional argument for gradient checkpointing + encoder_attention_mask=encoder_attention_mask, + use_cache=use_cache, + output_attentions=output_attentions, + cache_position=cache_position, + ) + + hidden_states = outputs[0] + if output_attentions: + all_self_attentions = all_self_attentions + (outputs[1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (outputs[2],) + + # Model Parallel: If it's the last layer for that device, put things on the next device + if self.model_parallel: + for k, v in self.device_map.items(): + if i == v[-1] and "cuda:" + str(k) != self.last_device: + hidden_states = hidden_states.to("cuda:" + str(k + 1)) + + hidden_states = self.ln_f(hidden_states) + hidden_states = hidden_states.view(*output_shape) + + # Add last hidden state + if output_hidden_states: + all_hidden_states = all_hidden_states + 
(hidden_states,) + + if not return_dict: + return tuple( + v + for v in [hidden_states, past_key_values, all_hidden_states, all_self_attentions, all_cross_attentions] + if v is not None + ) + + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=past_key_values, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +@auto_docstring( + custom_intro=""" + The ImageGPT Model transformer with a language modeling head on top (linear layer with weights tied to the input + embeddings). + """ +) +class ImageGPTForCausalImageModeling(ImageGPTPreTrainedModel, GenerationMixin): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config: ImageGPTConfig): + super().__init__(config) + self.transformer = ImageGPTModel(config) + self.lm_head = nn.Linear(config.n_embd, config.vocab_size - 1, bias=False) + + # Model parallel + self.model_parallel = False + self.device_map = None + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.Tensor] = None, + **kwargs: Any, + ) -> Union[tuple, CausalLMOutputWithCrossAttentions]: + r""" + input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`): + `input_ids_length` = `sequence_length` if `past_key_values` is `None` else + `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input + sequence tokens in the vocabulary. + + If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as + `input_ids`. + + Indices can be obtained using [`AutoImageProcessor`]. See [`ImageGPTImageProcessor.__call__`] for details. + labels (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*): + Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set + `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` + are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` + + Examples: + + ```python + >>> from transformers import AutoImageProcessor, ImageGPTForCausalImageModeling + >>> import torch + >>> import matplotlib.pyplot as plt + >>> import numpy as np + + >>> image_processor = AutoImageProcessor.from_pretrained("openai/imagegpt-small") + >>> model = ImageGPTForCausalImageModeling.from_pretrained("openai/imagegpt-small") + >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + >>> model.to(device) # doctest: +IGNORE_RESULT + + >>> # unconditional generation of 8 images + >>> batch_size = 4 + >>> context = torch.full((batch_size, 1), model.config.vocab_size - 1) # initialize with SOS token + >>> context = context.to(device) + >>> output = model.generate( + ... 
input_ids=context, max_length=model.config.n_positions + 1, temperature=1.0, do_sample=True, top_k=40 + ... ) + + >>> clusters = image_processor.clusters + >>> height = image_processor.size["height"] + >>> width = image_processor.size["width"] + + >>> samples = output[:, 1:].detach().cpu().numpy() + >>> samples_img = [ + ... np.reshape(np.rint(127.5 * (clusters[s] + 1.0)), [height, width, 3]).astype(np.uint8) for s in samples + ... ] # convert color cluster tokens back to pixels + >>> f, axes = plt.subplots(1, batch_size, dpi=300) + + >>> for img, ax in zip(samples_img, axes): # doctest: +IGNORE_RESULT + ... ax.axis("off") + ... ax.imshow(img) + ```""" + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + ) + hidden_states = transformer_outputs[0] + + lm_logits = self.lm_head(hidden_states) + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) + + if not return_dict: + output = (lm_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=loss, + logits=lm_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + cross_attentions=transformer_outputs.cross_attentions, + ) + + +@auto_docstring( + custom_intro=""" + The ImageGPT Model transformer with an image classification head on top (linear layer). + [`ImageGPTForImageClassification`] average-pools the hidden states in order to do the classification. 
+ """ +) +class ImageGPTForImageClassification(ImageGPTPreTrainedModel): + def __init__(self, config: ImageGPTConfig): + super().__init__(config) + self.num_labels = config.num_labels + self.transformer = ImageGPTModel(config) + self.score = nn.Linear(config.n_embd, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + @auto_docstring + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs: Any, + ) -> Union[tuple, SequenceClassifierOutputWithPast]: + r""" + input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`): + `input_ids_length` = `sequence_length` if `past_key_values` is `None` else + `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input + sequence tokens in the vocabulary. + + If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as + `input_ids`. + + Indices can be obtained using [`AutoImageProcessor`]. See [`ImageGPTImageProcessor.__call__`] for details. + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ + Examples: + + ```python + >>> from transformers import AutoImageProcessor, ImageGPTForImageClassification + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("openai/imagegpt-small") + >>> model = ImageGPTForImageClassification.from_pretrained("openai/imagegpt-small") + + >>> inputs = image_processor(images=image, return_tensors="pt") + >>> outputs = model(**inputs) + >>> logits = outputs.logits + ```""" + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + # average-pool the hidden states along the sequence dimension + pooled_hidden_states = hidden_states.mean(dim=1) + # project from (batch_size, hidden_size) to (batch_size, num_labels) + logits = self.score(pooled_hidden_states) + + loss = None + if labels is not None: + loss = self.loss_function(labels, logits, self.config) + + if not return_dict: + output = (logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + +__all__ = [ + "ImageGPTForCausalImageModeling", + "ImageGPTForImageClassification", + "ImageGPTModel", + "ImageGPTPreTrainedModel", + "load_tf_weights_in_imagegpt", +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/internvl/__init__.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/internvl/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6d4ffe7befafddc423a58d41934a260137da0a13 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/internvl/__init__.py @@ -0,0 +1,29 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
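+# This package init follows the lazy-import convention used throughout transformers: under
+# TYPE_CHECKING the real submodules are imported for static analysis, while at runtime the
+# module object is replaced by a _LazyModule so the heavy imports only run when an attribute
+# is first accessed.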
+from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_internvl import * + from .modeling_internvl import * + from .processing_internvl import * + from .video_processing_internvl import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/internvl/configuration_internvl.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/internvl/configuration_internvl.py new file mode 100644 index 0000000000000000000000000000000000000000..17be5388b6ab7dcbf596c2deee69b4a467755fe2 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/internvl/configuration_internvl.py @@ -0,0 +1,225 @@ +# coding=utf-8 +# Copyright 2025 HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from ...configuration_utils import PretrainedConfig +from ..auto import CONFIG_MAPPING, AutoConfig + + +class InternVLVisionConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`InternVLVisionModel`]. It is used to instantiate an InternVLVisionModel + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield + a similar configuration to that of the InternVL3-1B. + e.g. [OpenGVLab/InternVL3-1B-hf](https://huggingface.co/OpenGVLab/InternVL3-1B-hf) + + Args: + hidden_size (`int`, *optional*, defaults to 1024): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 24): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. + attention_bias (`bool`, *optional*, defaults to `False`): + Whether to add a bias to the queries, keys and values. + use_qk_norm (`bool`, *optional*, defaults to `False`): + Whether to apply normalization to the queries and keys before the attention operation. + intermediate_size (`int`, *optional*, defaults to 4096): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (`float`, *optional*, defaults to 0.0): + Dropout probability for attention weights. + projection_dropout (`float`, *optional*, defaults to 0.0): + Dropout probability for the projection layer. 
+ initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + norm_type (`str`, *optional*, defaults to `"layer_norm"`): + The type of normalization to use in the encoder. Can be `"layer_norm"` or `"rms_norm"`. + layer_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the layer normalization layers. + image_size (`int` or `list[int]`, *optional*, defaults to `[448, 448]`): + The size (resolution) of each image. + patch_size (`int` or `list[int]`, *optional*, defaults to `[14, 14]`): + The size (resolution) of each patch. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + use_mask_token (`bool`, *optional*, defaults to `False`): + Whether to use a mask token for masked image modeling. + use_absolute_position_embeddings (`bool`, *optional*, defaults to `True`): + Whether to use BERT-style absolute position embeddings. + layer_scale_init_value (`float`, *optional*, defaults to 0.1): + Scale to use in the self-attention layers. 0.1 for base, 1e-5 for large. Set 0 to disable layer scale. + use_mean_pooling (`bool`, *optional*, defaults to `True`): + Whether to mean pool the final hidden states of the patches instead of using the final hidden state of the + CLS token, before applying the classification head. + + Example: + + ```python + >>> from transformers import InternVLVisionConfig, InternVLVisionModel + + >>> # Initializing a InternVLVisionModel OpenGVLab/InternVL3-1B-hf style configuration + >>> configuration = InternVLVisionConfig() + + >>> # Initializing a model (with random weights) from the OpenGVLab/InternVL3-1B-hf configuration + >>> model = InternVLVisionModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "internvl_vision" + base_config_key = "vision_config" + + def __init__( + self, + hidden_size=1024, + num_hidden_layers=24, + num_attention_heads=16, + attention_bias=False, + use_qk_norm=False, + intermediate_size=4096, + hidden_act="gelu", + hidden_dropout_prob=0.0, + attention_dropout=0.0, + projection_dropout=0.0, + initializer_range=0.02, + norm_type="layer_norm", + layer_norm_eps=1e-06, + image_size=[448, 448], + patch_size=[14, 14], + num_channels=3, + use_mask_token=False, + use_absolute_position_embeddings=True, + layer_scale_init_value=0.1, + use_mean_pooling=True, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.attention_bias = attention_bias + self.use_qk_norm = use_qk_norm + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_dropout = attention_dropout + self.projection_dropout = projection_dropout + self.initializer_range = initializer_range + self.norm_type = norm_type + self.layer_norm_eps = layer_norm_eps + + image_size = image_size if isinstance(image_size, (list, tuple)) else (image_size, image_size) + patch_size = patch_size if isinstance(patch_size, (list, tuple)) else (patch_size, patch_size) + self.image_size = image_size + self.patch_size = patch_size + + self.num_channels = num_channels + self.use_mask_token = use_mask_token + self.use_absolute_position_embeddings = use_absolute_position_embeddings + self.layer_scale_init_value = layer_scale_init_value + self.use_mean_pooling = use_mean_pooling + + +class 
InternVLConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`InternVLForConditionalGeneration`]. It is used to instantiate a + InternVL model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of InternVL3-1B. + e.g. [OpenGVLab/InternVL3-1B-hf](https://huggingface.co/OpenGVLab/InternVL3-1B-hf) + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `InternVisonConfig`): + The config object or dictionary of the vision backbone. + text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `Qwen2Config`): + The config object or dictionary of the text backbone. + image_token_id (`int`, *optional*, defaults to 151667): + The image token index to encode the image prompt. + image_seq_length (`int`, *optional*, defaults to 256): + Number of image tokens to use per image patch. + downsample_ratio (`float`, *optional*, defaults to 0.5): + Factor by which to downsample the image. + projector_hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the projector. + vision_feature_layer (`int`, *optional*, defaults to -1): + The index of the layer to use as the image features. + vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`): + The feature selection strategy used to select the vision feature from the vision backbone. + Can be one of `"default"` or `"full"`. + + ```python + >>> from transformers import InternVLForConditionalGeneration, InternVLConfig + + >>> # Initializing a InternVL style configuration + >>> configuration = InternVLConfig() + + >>> # Initializing a model (with random weights) from the OpenGVLab/InternVL3-1B-hf configuration + >>> model = InternVLForConditionalGeneration(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "internvl" + sub_configs = {"text_config": AutoConfig, "vision_config": InternVLVisionConfig} + + def __init__( + self, + vision_config=None, + text_config=None, + image_token_id=151667, + image_seq_length=256, + downsample_ratio=0.5, + projector_hidden_act="gelu", + vision_feature_layer=-1, + vision_feature_select_strategy="default", + **kwargs, + ): + self.image_token_id = image_token_id + self.image_seq_length = image_seq_length + self.downsample_ratio = downsample_ratio + self.projector_hidden_act = projector_hidden_act + self.vision_feature_layer = vision_feature_layer + self.vision_feature_select_strategy = vision_feature_select_strategy + + if isinstance(vision_config, dict): + self.vision_config = InternVLVisionConfig(**vision_config) + elif isinstance(vision_config, InternVLVisionConfig): + self.vision_config = vision_config + elif vision_config is None: + self.vision_config = InternVLVisionConfig() + + if isinstance(text_config, dict): + text_config["model_type"] = text_config.get("model_type", "qwen2") + text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) + elif text_config is None: + text_config = CONFIG_MAPPING["qwen2"]() + + self.text_config = text_config + + super().__init__(**kwargs) + + +__all__ = ["InternVLVisionConfig", "InternVLConfig"] diff --git 
a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/internvl/modeling_internvl.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/internvl/modeling_internvl.py new file mode 100644 index 0000000000000000000000000000000000000000..df66e8b14cf161e29a1f7a1ca2e382353820646b --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/internvl/modeling_internvl.py @@ -0,0 +1,949 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/internvl/modular_internvl.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_internvl.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2025 HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import collections.abc +from dataclasses import dataclass +from typing import Callable, Optional, Union + +import torch +import torch.nn as nn + +from ...activations import ACT2FN +from ...cache_utils import Cache +from ...generation import GenerationMixin +from ...integrations import use_kernel_forward_from_hub +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPast, BaseModelOutputWithPooling +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...processing_utils import Unpack +from ...utils import ModelOutput, TransformersKwargs, auto_docstring, can_return_tuple, torch_int +from ...utils.generic import check_model_inputs +from ..auto import AutoModel +from .configuration_internvl import InternVLConfig, InternVLVisionConfig + + +@use_kernel_forward_from_hub("RMSNorm") +class InternVLVisionRMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + InternVLVisionRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + def extra_repr(self): + return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" + + +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + dropout: float = 0.0, + **kwargs, +): + key_states = key + value_states = value + + attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling + if attention_mask is not None: + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask 
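+ # The mask is expected to be additive (large negative values at masked positions), so after
+ # the softmax below those positions contribute essentially zero attention probability.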
+ + # No upcasting of the attention weights to float32 in this implementation + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + attn_output = torch.matmul(attn_weights, value_states) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +class InternVLVisionAttention(nn.Module): + """Attention Class for InternVL Vision Encoder""" + + def __init__(self, config: InternVLVisionConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." + ) + self.scale = self.head_dim**-0.5 + self.attention_dropout = config.attention_dropout + proj_dropout = config.projection_dropout + qk_norm = config.use_qk_norm + + # Needed for flash attention + self.is_causal = False + + self.q_proj = nn.Linear(self.embed_dim, self.num_heads * self.head_dim, bias=config.attention_bias) + self.k_proj = nn.Linear(self.embed_dim, self.num_heads * self.head_dim, bias=config.attention_bias) + self.v_proj = nn.Linear(self.embed_dim, self.num_heads * self.head_dim, bias=config.attention_bias) + self.projection_layer = nn.Linear(self.embed_dim, self.embed_dim) + self.projection_dropout = nn.Dropout(proj_dropout) if proj_dropout > 0 else nn.Identity() + + self.q_norm = InternVLVisionRMSNorm(self.embed_dim) if qk_norm else nn.Identity() + self.k_norm = InternVLVisionRMSNorm(self.embed_dim) if qk_norm else nn.Identity() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + **kwargs: Unpack[TransformersKwargs], + ): + batch_size, seq_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = self.q_norm(query_states) + key_states = self.k_norm(key_states) + + query_states = query_states.reshape(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.reshape(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=0.0 if not self.training else self.attention_dropout, + scaling=self.scale, + is_causal=False, + **kwargs, + ) + attn_output = attn_output.reshape(batch_size, seq_len, self.embed_dim) + + output = self.projection_layer(attn_output) + output = self.projection_dropout(output) + + return output, attn_weights + + +@dataclass +@auto_docstring( + custom_intro=""" + Class for outputs of [`InternVLVisionModel`]. + """ +) +class InternVLVisionModelOutputWithPooling(BaseModelOutputWithPooling): + r""" + pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`): + Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if + *config.use_mean_pooling* is set to True. 
If set to False, then the final hidden state of the *[CLS]* token + will be returned. + """ + + +class InternVLVisionPatchEmbeddings(nn.Module): + """ + This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial + `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a + Transformer. + """ + + def __init__(self, config): + super().__init__() + image_size, patch_size = config.image_size, config.patch_size + num_channels, hidden_size = config.num_channels, config.hidden_size + + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + patch_shape = (image_size[0] // patch_size[0], image_size[1] // patch_size[1]) + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.num_patches = num_patches + self.patch_shape = patch_shape + + self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size) + + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + batch_size, num_channels, height, width = pixel_values.shape + if num_channels != self.num_channels: + raise ValueError( + "Make sure that the channel dimension of the pixel values match with the one set in the configuration." + ) + + embeddings = self.projection(pixel_values) + patch_height, patch_width = embeddings.shape[2], embeddings.shape[3] + embeddings = embeddings.flatten(2).transpose(1, 2) + + return embeddings, (patch_height, patch_width) + + +# Based on timm implementation, which can be found here: +# https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py +class InternVLVisionEmbeddings(nn.Module): + """ + Construct the CLS token, position and patch embeddings. Optionally, also the mask token. + + """ + + def __init__(self, config: InternVLVisionConfig) -> None: + super().__init__() + + self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) + if config.use_mask_token: + self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) + else: + self.mask_token = None + self.patch_embeddings = InternVLVisionPatchEmbeddings(config) + self.patch_size = config.patch_size + self.image_size = ( + config.image_size + if isinstance(config.image_size, collections.abc.Iterable) + else (config.image_size, config.image_size) + ) + num_patches = self.patch_embeddings.num_patches + if config.use_absolute_position_embeddings: + self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size)) + else: + self.position_embeddings = None + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution + images. This method is also adapted to support torch.jit tracing. 
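+ The position embedding of the *[CLS]* token is kept unchanged; only the grid of patch position embeddings is resized (bicubic interpolation) to match the new patch grid.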
+ + Adapted from: + - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and + - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 + """ + + num_patches = embeddings.shape[1] - 1 + num_positions = self.position_embeddings.shape[1] - 1 + + # always interpolate when tracing to ensure the exported model works for dynamic input shapes + if not torch.jit.is_tracing() and num_patches == num_positions and height == width: + return self.position_embeddings + + class_pos_embed = self.position_embeddings[:, :1] + patch_pos_embed = self.position_embeddings[:, 1:] + + dim = embeddings.shape[-1] + + new_height = height // self.patch_size[0] + new_width = width // self.patch_size[1] + + sqrt_num_positions = torch_int(num_positions**0.5) + patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed, + size=(new_height, new_width), + mode="bicubic", + align_corners=False, + ) + + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + + return torch.cat((class_pos_embed, patch_pos_embed), dim=1) + + def forward( + self, + pixel_values: torch.Tensor, + bool_masked_pos: Optional[torch.BoolTensor] = None, + ) -> torch.Tensor: + _, _, height, width = pixel_values.shape + embeddings, (patch_height, patch_width) = self.patch_embeddings(pixel_values) + batch_size, seq_len, _ = embeddings.size() + + if bool_masked_pos is not None: + mask_tokens = self.mask_token.expand(batch_size, seq_len, -1) + # replace the masked visual tokens by mask_tokens + w = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens) + embeddings = embeddings * (1 - w) + mask_tokens * w + + cls_tokens = self.cls_token.expand(batch_size, -1, -1) + embeddings = torch.cat((cls_tokens, embeddings), dim=1) + + if self.position_embeddings is not None: + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) + + embeddings = self.dropout(embeddings) + + return embeddings, (patch_height, patch_width) + + +class InternVLVisionMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) + self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +NORM2FN = {"layer_norm": nn.LayerNorm, "rms_norm": InternVLVisionRMSNorm} + + +class InternVLVisionLayer(GradientCheckpointingLayer): + """This corresponds to the Block class in the timm implementation.""" + + def __init__(self, config: InternVLVisionConfig) -> None: + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = InternVLVisionAttention(config) + self.mlp = InternVLVisionMLP(config) + # InternVL uses different layernorm implementations for different models + self.layernorm_before = NORM2FN[config.norm_type](config.hidden_size, eps=config.layer_norm_eps) + self.layernorm_after = NORM2FN[config.norm_type](config.hidden_size, eps=config.layer_norm_eps) + + init_values = config.layer_scale_init_value + 
self.lambda_1 = nn.Parameter(init_values * torch.ones(config.hidden_size), requires_grad=True) + self.lambda_2 = nn.Parameter(init_values * torch.ones(config.hidden_size), requires_grad=True) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward( + self, + hidden_states: torch.Tensor, + ) -> Union[tuple[torch.Tensor], tuple[torch.Tensor, torch.Tensor]]: + attention_output, _ = self.attention( + self.layernorm_before(hidden_states), # in InternVLVision, layernorm is applied before self-attention + ) + + attention_output = self.lambda_1 * attention_output + + # first residual connection + hidden_states = attention_output + hidden_states + + # in InternVLVision, layernorm is also applied after self-attention + layer_output = self.layernorm_after(hidden_states) + + layer_output = self.mlp(layer_output) + layer_output = self.dropout(layer_output) + + if self.lambda_2 is not None: + layer_output = self.lambda_2 * layer_output + + # second residual connection + layer_output = layer_output + hidden_states + + return layer_output + + +class InternVLVisionEncoder(nn.Module): + def __init__(self, config: InternVLVisionConfig) -> None: + super().__init__() + self.config = config + self.layer = nn.ModuleList([InternVLVisionLayer(config) for i in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + ) -> Union[tuple, BaseModelOutput]: + for layer_module in self.layer: + hidden_states = layer_module(hidden_states) + + return BaseModelOutput( + last_hidden_state=hidden_states, + ) + + +@auto_docstring +class InternVLVisionPreTrainedModel(PreTrainedModel): + config: InternVLVisionConfig + base_model_prefix = "internvl_vision" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + _no_split_modules = ["InternVLVisionLayer"] + _supports_sdpa = True + _supports_flash_attn = True + _supports_flex_attn = True + _supports_attention_backend = True + + _can_record_outputs = { + "hidden_states": InternVLVisionLayer, + "attentions": InternVLVisionAttention, + } + + def _init_weights(self, module): + """Initialize the weights""" + super()._init_weights(module) + if isinstance(module, InternVLVisionEmbeddings): + module.cls_token.data.zero_() + if module.mask_token is not None: + module.mask_token.data.zero_() + if module.position_embeddings is not None: + module.position_embeddings.data.zero_() + elif isinstance(module, InternVLVisionLayer): + module.lambda_1.data.fill_(self.config.layer_scale_init_value) + module.lambda_2.data.fill_(self.config.layer_scale_init_value) + + +@auto_docstring +class InternVLVisionModel(InternVLVisionPreTrainedModel): + def __init__(self, config: InternVLVisionConfig) -> None: + super().__init__(config) + self.config = config + + self.embeddings = InternVLVisionEmbeddings(config) + self.encoder = InternVLVisionEncoder(config) + + self.layernorm = ( + nn.Identity() if config.use_mean_pooling else nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + ) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.patch_embeddings + + @check_model_inputs(tie_last_hidden_states=False) + @auto_docstring + def forward( + self, + pixel_values: torch.Tensor, + bool_masked_pos: Optional[torch.BoolTensor] = None, + ) -> Union[tuple, InternVLVisionModelOutputWithPooling]: + r""" + bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*): + Boolean masked positions. 
Indicates which patches are masked (1) and which aren't (0). + """ + embedding_output, _ = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos) + + encoder_outputs = self.encoder(embedding_output) + sequence_output = encoder_outputs[0] + sequence_output = self.layernorm(sequence_output) + + return InternVLVisionModelOutputWithPooling( + last_hidden_state=sequence_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +@auto_docstring +class InternVLPreTrainedModel(PreTrainedModel): + config: InternVLConfig + base_model_prefix = "" + supports_gradient_checkpointing = True + _skip_keys_device_placement = "past_key_values" + + _supports_flash_attn = True + _supports_sdpa = True + + _can_compile_fullgraph = True + _supports_flex_attn = True + _supports_attention_backend = True + + +class InternVLMultiModalProjector(nn.Module): + def __init__(self, config: InternVLConfig): + super().__init__() + self.layer_norm = nn.LayerNorm(config.vision_config.hidden_size * int(1 / config.downsample_ratio) ** 2) + self.linear_1 = nn.Linear( + config.vision_config.hidden_size * int(1 / config.downsample_ratio) ** 2, config.text_config.hidden_size + ) + self.act = ACT2FN[config.projector_hidden_act] + self.linear_2 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size) + + def forward(self, image_features): + hidden_states = self.layer_norm(image_features) + hidden_states = self.linear_1(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.linear_2(hidden_states) + return hidden_states + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for InternVL outputs, with hidden states and attentions. + """ +) +class InternVLModelOutputWithPast(BaseModelOutputWithPast): + r""" + past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache). + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + image_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`. + image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state. + """ + + image_hidden_states: Optional[torch.FloatTensor] = None + + +@auto_docstring( + custom_intro=""" + The InternVL model which consists of a vision backbone and a language model, without a language modeling head. 
+ """ +) +class InternVLModel(InternVLPreTrainedModel): + _checkpoint_conversion_mapping = {"language_model.model": "language_model"} + + def __init__(self, config: InternVLConfig): + super().__init__(config) + self.vision_tower = AutoModel.from_config(config.vision_config) + + self.multi_modal_projector = InternVLMultiModalProjector(config) + self.language_model = AutoModel.from_config(config.text_config) + self.post_init() + + def get_input_embeddings(self): + return self.language_model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.language_model.set_input_embeddings(value) + + def set_decoder(self, decoder): + self.language_model = decoder + + def get_decoder(self): + return self.language_model + + def get_image_features( + self, + pixel_values: torch.FloatTensor, + vision_feature_layer: Optional[Union[int, list[int]]] = None, + vision_feature_select_strategy: Optional[str] = None, + **kwargs, + ): + """ + Obtains image last hidden states from the vision tower and apply multimodal projection. + + Args: + pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`) + The tensors corresponding to the input images. + vision_feature_layer (`int` or `list[int]`): + Layer index or list of layer indices to extract features from. + Returns: + vision_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`. + """ + vision_feature_layer = ( + vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer + ) + vision_feature_select_strategy = ( + vision_feature_select_strategy + if vision_feature_select_strategy is not None + else self.config.vision_feature_select_strategy + ) + pixel_values = pixel_values.to(dtype=self.dtype) # fp16 compatibility + + downsample_ratio = self.config.downsample_ratio + if vision_feature_layer == -1: + vision_features = self.vision_tower(pixel_values=pixel_values).last_hidden_state + else: + vision_features = self.vision_model(pixel_values=pixel_values).hidden_states[vision_feature_layer] + if vision_feature_select_strategy == "default": + vision_features = vision_features[:, 1:, :] + + # Calculate dimensions based on vision features + channels = vision_features.shape[1] + feature_size = int(channels**0.5) + batch_size = vision_features.shape[0] + + # Reshape tensor to spatial dimensions + vision_features = vision_features.reshape(batch_size, feature_size, feature_size, -1) + + # Apply downsampling using pixel shuffle + vision_features = self.pixel_shuffle(vision_features, scale_factor=downsample_ratio) + + # Reshape tensor to prepare for projection + vision_features = vision_features.reshape(batch_size, -1, vision_features.shape[-1]) + + # Project features through multi-modal projector + vision_features = self.multi_modal_projector(vision_features) + return vision_features + + def get_placeholder_mask( + self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor + ): + """ + Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is + equal to the length of multimodal features. If the lengths are different, an error is raised. 
+ """ + if input_ids is None: + special_image_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + special_image_mask = special_image_mask.all(-1) + else: + special_image_mask = input_ids == self.config.image_token_id + + n_image_tokens = special_image_mask.sum() + special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + n_image_features = image_features.shape[0] * image_features.shape[1] + if inputs_embeds[special_image_mask].numel() != image_features.numel(): + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" + ) + return special_image_mask + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + vision_feature_layer: Optional[Union[int, list[int]]] = None, + vision_feature_select_strategy: Optional[str] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> Union[tuple, InternVLModelOutputWithPast]: + vision_feature_layer = ( + vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer + ) + vision_feature_select_strategy = ( + vision_feature_select_strategy + if vision_feature_select_strategy is not None + else self.config.vision_feature_select_strategy + ) + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings()(input_ids) + + if pixel_values is not None: + image_features = self.get_image_features( + pixel_values=pixel_values, + vision_feature_layer=vision_feature_layer, + vision_feature_select_strategy=vision_feature_select_strategy, + ) + image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) + special_image_mask = self.get_placeholder_mask( + input_ids, inputs_embeds=inputs_embeds, image_features=image_features + ) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) + + outputs = self.language_model( + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + cache_position=cache_position, + **kwargs, + ) + + return InternVLModelOutputWithPast( + last_hidden_state=outputs.last_hidden_state, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + image_hidden_states=image_features if pixel_values is not None else None, + ) + + def pixel_shuffle(self, vision_features: torch.Tensor, scale_factor: float = 0.5): + """Perform pixel shuffle downsampling on vision features. + + Args: + vision_features (`torch.Tensor`): + Input tensor of shape (batch_size, width, height, channels). + scale_factor (`float`, *optional*, defaults to `0.5`): + Factor by which to downsample. Default is 0.5, which halves the dimensions. + + Returns: + vision_features (`torch.Tensor`): + Downsampled tensor of shape (batch_size, height*scale_factor, width*scale_factor, channels/(scale_factor^2)). 
+ """ + batch_size, width, height, channels = vision_features.size() + + if height % scale_factor != 0 or width % scale_factor != 0: + raise ValueError("Height and width must be divisible by scale_factor for proper downsampling.") + + # Reshape to allow downsampling + vision_features = vision_features.view( + batch_size, width, int(height * scale_factor), int(channels / scale_factor) + ) + # Permute dimensions to align downsampled axis correctly + vision_features = vision_features.permute(0, 2, 1, 3).contiguous() + + # Reshape to achieve final downsampled dimensions + vision_features = vision_features.view( + batch_size, int(height * scale_factor), int(width * scale_factor), int(channels / (scale_factor**2)) + ) + + # Swap height and width back for proper orientation + vision_features = vision_features.permute(0, 2, 1, 3).contiguous() + + return vision_features + + +@dataclass +@auto_docstring( + custom_intro=""" + Base class for InternVL causal language model (or autoregressive) outputs. + """ +) +class InternVLCausalLMOutputWithPast(ModelOutput): + r""" + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache). + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + image_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`. + image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state. + """ + + loss: Optional[torch.FloatTensor] = None + logits: Optional[torch.FloatTensor] = None + past_key_values: Optional[Cache] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + image_hidden_states: Optional[torch.FloatTensor] = None + + +@auto_docstring( + custom_intro=""" + The INTERNVL model which consists of a vision backbone and a language model. 
+ """ +) +class InternVLForConditionalGeneration(InternVLPreTrainedModel, GenerationMixin): + _checkpoint_conversion_mapping = { + "^language_model.model": "model.language_model", + "^vision_tower": "model.vision_tower", + "^multi_modal_projector": "model.multi_modal_projector", + "^language_model.lm_head": "lm_head", + } + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config: InternVLConfig): + super().__init__(config) + self.model = InternVLModel(config) + self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False) + self.post_init() + + def get_input_embeddings(self): + return self.model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.model.set_input_embeddings(value) + + def get_output_embeddings(self) -> nn.Module: + return self.lm_head + + def set_decoder(self, decoder): + self.model.set_decoder(decoder) + + def get_decoder(self): + return self.model.get_decoder() + + def get_image_features( + self, + pixel_values: torch.FloatTensor, + vision_feature_layer: Optional[Union[int, list[int]]] = None, + vision_feature_select_strategy: Optional[str] = None, + **kwargs, + ): + return self.model.get_image_features( + pixel_values=pixel_values, + vision_feature_layer=vision_feature_layer, + vision_feature_select_strategy=vision_feature_select_strategy, + **kwargs, + ) + + # Make modules available through conditional class for BC + @property + def language_model(self): + return self.model.language_model + + @property + def vision_tower(self): + return self.model.vision_tower + + @property + def multi_modal_projector(self): + return self.model.multi_modal_projector + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + vision_feature_layer: Optional[Union[int, list[int]]] = None, + vision_feature_select_strategy: Optional[str] = None, + labels: Optional[torch.LongTensor] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + image_sizes: Optional[torch.Tensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> Union[tuple, InternVLCausalLMOutputWithPast]: + r""" + Example: + + ```python + >>> import torch + >>> from transformers import AutoProcessor, AutoModelForImageTextToText + + >>> torch_device = "cuda" + >>> processor = AutoProcessor.from_pretrained("OpenGVLab/InternVL3-1B-hf") + >>> model = AutoModelForImageTextToText.from_pretrained( + ... "OpenGVLab/InternVL3-1B-hf", dtype=torch.bfloat16, device_map=torch_device + ... ) + + >>> messages = [ + ... { + ... "role": "user", + ... "content": [ + ... { + ... "type": "image", + ... "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg", + ... }, + ... { + ... "type": "image", + ... "url": "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg", + ... }, + ... {"type": "text", "text": "These images depict two different landmarks. Can you identify them?"}, + ... ], + ... }, + ... 
] + + >>> inputs = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(torch_device) + >>> generate_ids = model.generate(**inputs, max_new_tokens=200) + >>> print(processor.decode(generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True)) + The images depict the Statue of Liberty and the Golden Gate Bridge. + ```""" + vision_feature_layer = ( + vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer + ) + vision_feature_select_strategy = ( + vision_feature_select_strategy + if vision_feature_select_strategy is not None + else self.config.vision_feature_select_strategy + ) + + outputs = self.model( + input_ids=input_ids, + pixel_values=pixel_values, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + vision_feature_layer=vision_feature_layer, + vision_feature_select_strategy=vision_feature_select_strategy, + cache_position=cache_position, + image_sizes=image_sizes, + **kwargs, + ) + + hidden_states = outputs[0] + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) + + loss = None + if labels is not None: + loss = self.loss_function( + logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs + ) + + return InternVLCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + image_hidden_states=outputs.image_hidden_states, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + inputs_embeds=None, + pixel_values=None, + attention_mask=None, + cache_position=None, + logits_to_keep=None, + **kwargs, + ): + # Overwritten -- in specific circumstances we don't want to forward image inputs to the model + + model_inputs = super().prepare_inputs_for_generation( + input_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + logits_to_keep=logits_to_keep, + **kwargs, + ) + + if cache_position[0] == 0: + # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore + # Otherwise we need pixel values to be passed to model + model_inputs["pixel_values"] = pixel_values + + return model_inputs + + +__all__ = [ + "InternVLVisionPreTrainedModel", + "InternVLVisionModel", + "InternVLPreTrainedModel", + "InternVLModel", + "InternVLForConditionalGeneration", +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/internvl/modular_internvl.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/internvl/modular_internvl.py new file mode 100644 index 0000000000000000000000000000000000000000..d282de65a4b54b3544a75d5e7af737ae4efeae4e --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/internvl/modular_internvl.py @@ -0,0 +1,655 @@ +# coding=utf-8 +# Copyright 2025 HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import collections.abc +from dataclasses import dataclass +from typing import Callable, Optional, Union + +import torch +import torch.nn as nn + +from ...activations import ACT2FN +from ...cache_utils import Cache +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...processing_utils import Unpack +from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging, torch_int +from ...utils.generic import check_model_inputs +from ..clip.modeling_clip import CLIPMLP +from ..janus.modeling_janus import JanusVisionAttention +from ..llama.modeling_llama import LlamaRMSNorm +from ..llava.modeling_llava import ( + LlavaCausalLMOutputWithPast, + LlavaForConditionalGeneration, + LlavaModel, + LlavaModelOutputWithPast, + LlavaPreTrainedModel, +) +from .configuration_internvl import InternVLConfig, InternVLVisionConfig + + +logger = logging.get_logger(__name__) + + +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + dropout: float = 0.0, + **kwargs, +): + key_states = key + value_states = value + + attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling + if attention_mask is not None: + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + + # No upcasting of the attention weights to float32 in this implementation + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + attn_output = torch.matmul(attn_weights, value_states) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +class InternVLVisionRMSNorm(LlamaRMSNorm): + pass + + +class InternVLVisionAttention(JanusVisionAttention): + def __init__(self, config: InternVLVisionConfig): + super().__init__(config) + del self.num_key_value_groups + + # Needed for flash attention + self.is_causal = False + qk_norm = config.use_qk_norm + + self.q_norm = InternVLVisionRMSNorm(self.embed_dim) if qk_norm else nn.Identity() + self.k_norm = InternVLVisionRMSNorm(self.embed_dim) if qk_norm else nn.Identity() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + **kwargs: Unpack[TransformersKwargs], + ): + batch_size, seq_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = self.q_norm(query_states) + key_states = self.k_norm(key_states) + + query_states = query_states.reshape(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.reshape(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2) + + 
attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=0.0 if not self.training else self.attention_dropout, + scaling=self.scale, + is_causal=False, + **kwargs, + ) + attn_output = attn_output.reshape(batch_size, seq_len, self.embed_dim) + + output = self.projection_layer(attn_output) + output = self.projection_dropout(output) + + return output, attn_weights + + +@dataclass +@auto_docstring( + custom_intro=""" + Class for outputs of [`InternVLVisionModel`]. + """ +) +class InternVLVisionModelOutputWithPooling(BaseModelOutputWithPooling): + r""" + pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`): + Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if + *config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token + will be returned. + """ + + +class InternVLVisionPatchEmbeddings(nn.Module): + """ + This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial + `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a + Transformer. + """ + + def __init__(self, config): + super().__init__() + image_size, patch_size = config.image_size, config.patch_size + num_channels, hidden_size = config.num_channels, config.hidden_size + + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + patch_shape = (image_size[0] // patch_size[0], image_size[1] // patch_size[1]) + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.num_patches = num_patches + self.patch_shape = patch_shape + + self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size) + + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + batch_size, num_channels, height, width = pixel_values.shape + if num_channels != self.num_channels: + raise ValueError( + "Make sure that the channel dimension of the pixel values match with the one set in the configuration." + ) + + embeddings = self.projection(pixel_values) + patch_height, patch_width = embeddings.shape[2], embeddings.shape[3] + embeddings = embeddings.flatten(2).transpose(1, 2) + + return embeddings, (patch_height, patch_width) + + +# Based on timm implementation, which can be found here: +# https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py +class InternVLVisionEmbeddings(nn.Module): + """ + Construct the CLS token, position and patch embeddings. Optionally, also the mask token. 
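+
+    Pixel values are first turned into patch embeddings, a learnable [CLS] token is prepended, and,
+    when `use_absolute_position_embeddings` is enabled, position embeddings (interpolated to the
+    input resolution) are added before dropout is applied.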
+ + """ + + def __init__(self, config: InternVLVisionConfig) -> None: + super().__init__() + + self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) + if config.use_mask_token: + self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) + else: + self.mask_token = None + self.patch_embeddings = InternVLVisionPatchEmbeddings(config) + self.patch_size = config.patch_size + self.image_size = ( + config.image_size + if isinstance(config.image_size, collections.abc.Iterable) + else (config.image_size, config.image_size) + ) + num_patches = self.patch_embeddings.num_patches + if config.use_absolute_position_embeddings: + self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size)) + else: + self.position_embeddings = None + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution + images. This method is also adapted to support torch.jit tracing. + + Adapted from: + - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and + - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 + """ + + num_patches = embeddings.shape[1] - 1 + num_positions = self.position_embeddings.shape[1] - 1 + + # always interpolate when tracing to ensure the exported model works for dynamic input shapes + if not torch.jit.is_tracing() and num_patches == num_positions and height == width: + return self.position_embeddings + + class_pos_embed = self.position_embeddings[:, :1] + patch_pos_embed = self.position_embeddings[:, 1:] + + dim = embeddings.shape[-1] + + new_height = height // self.patch_size[0] + new_width = width // self.patch_size[1] + + sqrt_num_positions = torch_int(num_positions**0.5) + patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed, + size=(new_height, new_width), + mode="bicubic", + align_corners=False, + ) + + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + + return torch.cat((class_pos_embed, patch_pos_embed), dim=1) + + def forward( + self, + pixel_values: torch.Tensor, + bool_masked_pos: Optional[torch.BoolTensor] = None, + ) -> torch.Tensor: + _, _, height, width = pixel_values.shape + embeddings, (patch_height, patch_width) = self.patch_embeddings(pixel_values) + batch_size, seq_len, _ = embeddings.size() + + if bool_masked_pos is not None: + mask_tokens = self.mask_token.expand(batch_size, seq_len, -1) + # replace the masked visual tokens by mask_tokens + w = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens) + embeddings = embeddings * (1 - w) + mask_tokens * w + + cls_tokens = self.cls_token.expand(batch_size, -1, -1) + embeddings = torch.cat((cls_tokens, embeddings), dim=1) + + if self.position_embeddings is not None: + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) + + embeddings = self.dropout(embeddings) + + return embeddings, (patch_height, patch_width) + + +class InternVLVisionMLP(CLIPMLP): + pass + + +NORM2FN = {"layer_norm": nn.LayerNorm, "rms_norm": InternVLVisionRMSNorm} + + +class InternVLVisionLayer(GradientCheckpointingLayer): + 
"""This corresponds to the Block class in the timm implementation.""" + + def __init__(self, config: InternVLVisionConfig) -> None: + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = InternVLVisionAttention(config) + self.mlp = InternVLVisionMLP(config) + # InternVL uses different layernorm implementations for different models + self.layernorm_before = NORM2FN[config.norm_type](config.hidden_size, eps=config.layer_norm_eps) + self.layernorm_after = NORM2FN[config.norm_type](config.hidden_size, eps=config.layer_norm_eps) + + init_values = config.layer_scale_init_value + self.lambda_1 = nn.Parameter(init_values * torch.ones(config.hidden_size), requires_grad=True) + self.lambda_2 = nn.Parameter(init_values * torch.ones(config.hidden_size), requires_grad=True) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward( + self, + hidden_states: torch.Tensor, + ) -> Union[tuple[torch.Tensor], tuple[torch.Tensor, torch.Tensor]]: + attention_output, _ = self.attention( + self.layernorm_before(hidden_states), # in InternVLVision, layernorm is applied before self-attention + ) + + attention_output = self.lambda_1 * attention_output + + # first residual connection + hidden_states = attention_output + hidden_states + + # in InternVLVision, layernorm is also applied after self-attention + layer_output = self.layernorm_after(hidden_states) + + layer_output = self.mlp(layer_output) + layer_output = self.dropout(layer_output) + + if self.lambda_2 is not None: + layer_output = self.lambda_2 * layer_output + + # second residual connection + layer_output = layer_output + hidden_states + + return layer_output + + +class InternVLVisionEncoder(nn.Module): + def __init__(self, config: InternVLVisionConfig) -> None: + super().__init__() + self.config = config + self.layer = nn.ModuleList([InternVLVisionLayer(config) for i in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + ) -> Union[tuple, BaseModelOutput]: + for layer_module in self.layer: + hidden_states = layer_module(hidden_states) + + return BaseModelOutput( + last_hidden_state=hidden_states, + ) + + +@auto_docstring +class InternVLVisionPreTrainedModel(PreTrainedModel): + config: InternVLVisionConfig + base_model_prefix = "internvl_vision" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + _no_split_modules = ["InternVLVisionLayer"] + _supports_sdpa = True + _supports_flash_attn = True + _supports_flex_attn = True + _supports_attention_backend = True + + _can_record_outputs = { + "hidden_states": InternVLVisionLayer, + "attentions": InternVLVisionAttention, + } + + def _init_weights(self, module): + """Initialize the weights""" + super()._init_weights(module) + if isinstance(module, InternVLVisionEmbeddings): + module.cls_token.data.zero_() + if module.mask_token is not None: + module.mask_token.data.zero_() + if module.position_embeddings is not None: + module.position_embeddings.data.zero_() + elif isinstance(module, InternVLVisionLayer): + module.lambda_1.data.fill_(self.config.layer_scale_init_value) + module.lambda_2.data.fill_(self.config.layer_scale_init_value) + + +@auto_docstring +class InternVLVisionModel(InternVLVisionPreTrainedModel): + def __init__(self, config: InternVLVisionConfig) -> None: + super().__init__(config) + self.config = config + + self.embeddings = InternVLVisionEmbeddings(config) + self.encoder = InternVLVisionEncoder(config) + + 
self.layernorm = ( + nn.Identity() if config.use_mean_pooling else nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + ) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.patch_embeddings + + @check_model_inputs(tie_last_hidden_states=False) + @auto_docstring + def forward( + self, + pixel_values: torch.Tensor, + bool_masked_pos: Optional[torch.BoolTensor] = None, + ) -> Union[tuple, InternVLVisionModelOutputWithPooling]: + r""" + bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*): + Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). + """ + embedding_output, _ = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos) + + encoder_outputs = self.encoder(embedding_output) + sequence_output = encoder_outputs[0] + sequence_output = self.layernorm(sequence_output) + + return InternVLVisionModelOutputWithPooling( + last_hidden_state=sequence_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +class InternVLPreTrainedModel(LlavaPreTrainedModel): + pass + + +INTERNVL_INPUTS_DOCSTRING = None + + +class InternVLMultiModalProjector(nn.Module): + def __init__(self, config: InternVLConfig): + super().__init__() + self.layer_norm = nn.LayerNorm(config.vision_config.hidden_size * int(1 / config.downsample_ratio) ** 2) + self.linear_1 = nn.Linear( + config.vision_config.hidden_size * int(1 / config.downsample_ratio) ** 2, config.text_config.hidden_size + ) + self.act = ACT2FN[config.projector_hidden_act] + self.linear_2 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size) + + def forward(self, image_features): + hidden_states = self.layer_norm(image_features) + hidden_states = self.linear_1(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.linear_2(hidden_states) + return hidden_states + + +class InternVLModelOutputWithPast(LlavaModelOutputWithPast): + pass + + +class InternVLModel(LlavaModel): + def pixel_shuffle(self, vision_features: torch.Tensor, scale_factor: float = 0.5): + """Perform pixel shuffle downsampling on vision features. + + Args: + vision_features (`torch.Tensor`): + Input tensor of shape (batch_size, width, height, channels). + scale_factor (`float`, *optional*, defaults to `0.5`): + Factor by which to downsample. Default is 0.5, which halves the dimensions. + + Returns: + vision_features (`torch.Tensor`): + Downsampled tensor of shape (batch_size, height*scale_factor, width*scale_factor, channels/(scale_factor^2)). 
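+
+        Example (shapes only): with the default `scale_factor=0.5`, an input of shape
+        `(1, 32, 32, 1024)` is rearranged to `(1, 16, 16, 4096)`.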
+        """
+        batch_size, width, height, channels = vision_features.size()
+
+        if height % scale_factor != 0 or width % scale_factor != 0:
+            raise ValueError("Height and width must be divisible by scale_factor for proper downsampling.")
+
+        # Reshape to allow downsampling
+        vision_features = vision_features.view(
+            batch_size, width, int(height * scale_factor), int(channels / scale_factor)
+        )
+        # Permute dimensions to align downsampled axis correctly
+        vision_features = vision_features.permute(0, 2, 1, 3).contiguous()
+
+        # Reshape to achieve final downsampled dimensions
+        vision_features = vision_features.view(
+            batch_size, int(height * scale_factor), int(width * scale_factor), int(channels / (scale_factor**2))
+        )
+
+        # Swap height and width back for proper orientation
+        vision_features = vision_features.permute(0, 2, 1, 3).contiguous()
+
+        return vision_features
+
+    def get_image_features(
+        self,
+        pixel_values: torch.FloatTensor,
+        vision_feature_layer: Optional[Union[int, list[int]]] = None,
+        vision_feature_select_strategy: Optional[str] = None,
+        **kwargs,
+    ):
+        """
+        Obtains image last hidden states from the vision tower and applies multimodal projection.
+
+        Args:
+            pixel_values (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`)
+               The tensors corresponding to the input images.
+            vision_feature_layer (`int` or `list[int]`):
+                Layer index or list of layer indices to extract features from.
+        Returns:
+            vision_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`.
+        """
+        vision_feature_layer = (
+            vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
+        )
+        vision_feature_select_strategy = (
+            vision_feature_select_strategy
+            if vision_feature_select_strategy is not None
+            else self.config.vision_feature_select_strategy
+        )
+        pixel_values = pixel_values.to(dtype=self.dtype)  # fp16 compatibility
+
+        downsample_ratio = self.config.downsample_ratio
+        if vision_feature_layer == -1:
+            vision_features = self.vision_tower(pixel_values=pixel_values).last_hidden_state
+        else:
+            vision_features = self.vision_tower(pixel_values=pixel_values).hidden_states[vision_feature_layer]
+        if vision_feature_select_strategy == "default":
+            vision_features = vision_features[:, 1:, :]
+
+        # Calculate dimensions based on vision features
+        channels = vision_features.shape[1]
+        feature_size = int(channels**0.5)
+        batch_size = vision_features.shape[0]
+
+        # Reshape tensor to spatial dimensions
+        vision_features = vision_features.reshape(batch_size, feature_size, feature_size, -1)
+
+        # Apply downsampling using pixel shuffle
+        vision_features = self.pixel_shuffle(vision_features, scale_factor=downsample_ratio)
+
+        # Reshape tensor to prepare for projection
+        vision_features = vision_features.reshape(batch_size, -1, vision_features.shape[-1])
+
+        # Project features through multi-modal projector
+        vision_features = self.multi_modal_projector(vision_features)
+        return vision_features
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        vision_feature_layer: Optional[Union[int, list[int]]] = None,
+        vision_feature_select_strategy: Optional[str] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Union[tuple, InternVLModelOutputWithPast]:
+        vision_feature_layer = (
+            vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
+        )
+        vision_feature_select_strategy = (
+            vision_feature_select_strategy
+            if vision_feature_select_strategy is not None
+            else self.config.vision_feature_select_strategy
+        )
+
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds = self.get_input_embeddings()(input_ids)
+
+        if pixel_values is not None:
+            image_features = self.get_image_features(
+                pixel_values=pixel_values,
+                vision_feature_layer=vision_feature_layer,
+                vision_feature_select_strategy=vision_feature_select_strategy,
+            )
+            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
+            special_image_mask = self.get_placeholder_mask(
+                input_ids, inputs_embeds=inputs_embeds, image_features=image_features
+            )
+            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
+
+        outputs = self.language_model(
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        return InternVLModelOutputWithPast(
+            last_hidden_state=outputs.last_hidden_state,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            image_hidden_states=image_features if pixel_values is not None else None,
+        )
+
+
+class InternVLCausalLMOutputWithPast(LlavaCausalLMOutputWithPast):
+    pass
+
+
+class InternVLForConditionalGeneration(LlavaForConditionalGeneration):
+    def forward(self, **super_kwargs):
+        r"""
+        Example:
+
+        ```python
+        >>> import torch
+        >>> from transformers import AutoProcessor, AutoModelForImageTextToText
+
+        >>> torch_device = "cuda"
+        >>> processor = AutoProcessor.from_pretrained("OpenGVLab/InternVL3-1B-hf")
+        >>> model = AutoModelForImageTextToText.from_pretrained(
+        ...     "OpenGVLab/InternVL3-1B-hf", dtype=torch.bfloat16, device_map=torch_device
+        ... )
+
+        >>> messages = [
+        ...     {
+        ...         "role": "user",
+        ...         "content": [
+        ...             {
+        ...                 "type": "image",
+        ...                 "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
+        ...             },
+        ...             {
+        ...                 "type": "image",
+        ...                 "url": "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg",
+        ...             },
+        ...             {"type": "text", "text": "These images depict two different landmarks. Can you identify them?"},
+        ...         ],
+        ...     },
+        ... ]
+
+        >>> inputs = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(torch_device)
+        >>> generate_ids = model.generate(**inputs, max_new_tokens=200)
+        >>> print(processor.decode(generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True))
+        The images depict the Statue of Liberty and the Golden Gate Bridge.
+ ```""" + super().forward(**super_kwargs) + + +__all__ = [ + "InternVLVisionPreTrainedModel", + "InternVLVisionModel", + "InternVLPreTrainedModel", + "InternVLModel", + "InternVLForConditionalGeneration", +] diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/internvl/processing_internvl.py b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/internvl/processing_internvl.py new file mode 100644 index 0000000000000000000000000000000000000000..6c0c695754301dbfbe2498647c317947fc548c19 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/transformers/models/internvl/processing_internvl.py @@ -0,0 +1,313 @@ +# coding=utf-8 +# Copyright 2025 HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional, Union + +import numpy as np + +from ...image_processing_utils import BatchFeature +from ...image_utils import ImageInput, concatenate_list, make_flat_list_of_images +from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack +from ...tokenization_utils_base import PreTokenizedInput, TextInput +from ...video_utils import VideoInput + + +class InternVLImagesKwargs(ImagesKwargs, total=False): + crop_to_patches: Optional[bool] + min_patches: Optional[int] + max_patches: Optional[int] + + +class InternVLProcessorKwargs(ProcessingKwargs, total=False): + images_kwargs: InternVLImagesKwargs + _defaults = { + "text_kwargs": { + "padding_side": "left", + "return_mm_token_type_ids": False, + }, + "images_kwargs": { + "crop_to_patches": True, + }, + "videos_kwargs": { + "return_tensors": "pt", + }, + } + + +class InternVLProcessor(ProcessorMixin): + r""" + Constructs a InternVL processor which wraps a [`AutoImageProcessor`] and + [`PretrainedTokenizerFast`] tokenizer into a single processor that inherits both the image processor and + tokenizer functionalities. See the [`~InternVLProcessor.__call__`] and [`~InternVLProcessor.decode`] for more information. + Args: + image_processor ([`AutoImageProcessor`], *optional*): + The image processor is a required input. + tokenizer ([`PreTrainedTokenizer`, `PreTrainedTokenizerFast`], *optional*): + The tokenizer is a required input. + video_processor ([`AutoVideoProcessor`], *optional*): + The video processor is a required input. + image_seq_length (`int`, *optional*, defaults to 256): + The number of image token to use per image patch. it should be set so that: + image_seq_length = (config.image_size // config.patch_size) ** 2 * (config.scale_factor**2) + chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages + in a chat into a tokenizable string. 
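+
+    For the `image_seq_length` formula above: for example, `image_size=448`, `patch_size=14` and
+    `scale_factor=0.5` give `(448 // 14) ** 2 * 0.5 ** 2 = 256`, matching the default value.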
+ """ + + attributes = ["image_processor", "tokenizer", "video_processor"] + image_processor_class = "AutoImageProcessor" + video_processor_class = "AutoVideoProcessor" + tokenizer_class = "AutoTokenizer" + + def __init__( + self, + image_processor=None, + tokenizer=None, + video_processor=None, + image_seq_length: int = 256, + chat_template=None, + **kwargs, + ): + self.image_seq_length = image_seq_length + self.start_image_token = tokenizer.start_image_token + self.end_image_token = tokenizer.end_image_token + self.start_image_token_id = tokenizer.start_image_token_id + self.end_image_token_id = tokenizer.end_image_token_id + self.image_token = tokenizer.context_image_token + self.video_token = tokenizer.video_token + self.image_token_id = tokenizer.context_image_token_id + self.image_ids = [self.image_token_id, self.start_image_token_id, self.end_image_token_id] + + super().__init__(image_processor, tokenizer, video_processor, chat_template=chat_template, **kwargs) + + def _insert_media_placeholders( + self, + text: list[str], + image_pixel_values, + video_pixel_values, + image_num_patches: list[int], + video_num_patches: list[int], + image_num_patches_indices: np.ndarray, + video_num_patches_indices: np.ndarray, + video_patch_indices: np.ndarray, + ): + """ + Processes interleaved text with and